author     Stephen Hines <srhines@google.com>    2014-04-23 16:57:46 -0700
committer  Stephen Hines <srhines@google.com>    2014-04-24 15:53:16 -0700
commit     36b56886974eae4f9c5ebc96befd3e7bfe5de338 (patch)
tree       e6cfb69fbbd937f450eeb83bfb83b9da3b01275a /lib/Target
parent     69a8640022b04415ae9fac62f8ab090601d8f889 (diff)
Update to LLVM 3.5a.
Change-Id: Ifadecab779f128e62e430c2b4f6ddd84953ed617
Diffstat (limited to 'lib/Target')
-rw-r--r--lib/Target/AArch64/AArch64.h4
-rw-r--r--lib/Target/AArch64/AArch64.td32
-rw-r--r--lib/Target/AArch64/AArch64AsmPrinter.cpp13
-rw-r--r--lib/Target/AArch64/AArch64BranchFixupPass.cpp10
-rw-r--r--lib/Target/AArch64/AArch64CallingConv.td2
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp45
-rw-r--r--lib/Target/AArch64/AArch64ISelDAGToDAG.cpp9
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp1097
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h20
-rw-r--r--lib/Target/AArch64/AArch64InstrFormats.td4
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.cpp172
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.h5
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td625
-rw-r--r--lib/Target/AArch64/AArch64InstrNEON.td1838
-rw-r--r--lib/Target/AArch64/AArch64MCInstLower.cpp2
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.cpp32
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.td9
-rw-r--r--lib/Target/AArch64/AArch64Schedule.td70
-rw-r--r--lib/Target/AArch64/AArch64ScheduleA53.td144
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.cpp12
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.h13
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.cpp40
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.h29
-rw-r--r--lib/Target/AArch64/AArch64TargetObjectFile.cpp4
-rw-r--r--lib/Target/AArch64/AArch64TargetObjectFile.h8
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.cpp107
-rw-r--r--lib/Target/AArch64/Android.mk5
-rw-r--r--lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp140
-rw-r--r--lib/Target/AArch64/AsmParser/Android.mk2
-rw-r--r--lib/Target/AArch64/AsmParser/CMakeLists.txt4
-rw-r--r--lib/Target/AArch64/AsmParser/LLVMBuild.txt3
-rw-r--r--lib/Target/AArch64/CMakeLists.txt5
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp15
-rw-r--r--lib/Target/AArch64/Disassembler/Android.mk2
-rw-r--r--lib/Target/AArch64/Disassembler/CMakeLists.txt4
-rw-r--r--lib/Target/AArch64/Disassembler/LLVMBuild.txt3
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp2
-rw-r--r--lib/Target/AArch64/InstPrinter/Android.mk2
-rw-r--r--lib/Target/AArch64/InstPrinter/CMakeLists.txt5
-rw-r--r--lib/Target/AArch64/LLVMBuild.txt3
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp30
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp21
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp6
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp12
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h2
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp150
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp4
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp46
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h16
-rw-r--r--lib/Target/AArch64/MCTargetDesc/Android.mk2
-rw-r--r--lib/Target/AArch64/MCTargetDesc/CMakeLists.txt4
-rw-r--r--lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp7
-rw-r--r--lib/Target/AArch64/TargetInfo/Android.mk2
-rw-r--r--lib/Target/AArch64/TargetInfo/CMakeLists.txt4
-rw-r--r--lib/Target/AArch64/TargetInfo/LLVMBuild.txt3
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.h2
-rw-r--r--lib/Target/AArch64/Utils/Android.mk2
-rw-r--r--lib/Target/AArch64/Utils/CMakeLists.txt4
-rw-r--r--lib/Target/AArch64/Utils/LLVMBuild.txt2
-rw-r--r--lib/Target/ARM/A15SDOptimizer.cpp35
-rw-r--r--lib/Target/ARM/ARM.h11
-rw-r--r--lib/Target/ARM/ARM.td86
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.cpp390
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.h57
-rw-r--r--lib/Target/ARM/ARMAtomicExpandPass.cpp406
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp92
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h227
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp13
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.h76
-rw-r--r--lib/Target/ARM/ARMBuildAttrs.h170
-rw-r--r--lib/Target/ARM/ARMCallingConv.td16
-rw-r--r--lib/Target/ARM/ARMCodeEmitter.cpp263
-rw-r--r--lib/Target/ARM/ARMConstantIslandPass.cpp29
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.h40
-rw-r--r--lib/Target/ARM/ARMExpandPseudoInsts.cpp118
-rw-r--r--lib/Target/ARM/ARMFPUName.def1
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp403
-rw-r--r--lib/Target/ARM/ARMFeatures.h44
-rw-r--r--lib/Target/ARM/ARMFrameLowering.cpp612
-rw-r--r--lib/Target/ARM/ARMFrameLowering.h38
-rw-r--r--lib/Target/ARM/ARMHazardRecognizer.cpp2
-rw-r--r--lib/Target/ARM/ARMHazardRecognizer.h10
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp262
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp1103
-rw-r--r--lib/Target/ARM/ARMISelLowering.h189
-rw-r--r--lib/Target/ARM/ARMInstrFormats.td36
-rw-r--r--lib/Target/ARM/ARMInstrInfo.cpp6
-rw-r--r--lib/Target/ARM/ARMInstrInfo.h8
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td449
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td145
-rw-r--r--lib/Target/ARM/ARMInstrThumb.td27
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td56
-rw-r--r--lib/Target/ARM/ARMInstrVFP.td44
-rw-r--r--lib/Target/ARM/ARMJITInfo.cpp3
-rw-r--r--lib/Target/ARM/ARMJITInfo.h24
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp55
-rw-r--r--lib/Target/ARM/ARMMCInstLower.cpp16
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.h9
-rw-r--r--lib/Target/ARM/ARMOptimizeBarriersPass.cpp101
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.cpp2
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.h2
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.td2
-rw-r--r--lib/Target/ARM/ARMRelocations.h4
-rw-r--r--lib/Target/ARM/ARMScheduleA9.td17
-rw-r--r--lib/Target/ARM/ARMScheduleSwift.td4
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.cpp4
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.h6
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp77
-rw-r--r--lib/Target/ARM/ARMSubtarget.h87
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp164
-rw-r--r--lib/Target/ARM/ARMTargetMachine.h91
-rw-r--r--lib/Target/ARM/ARMTargetObjectFile.cpp21
-rw-r--r--lib/Target/ARM/ARMTargetObjectFile.h16
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.cpp73
-rw-r--r--lib/Target/ARM/Android.mk4
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp1490
-rw-r--r--lib/Target/ARM/AsmParser/Android.mk2
-rw-r--r--lib/Target/ARM/AsmParser/CMakeLists.txt4
-rw-r--r--lib/Target/ARM/CMakeLists.txt12
-rw-r--r--lib/Target/ARM/Disassembler/ARMDisassembler.cpp24
-rw-r--r--lib/Target/ARM/Disassembler/Android.mk2
-rw-r--r--lib/Target/ARM/Disassembler/CMakeLists.txt10
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp33
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.h4
-rw-r--r--lib/Target/ARM/InstPrinter/Android.mk2
-rw-r--r--lib/Target/ARM/InstPrinter/CMakeLists.txt4
-rw-r--r--lib/Target/ARM/LLVMBuild.txt2
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMArchName.def50
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMArchName.h27
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp364
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h51
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp163
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp569
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h9
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp70
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h23
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp527
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCExpr.h10
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp153
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h34
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp4
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp19
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp248
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h125
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp49
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h12
-rw-r--r--lib/Target/ARM/MCTargetDesc/Android.mk3
-rw-r--r--lib/Target/ARM/MCTargetDesc/CMakeLists.txt5
-rw-r--r--lib/Target/ARM/MLxExpansionPass.cpp19
-rw-r--r--lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp13
-rw-r--r--lib/Target/ARM/TargetInfo/Android.mk2
-rw-r--r--lib/Target/ARM/TargetInfo/CMakeLists.txt4
-rw-r--r--lib/Target/ARM/TargetInfo/LLVMBuild.txt2
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp137
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.h23
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.cpp1
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.h13
-rw-r--r--lib/Target/ARM/Thumb1RegisterInfo.cpp10
-rw-r--r--lib/Target/ARM/Thumb1RegisterInfo.h30
-rw-r--r--lib/Target/ARM/Thumb2ITBlockPass.cpp6
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.cpp1
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.h17
-rw-r--r--lib/Target/ARM/Thumb2RegisterInfo.cpp3
-rw-r--r--lib/Target/ARM/Thumb2RegisterInfo.h14
-rw-r--r--lib/Target/ARM/Thumb2SizeReduction.cpp29
-rw-r--r--lib/Target/ARM64/ARM64.h48
-rw-r--r--lib/Target/ARM64/ARM64.td95
-rw-r--r--lib/Target/ARM64/ARM64AddressTypePromotion.cpp496
-rw-r--r--lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp392
-rw-r--r--lib/Target/ARM64/ARM64AsmPrinter.cpp563
-rw-r--r--lib/Target/ARM64/ARM64BranchRelaxation.cpp505
-rw-r--r--lib/Target/ARM64/ARM64CallingConv.h94
-rw-r--r--lib/Target/ARM64/ARM64CallingConvention.td210
-rw-r--r--lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp147
-rw-r--r--lib/Target/ARM64/ARM64CollectLOH.cpp1157
-rw-r--r--lib/Target/ARM64/ARM64ConditionalCompares.cpp918
-rw-r--r--lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp104
-rw-r--r--lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp737
-rw-r--r--lib/Target/ARM64/ARM64FastISel.cpp1929
-rw-r--r--lib/Target/ARM64/ARM64FrameLowering.cpp816
-rw-r--r--lib/Target/ARM64/ARM64FrameLowering.h75
-rw-r--r--lib/Target/ARM64/ARM64ISelDAGToDAG.cpp2381
-rw-r--r--lib/Target/ARM64/ARM64ISelLowering.cpp7551
-rw-r--r--lib/Target/ARM64/ARM64ISelLowering.h422
-rw-r--r--lib/Target/ARM64/ARM64InstrAtomics.td293
-rw-r--r--lib/Target/ARM64/ARM64InstrFormats.td8193
-rw-r--r--lib/Target/ARM64/ARM64InstrInfo.cpp1864
-rw-r--r--lib/Target/ARM64/ARM64InstrInfo.h219
-rw-r--r--lib/Target/ARM64/ARM64InstrInfo.td4458
-rw-r--r--lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp947
-rw-r--r--lib/Target/ARM64/ARM64MCInstLower.cpp201
-rw-r--r--lib/Target/ARM64/ARM64MCInstLower.h52
-rw-r--r--lib/Target/ARM64/ARM64MachineFunctionInfo.h139
-rw-r--r--lib/Target/ARM64/ARM64PerfectShuffle.h6586
-rw-r--r--lib/Target/ARM64/ARM64PromoteConstant.cpp585
-rw-r--r--lib/Target/ARM64/ARM64RegisterInfo.cpp400
-rw-r--r--lib/Target/ARM64/ARM64RegisterInfo.h101
-rw-r--r--lib/Target/ARM64/ARM64RegisterInfo.td561
-rw-r--r--lib/Target/ARM64/ARM64SchedCyclone.td852
-rw-r--r--lib/Target/ARM64/ARM64Schedule.td92
-rw-r--r--lib/Target/ARM64/ARM64SelectionDAGInfo.cpp57
-rw-r--r--lib/Target/ARM64/ARM64SelectionDAGInfo.h37
-rw-r--r--lib/Target/ARM64/ARM64StorePairSuppress.cpp167
-rw-r--r--lib/Target/ARM64/ARM64Subtarget.cpp100
-rw-r--r--lib/Target/ARM64/ARM64Subtarget.h87
-rw-r--r--lib/Target/ARM64/ARM64TargetMachine.cpp157
-rw-r--r--lib/Target/ARM64/ARM64TargetMachine.h69
-rw-r--r--lib/Target/ARM64/ARM64TargetObjectFile.cpp52
-rw-r--r--lib/Target/ARM64/ARM64TargetObjectFile.h40
-rw-r--r--lib/Target/ARM64/ARM64TargetTransformInfo.cpp326
-rw-r--r--lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp4832
-rw-r--r--lib/Target/ARM64/AsmParser/CMakeLists.txt6
-rw-r--r--lib/Target/ARM64/AsmParser/LLVMBuild.txt24
-rw-r--r--lib/Target/ARM64/AsmParser/Makefile15
-rw-r--r--lib/Target/ARM64/CMakeLists.txt50
-rw-r--r--lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp2142
-rw-r--r--lib/Target/ARM64/Disassembler/ARM64Disassembler.h54
-rw-r--r--lib/Target/ARM64/Disassembler/CMakeLists.txt13
-rw-r--r--lib/Target/ARM64/Disassembler/LLVMBuild.txt24
-rw-r--r--lib/Target/ARM64/Disassembler/Makefile16
-rw-r--r--lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp1428
-rw-r--r--lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h157
-rw-r--r--lib/Target/ARM64/InstPrinter/CMakeLists.txt7
-rw-r--r--lib/Target/ARM64/InstPrinter/LLVMBuild.txt24
-rw-r--r--lib/Target/ARM64/InstPrinter/Makefile15
-rw-r--r--lib/Target/ARM64/LLVMBuild.txt36
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h758
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp533
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h998
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp237
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp158
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h26
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h72
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp92
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h36
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp563
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp168
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h162
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp167
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h62
-rw-r--r--lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp396
-rw-r--r--lib/Target/ARM64/MCTargetDesc/CMakeLists.txt14
-rw-r--r--lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt24
-rw-r--r--lib/Target/ARM64/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/ARM64/Makefile25
-rw-r--r--lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp21
-rw-r--r--lib/Target/ARM64/TargetInfo/CMakeLists.txt7
-rw-r--r--lib/Target/ARM64/TargetInfo/LLVMBuild.txt24
-rw-r--r--lib/Target/ARM64/TargetInfo/Makefile15
-rw-r--r--lib/Target/Android.mk3
-rw-r--r--lib/Target/CMakeLists.txt3
-rw-r--r--lib/Target/CppBackend/CPPBackend.cpp88
-rw-r--r--lib/Target/CppBackend/TargetInfo/CMakeLists.txt2
-rw-r--r--lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp7
-rw-r--r--lib/Target/CppBackend/TargetInfo/LLVMBuild.txt2
-rw-r--r--lib/Target/Hexagon/CMakeLists.txt2
-rw-r--r--lib/Target/Hexagon/Hexagon.td10
-rw-r--r--lib/Target/Hexagon/HexagonAsmPrinter.cpp89
-rw-r--r--lib/Target/Hexagon/HexagonAsmPrinter.h111
-rw-r--r--lib/Target/Hexagon/HexagonCallingConvLower.h3
-rw-r--r--lib/Target/Hexagon/HexagonCopyToCombine.cpp27
-rw-r--r--lib/Target/Hexagon/HexagonFixupHwLoops.cpp4
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.cpp8
-rw-r--r--lib/Target/Hexagon/HexagonHardwareLoops.cpp27
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAG.cpp4
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.cpp3
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.cpp10
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.h6
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfoV4.td8
-rw-r--r--lib/Target/Hexagon/HexagonMCInstLower.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonMachineFunctionInfo.h2
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.cpp19
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.h26
-rw-r--r--lib/Target/Hexagon/HexagonNewValueJump.cpp32
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.cpp10
-rw-r--r--lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp14
-rw-r--r--lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp10
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.cpp5
-rw-r--r--lib/Target/Hexagon/HexagonTargetObjectFile.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonTargetObjectFile.h9
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.cpp49
-rw-r--r--lib/Target/Hexagon/HexagonVarargsCallingConvention.h2
-rw-r--r--lib/Target/Hexagon/InstPrinter/CMakeLists.txt2
-rw-r--r--lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp2
-rw-r--r--lib/Target/Hexagon/LLVMBuild.txt2
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp1
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp2
-rw-r--r--lib/Target/Hexagon/TargetInfo/CMakeLists.txt5
-rw-r--r--lib/Target/Hexagon/TargetInfo/LLVMBuild.txt2
-rw-r--r--lib/Target/LLVMBuild.txt2
-rw-r--r--lib/Target/MSP430/CMakeLists.txt2
-rw-r--r--lib/Target/MSP430/InstPrinter/CMakeLists.txt4
-rw-r--r--lib/Target/MSP430/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt2
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp1
-rw-r--r--lib/Target/MSP430/MSP430.td6
-rw-r--r--lib/Target/MSP430/MSP430AsmPrinter.cpp11
-rw-r--r--lib/Target/MSP430/MSP430FrameLowering.cpp4
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.cpp9
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.cpp6
-rw-r--r--lib/Target/MSP430/MSP430MCInstLower.cpp10
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.cpp8
-rw-r--r--lib/Target/MSP430/MSP430TargetMachine.cpp2
-rw-r--r--lib/Target/MSP430/TargetInfo/CMakeLists.txt4
-rw-r--r--lib/Target/MSP430/TargetInfo/LLVMBuild.txt2
-rw-r--r--lib/Target/Mangler.cpp143
-rw-r--r--lib/Target/Mips/Android.mk4
-rw-r--r--lib/Target/Mips/AsmParser/Android.mk2
-rw-r--r--lib/Target/Mips/AsmParser/CMakeLists.txt3
-rw-r--r--lib/Target/Mips/AsmParser/MipsAsmParser.cpp2171
-rw-r--r--lib/Target/Mips/CMakeLists.txt4
-rw-r--r--lib/Target/Mips/Disassembler/Android.mk2
-rw-r--r--lib/Target/Mips/Disassembler/CMakeLists.txt12
-rw-r--r--lib/Target/Mips/Disassembler/MipsDisassembler.cpp87
-rw-r--r--lib/Target/Mips/InstPrinter/Android.mk2
-rw-r--r--lib/Target/Mips/InstPrinter/CMakeLists.txt4
-rw-r--r--lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp39
-rw-r--r--lib/Target/Mips/InstPrinter/MipsInstPrinter.h1
-rw-r--r--lib/Target/Mips/LLVMBuild.txt2
-rw-r--r--lib/Target/Mips/MCTargetDesc/Android.mk6
-rw-r--r--lib/Target/Mips/MCTargetDesc/CMakeLists.txt8
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp359
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h93
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h28
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp163
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp19
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h42
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h6
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp477
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h125
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp114
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCExpr.h66
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h33
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp84
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h20
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp255
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsReginfo.cpp80
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsReginfo.h31
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp365
-rw-r--r--lib/Target/Mips/MSA.txt7
-rw-r--r--lib/Target/Mips/MicroMipsInstrFPU.td148
-rw-r--r--lib/Target/Mips/MicroMipsInstrFormats.td321
-rw-r--r--lib/Target/Mips/MicroMipsInstrInfo.td137
-rw-r--r--lib/Target/Mips/Mips.h1
-rw-r--r--lib/Target/Mips/Mips.td29
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.cpp56
-rw-r--r--lib/Target/Mips/Mips16HardFloat.cpp41
-rw-r--r--lib/Target/Mips/Mips16HardFloatInfo.cpp50
-rw-r--r--lib/Target/Mips/Mips16HardFloatInfo.h50
-rw-r--r--lib/Target/Mips/Mips16ISelDAGToDAG.cpp4
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.cpp48
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.h3
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.cpp166
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.h5
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.td94
-rw-r--r--lib/Target/Mips/Mips16RegisterInfo.cpp4
-rw-r--r--lib/Target/Mips/Mips64InstrInfo.td226
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.cpp482
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.h34
-rw-r--r--lib/Target/Mips/MipsCallingConv.td15
-rw-r--r--lib/Target/Mips/MipsCodeEmitter.cpp20
-rw-r--r--lib/Target/Mips/MipsCondMov.td68
-rw-r--r--lib/Target/Mips/MipsConstantIslandPass.cpp287
-rw-r--r--lib/Target/Mips/MipsDelaySlotFiller.cpp51
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.cpp14
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.h4
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp192
-rw-r--r--lib/Target/Mips/MipsISelLowering.h20
-rw-r--r--lib/Target/Mips/MipsInstrFPU.td296
-rw-r--r--lib/Target/Mips/MipsInstrFormats.td109
-rw-r--r--lib/Target/Mips/MipsInstrInfo.td287
-rw-r--r--lib/Target/Mips/MipsLongBranch.cpp4
-rw-r--r--lib/Target/Mips/MipsMCInstLower.cpp2
-rw-r--r--lib/Target/Mips/MipsMSAInstrFormats.td61
-rw-r--r--lib/Target/Mips/MipsMSAInstrInfo.td261
-rw-r--r--lib/Target/Mips/MipsMachineFunction.h21
-rw-r--r--lib/Target/Mips/MipsOptimizePICCall.cpp297
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.cpp14
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.td115
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.cpp49
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.cpp111
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.h10
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.cpp117
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.h3
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.cpp69
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.cpp53
-rw-r--r--lib/Target/Mips/MipsSchedule.td329
-rw-r--r--lib/Target/Mips/MipsSubtarget.cpp60
-rw-r--r--lib/Target/Mips/MipsSubtarget.h22
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp76
-rw-r--r--lib/Target/Mips/MipsTargetObjectFile.cpp17
-rw-r--r--lib/Target/Mips/MipsTargetObjectFile.h11
-rw-r--r--lib/Target/Mips/MipsTargetStreamer.h99
-rw-r--r--lib/Target/Mips/TargetInfo/Android.mk2
-rw-r--r--lib/Target/Mips/TargetInfo/CMakeLists.txt4
-rw-r--r--lib/Target/Mips/TargetInfo/LLVMBuild.txt2
-rw-r--r--lib/Target/NVPTX/CMakeLists.txt4
-rw-r--r--lib/Target/NVPTX/InstPrinter/CMakeLists.txt4
-rw-r--r--lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp4
-rw-r--r--lib/Target/NVPTX/LLVMBuild.txt2
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt5
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp3
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTX.h1
-rw-r--r--lib/Target/NVPTX/NVPTX.td6
-rw-r--r--lib/Target/NVPTX/NVPTXAllocaHoisting.h3
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.cpp89
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.h4
-rw-r--r--lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp84
-rw-r--r--lib/Target/NVPTX/NVPTXGenericToNVVM.cpp21
-rw-r--r--lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp86
-rw-r--r--lib/Target/NVPTX/NVPTXISelDAGToDAG.h3
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.cpp17
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.h2
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.td12
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp12
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAggrCopies.h3
-rw-r--r--lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp6
-rw-r--r--lib/Target/NVPTX/NVPTXSplitBBatBar.cpp73
-rw-r--r--lib/Target/NVPTX/NVPTXSplitBBatBar.h41
-rw-r--r--lib/Target/NVPTX/NVPTXSubtarget.h15
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.cpp24
-rw-r--r--lib/Target/NVPTX/NVPTXTargetObjectFile.h10
-rw-r--r--lib/Target/NVPTX/NVPTXUtilities.cpp2
-rw-r--r--lib/Target/NVPTX/NVVMReflect.cpp20
-rw-r--r--lib/Target/NVPTX/TargetInfo/CMakeLists.txt4
-rw-r--r--lib/Target/NVPTX/TargetInfo/LLVMBuild.txt2
-rw-r--r--lib/Target/PowerPC/AsmParser/CMakeLists.txt5
-rw-r--r--lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp265
-rw-r--r--lib/Target/PowerPC/CMakeLists.txt4
-rw-r--r--lib/Target/PowerPC/Disassembler/CMakeLists.txt3
-rw-r--r--lib/Target/PowerPC/Disassembler/LLVMBuild.txt23
-rw-r--r--lib/Target/PowerPC/Disassembler/Makefile16
-rw-r--r--lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp345
-rw-r--r--lib/Target/PowerPC/InstPrinter/CMakeLists.txt4
-rw-r--r--lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp18
-rw-r--r--lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h2
-rw-r--r--lib/Target/PowerPC/LLVMBuild.txt6
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp51
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp100
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp23
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h5
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp144
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp7
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h2
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp62
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h1
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp8
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h7
-rw-r--r--lib/Target/PowerPC/Makefile5
-rw-r--r--lib/Target/PowerPC/PPC.h7
-rw-r--r--lib/Target/PowerPC/PPC.td49
-rw-r--r--lib/Target/PowerPC/PPCAsmPrinter.cpp184
-rw-r--r--lib/Target/PowerPC/PPCBranchSelector.cpp9
-rw-r--r--lib/Target/PowerPC/PPCCTRLoops.cpp42
-rw-r--r--lib/Target/PowerPC/PPCCallingConv.td20
-rw-r--r--lib/Target/PowerPC/PPCCodeEmitter.cpp3
-rw-r--r--lib/Target/PowerPC/PPCFastISel.cpp174
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp56
-rw-r--r--lib/Target/PowerPC/PPCHazardRecognizers.cpp208
-rw-r--r--lib/Target/PowerPC/PPCHazardRecognizers.h21
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp699
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp1065
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h27
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td430
-rw-r--r--lib/Target/PowerPC/PPCInstrAltivec.td174
-rw-r--r--lib/Target/PowerPC/PPCInstrFormats.td205
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp907
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.h15
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td1297
-rw-r--r--lib/Target/PowerPC/PPCInstrVSX.td816
-rw-r--r--lib/Target/PowerPC/PPCJITInfo.cpp5
-rw-r--r--lib/Target/PowerPC/PPCMCInstLower.cpp56
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.cpp156
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.h13
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.td86
-rw-r--r--lib/Target/PowerPC/PPCSchedule.td981
-rw-r--r--lib/Target/PowerPC/PPCSchedule440.td1118
-rw-r--r--lib/Target/PowerPC/PPCScheduleA2.td246
-rw-r--r--lib/Target/PowerPC/PPCScheduleE500mc.td493
-rw-r--r--lib/Target/PowerPC/PPCScheduleE5500.td592
-rw-r--r--lib/Target/PowerPC/PPCScheduleG3.td119
-rw-r--r--lib/Target/PowerPC/PPCScheduleG4.td147
-rw-r--r--lib/Target/PowerPC/PPCScheduleG4Plus.td168
-rw-r--r--lib/Target/PowerPC/PPCScheduleG5.td180
-rw-r--r--lib/Target/PowerPC/PPCScheduleP7.td385
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.cpp23
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.h27
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.cpp64
-rw-r--r--lib/Target/PowerPC/PPCTargetObjectFile.cpp30
-rw-r--r--lib/Target/PowerPC/PPCTargetObjectFile.h10
-rw-r--r--lib/Target/PowerPC/PPCTargetStreamer.h2
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.cpp56
-rw-r--r--lib/Target/PowerPC/TargetInfo/CMakeLists.txt4
-rw-r--r--lib/Target/PowerPC/TargetInfo/LLVMBuild.txt2
-rw-r--r--lib/Target/R600/AMDGPU.h6
-rw-r--r--lib/Target/R600/AMDGPU.td27
-rw-r--r--lib/Target/R600/AMDGPUAsmPrinter.cpp81
-rw-r--r--lib/Target/R600/AMDGPUAsmPrinter.h21
-rw-r--r--lib/Target/R600/AMDGPUFrameLowering.cpp18
-rw-r--r--lib/Target/R600/AMDGPUISelDAGToDAG.cpp50
-rw-r--r--lib/Target/R600/AMDGPUISelLowering.cpp514
-rw-r--r--lib/Target/R600/AMDGPUISelLowering.h44
-rw-r--r--lib/Target/R600/AMDGPUInstrInfo.cpp21
-rw-r--r--lib/Target/R600/AMDGPUInstrInfo.h20
-rw-r--r--lib/Target/R600/AMDGPUInstrInfo.td6
-rw-r--r--lib/Target/R600/AMDGPUInstructions.td18
-rw-r--r--lib/Target/R600/AMDGPUIntrinsics.td5
-rw-r--r--lib/Target/R600/AMDGPUMCInstLower.cpp12
-rw-r--r--lib/Target/R600/AMDGPURegisterInfo.cpp2
-rw-r--r--lib/Target/R600/AMDGPUSubtarget.cpp57
-rw-r--r--lib/Target/R600/AMDGPUSubtarget.h15
-rw-r--r--lib/Target/R600/AMDGPUTargetMachine.cpp30
-rw-r--r--lib/Target/R600/AMDGPUTargetTransformInfo.cpp47
-rw-r--r--lib/Target/R600/AMDILCFGStructurizer.cpp63
-rw-r--r--lib/Target/R600/AMDILISelLowering.cpp70
-rw-r--r--lib/Target/R600/AMDILIntrinsics.td4
-rw-r--r--lib/Target/R600/CMakeLists.txt2
-rw-r--r--lib/Target/R600/CaymanInstructions.td221
-rw-r--r--lib/Target/R600/EvergreenInstructions.td587
-rw-r--r--lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp31
-rw-r--r--lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h46
-rw-r--r--lib/Target/R600/InstPrinter/CMakeLists.txt4
-rw-r--r--lib/Target/R600/LLVMBuild.txt2
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp14
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp5
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp8
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h9
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp3
-rw-r--r--lib/Target/R600/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp26
-rw-r--r--lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp20
-rw-r--r--lib/Target/R600/Processors.td82
-rw-r--r--lib/Target/R600/R600ClauseMergePass.cpp2
-rw-r--r--lib/Target/R600/R600ControlFlowFinalizer.cpp217
-rw-r--r--lib/Target/R600/R600Defines.h2
-rw-r--r--lib/Target/R600/R600EmitClauseMarkers.cpp24
-rw-r--r--lib/Target/R600/R600ExpandSpecialInstrs.cpp21
-rw-r--r--lib/Target/R600/R600ISelLowering.cpp55
-rw-r--r--lib/Target/R600/R600ISelLowering.h8
-rw-r--r--lib/Target/R600/R600InstrInfo.cpp10
-rw-r--r--lib/Target/R600/R600InstrInfo.h2
-rw-r--r--lib/Target/R600/R600Instructions.td788
-rw-r--r--lib/Target/R600/R600MachineScheduler.cpp7
-rw-r--r--lib/Target/R600/R600MachineScheduler.h2
-rw-r--r--lib/Target/R600/R600OptimizeVectorRegisters.cpp23
-rw-r--r--lib/Target/R600/R600Packetizer.cpp12
-rw-r--r--lib/Target/R600/R600TextureIntrinsicsReplacer.cpp2
-rw-r--r--lib/Target/R600/R700Instructions.td21
-rw-r--r--lib/Target/R600/SIAnnotateControlFlow.cpp14
-rw-r--r--lib/Target/R600/SIFixSGPRCopies.cpp6
-rw-r--r--lib/Target/R600/SIISelLowering.cpp233
-rw-r--r--lib/Target/R600/SIISelLowering.h9
-rw-r--r--lib/Target/R600/SIInsertWaits.cpp6
-rw-r--r--lib/Target/R600/SIInstrFormats.td1
-rw-r--r--lib/Target/R600/SIInstrInfo.cpp499
-rw-r--r--lib/Target/R600/SIInstrInfo.h59
-rw-r--r--lib/Target/R600/SIInstrInfo.td163
-rw-r--r--lib/Target/R600/SIInstructions.td433
-rw-r--r--lib/Target/R600/SIIntrinsics.td16
-rw-r--r--lib/Target/R600/SILowerControlFlow.cpp70
-rw-r--r--lib/Target/R600/SIMachineFunctionInfo.cpp35
-rw-r--r--lib/Target/R600/SIMachineFunctionInfo.h28
-rw-r--r--lib/Target/R600/SIRegisterInfo.cpp4
-rw-r--r--lib/Target/R600/SIRegisterInfo.td13
-rw-r--r--lib/Target/R600/SITypeRewriter.cpp9
-rw-r--r--lib/Target/R600/TargetInfo/CMakeLists.txt4
-rw-r--r--lib/Target/R600/TargetInfo/LLVMBuild.txt2
-rw-r--r--lib/Target/Sparc/AsmParser/CMakeLists.txt3
-rw-r--r--lib/Target/Sparc/AsmParser/LLVMBuild.txt23
-rw-r--r--lib/Target/Sparc/AsmParser/Makefile15
-rw-r--r--lib/Target/Sparc/AsmParser/SparcAsmParser.cpp951
-rw-r--r--lib/Target/Sparc/CMakeLists.txt10
-rw-r--r--lib/Target/Sparc/DelaySlotFiller.cpp55
-rw-r--r--lib/Target/Sparc/Disassembler/CMakeLists.txt3
-rw-r--r--lib/Target/Sparc/Disassembler/LLVMBuild.txt23
-rw-r--r--lib/Target/Sparc/Disassembler/Makefile16
-rw-r--r--lib/Target/Sparc/Disassembler/SparcDisassembler.cpp483
-rw-r--r--lib/Target/Sparc/InstPrinter/CMakeLists.txt3
-rw-r--r--lib/Target/Sparc/InstPrinter/LLVMBuild.txt23
-rw-r--r--lib/Target/Sparc/InstPrinter/Makefile16
-rw-r--r--lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp176
-rw-r--r--lib/Target/Sparc/InstPrinter/SparcInstPrinter.h52
-rw-r--r--lib/Target/Sparc/LLVMBuild.txt8
-rw-r--r--lib/Target/Sparc/MCTargetDesc/CMakeLists.txt9
-rw-r--r--lib/Target/Sparc/MCTargetDesc/LLVMBuild.txt2
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp261
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h82
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp112
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h97
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp28
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h19
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp217
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp252
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h111
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp123
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h22
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp46
-rw-r--r--lib/Target/Sparc/Makefile7
-rw-r--r--lib/Target/Sparc/Sparc.h18
-rw-r--r--lib/Target/Sparc/Sparc.td29
-rw-r--r--lib/Target/Sparc/SparcAsmPrinter.cpp423
-rw-r--r--lib/Target/Sparc/SparcCallingConv.td11
-rw-r--r--lib/Target/Sparc/SparcCodeEmitter.cpp54
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.cpp18
-rw-r--r--lib/Target/Sparc/SparcISelLowering.cpp626
-rw-r--r--lib/Target/Sparc/SparcISelLowering.h14
-rw-r--r--lib/Target/Sparc/SparcInstr64Bit.td353
-rw-r--r--lib/Target/Sparc/SparcInstrAliases.td325
-rw-r--r--lib/Target/Sparc/SparcInstrFormats.td141
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.cpp13
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.td876
-rw-r--r--lib/Target/Sparc/SparcInstrVIS.td263
-rw-r--r--lib/Target/Sparc/SparcJITInfo.cpp229
-rw-r--r--lib/Target/Sparc/SparcMCInstLower.cpp109
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.cpp2
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.td13
-rw-r--r--lib/Target/Sparc/SparcRelocations.h17
-rw-r--r--lib/Target/Sparc/SparcSubtarget.cpp16
-rw-r--r--lib/Target/Sparc/SparcSubtarget.h15
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.cpp28
-rw-r--r--lib/Target/Sparc/SparcTargetObjectFile.cpp43
-rw-r--r--lib/Target/Sparc/SparcTargetObjectFile.h35
-rw-r--r--lib/Target/Sparc/SparcTargetStreamer.h49
-rw-r--r--lib/Target/Sparc/TargetInfo/CMakeLists.txt4
-rw-r--r--lib/Target/Sparc/TargetInfo/LLVMBuild.txt2
-rw-r--r--lib/Target/SystemZ/AsmParser/CMakeLists.txt4
-rw-r--r--lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp48
-rw-r--r--lib/Target/SystemZ/CMakeLists.txt2
-rw-r--r--lib/Target/SystemZ/Disassembler/CMakeLists.txt4
-rw-r--r--lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp10
-rw-r--r--lib/Target/SystemZ/InstPrinter/CMakeLists.txt4
-rw-r--r--lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h6
-rw-r--r--lib/Target/SystemZ/LLVMBuild.txt2
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp35
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp2
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h5
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp84
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h22
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp35
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp3
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h78
-rw-r--r--lib/Target/SystemZ/README.txt7
-rw-r--r--lib/Target/SystemZ/SystemZ.h187
-rw-r--r--lib/Target/SystemZ/SystemZ.td10
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.cpp18
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.h22
-rw-r--r--lib/Target/SystemZ/SystemZCallingConv.h14
-rw-r--r--lib/Target/SystemZ/SystemZConstantPoolValue.cpp2
-rw-r--r--lib/Target/SystemZ/SystemZConstantPoolValue.h20
-rw-r--r--lib/Target/SystemZ/SystemZElimCompare.cpp118
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.cpp110
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.h58
-rw-r--r--lib/Target/SystemZ/SystemZISelDAGToDAG.cpp78
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.cpp763
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.h422
-rw-r--r--lib/Target/SystemZ/SystemZInstrBuilder.h2
-rw-r--r--lib/Target/SystemZ/SystemZInstrFP.td62
-rw-r--r--lib/Target/SystemZ/SystemZInstrFormats.td13
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.cpp24
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.h277
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.td157
-rw-r--r--lib/Target/SystemZ/SystemZLongBranch.cpp203
-rw-r--r--lib/Target/SystemZ/SystemZMCInstLower.cpp2
-rw-r--r--lib/Target/SystemZ/SystemZMCInstLower.h2
-rw-r--r--lib/Target/SystemZ/SystemZMachineFunctionInfo.h2
-rw-r--r--lib/Target/SystemZ/SystemZOperators.td15
-rw-r--r--lib/Target/SystemZ/SystemZPatterns.td7
-rw-r--r--lib/Target/SystemZ/SystemZProcessors.td20
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.cpp19
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.h41
-rw-r--r--lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp10
-rw-r--r--lib/Target/SystemZ/SystemZSelectionDAGInfo.h41
-rw-r--r--lib/Target/SystemZ/SystemZShortenInst.cpp61
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.cpp3
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.h10
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.cpp11
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.h19
-rw-r--r--lib/Target/SystemZ/TargetInfo/CMakeLists.txt4
-rw-r--r--lib/Target/SystemZ/TargetInfo/LLVMBuild.txt2
-rw-r--r--lib/Target/Target.cpp8
-rw-r--r--lib/Target/TargetLibraryInfo.cpp74
-rw-r--r--lib/Target/TargetLoweringObjectFile.cpp45
-rw-r--r--lib/Target/TargetMachine.cpp36
-rw-r--r--lib/Target/TargetMachineC.cpp11
-rw-r--r--lib/Target/TargetSubtargetInfo.cpp2
-rw-r--r--lib/Target/X86/Android.mk2
-rw-r--r--lib/Target/X86/AsmParser/Android.mk3
-rw-r--r--lib/Target/X86/AsmParser/CMakeLists.txt5
-rw-r--r--lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp236
-rw-r--r--lib/Target/X86/AsmParser/X86AsmInstrumentation.h46
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp1322
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParserCommon.h43
-rw-r--r--lib/Target/X86/AsmParser/X86Operand.h488
-rw-r--r--lib/Target/X86/CMakeLists.txt7
-rw-r--r--lib/Target/X86/Disassembler/Android.mk5
-rw-r--r--lib/Target/X86/Disassembler/CMakeLists.txt12
-rw-r--r--lib/Target/X86/Disassembler/LLVMBuild.txt2
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.cpp162
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.h12
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.c384
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.h52
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h128
-rw-r--r--lib/Target/X86/InstPrinter/Android.mk2
-rw-r--r--lib/Target/X86/InstPrinter/CMakeLists.txt4
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp63
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.h35
-rw-r--r--lib/Target/X86/InstPrinter/X86InstComments.cpp119
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp56
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.h46
-rw-r--r--lib/Target/X86/MCTargetDesc/Android.mk2
-rw-r--r--lib/Target/X86/MCTargetDesc/CMakeLists.txt5
-rw-r--r--lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp85
-rw-r--r--lib/Target/X86/MCTargetDesc/X86BaseInfo.h278
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp35
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp4
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp31
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h18
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp662
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp27
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp12
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp9
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp8
-rw-r--r--lib/Target/X86/README-SSE.txt5
-rw-r--r--lib/Target/X86/README.txt50
-rw-r--r--lib/Target/X86/TargetInfo/Android.mk2
-rw-r--r--lib/Target/X86/TargetInfo/CMakeLists.txt4
-rw-r--r--lib/Target/X86/TargetInfo/LLVMBuild.txt2
-rw-r--r--lib/Target/X86/TargetInfo/X86TargetInfo.cpp3
-rw-r--r--lib/Target/X86/Utils/Android.mk2
-rw-r--r--lib/Target/X86/Utils/CMakeLists.txt4
-rw-r--r--lib/Target/X86/Utils/LLVMBuild.txt2
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.cpp1
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.h3
-rw-r--r--lib/Target/X86/X86.h6
-rw-r--r--lib/Target/X86/X86.td53
-rw-r--r--lib/Target/X86/X86AsmPrinter.cpp305
-rw-r--r--lib/Target/X86/X86AsmPrinter.h59
-rw-r--r--lib/Target/X86/X86CallingConv.h27
-rw-r--r--lib/Target/X86/X86CallingConv.td67
-rw-r--r--lib/Target/X86/X86CodeEmitter.cpp388
-rw-r--r--lib/Target/X86/X86FastISel.cpp209
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp4
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp20
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp123
-rw-r--r--lib/Target/X86/X86FrameLowering.h34
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp20
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp1596
-rw-r--r--lib/Target/X86/X86ISelLowering.h226
-rw-r--r--lib/Target/X86/X86InstrAVX512.td2394
-rw-r--r--lib/Target/X86/X86InstrArithmetic.td190
-rw-r--r--lib/Target/X86/X86InstrCMovSetCC.td13
-rw-r--r--lib/Target/X86/X86InstrCompiler.td214
-rw-r--r--lib/Target/X86/X86InstrControl.td119
-rw-r--r--lib/Target/X86/X86InstrExtension.td32
-rw-r--r--lib/Target/X86/X86InstrFMA.td61
-rw-r--r--lib/Target/X86/X86InstrFPStack.td232
-rw-r--r--lib/Target/X86/X86InstrFormats.td488
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td66
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp406
-rw-r--r--lib/Target/X86/X86InstrInfo.h250
-rw-r--r--lib/Target/X86/X86InstrInfo.td848
-rw-r--r--lib/Target/X86/X86InstrMMX.td12
-rw-r--r--lib/Target/X86/X86InstrSSE.td1514
-rw-r--r--lib/Target/X86/X86InstrSVM.td8
-rw-r--r--lib/Target/X86/X86InstrShiftRotate.td219
-rw-r--r--lib/Target/X86/X86InstrSystem.td305
-rw-r--r--lib/Target/X86/X86InstrTSX.td3
-rw-r--r--lib/Target/X86/X86InstrVMX.td32
-rw-r--r--lib/Target/X86/X86InstrXOP.td46
-rw-r--r--lib/Target/X86/X86JITInfo.h22
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp311
-rw-r--r--lib/Target/X86/X86PadShortFunction.cpp4
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp109
-rw-r--r--lib/Target/X86/X86RegisterInfo.h40
-rw-r--r--lib/Target/X86/X86RegisterInfo.td12
-rw-r--r--lib/Target/X86/X86SchedHaswell.td137
-rw-r--r--lib/Target/X86/X86SchedSandyBridge.td115
-rw-r--r--lib/Target/X86/X86Schedule.td39
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.h6
-rw-r--r--lib/Target/X86/X86Subtarget.cpp31
-rw-r--r--lib/Target/X86/X86Subtarget.h69
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp111
-rw-r--r--lib/Target/X86/X86TargetMachine.h103
-rw-r--r--lib/Target/X86/X86TargetObjectFile.cpp77
-rw-r--r--lib/Target/X86/X86TargetObjectFile.h27
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp410
-rw-r--r--lib/Target/X86/X86VZeroUpper.cpp346
-rw-r--r--lib/Target/XCore/CMakeLists.txt3
-rw-r--r--lib/Target/XCore/Disassembler/CMakeLists.txt2
-rw-r--r--lib/Target/XCore/InstPrinter/CMakeLists.txt4
-rw-r--r--lib/Target/XCore/LLVMBuild.txt2
-rw-r--r--lib/Target/XCore/MCTargetDesc/CMakeLists.txt5
-rw-r--r--lib/Target/XCore/MCTargetDesc/LLVMBuild.txt2
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp3
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp58
-rw-r--r--lib/Target/XCore/TargetInfo/CMakeLists.txt4
-rw-r--r--lib/Target/XCore/TargetInfo/LLVMBuild.txt2
-rw-r--r--lib/Target/XCore/XCore.h1
-rw-r--r--lib/Target/XCore/XCore.td6
-rw-r--r--lib/Target/XCore/XCoreAsmPrinter.cpp77
-rw-r--r--lib/Target/XCore/XCoreCallingConv.td6
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.cpp543
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.h3
-rw-r--r--lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp62
-rw-r--r--lib/Target/XCore/XCoreISelDAGToDAG.cpp27
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp424
-rw-r--r--lib/Target/XCore/XCoreISelLowering.h20
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.cpp54
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.h6
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.td45
-rw-r--r--lib/Target/XCore/XCoreLowerThreadLocal.cpp23
-rw-r--r--lib/Target/XCore/XCoreMCInstLower.cpp2
-rw-r--r--lib/Target/XCore/XCoreMachineFunctionInfo.cpp58
-rw-r--r--lib/Target/XCore/XCoreMachineFunctionInfo.h79
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.cpp282
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.h13
-rw-r--r--lib/Target/XCore/XCoreSelectionDAGInfo.cpp33
-rw-r--r--lib/Target/XCore/XCoreSelectionDAGInfo.h9
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp9
-rw-r--r--lib/Target/XCore/XCoreTargetObjectFile.cpp148
-rw-r--r--lib/Target/XCore/XCoreTargetObjectFile.h18
-rw-r--r--lib/Target/XCore/XCoreTargetStreamer.h27
-rw-r--r--lib/Target/XCore/XCoreTargetTransformInfo.cpp16
827 files changed, 108514 insertions, 23827 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 4de4faa..0297de1 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -33,6 +33,10 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64BranchFixupPass();
+/// \brief Creates an AArch64-specific Target Transformation Info pass.
+ImmutablePass *createAArch64TargetTransformInfoPass(
+ const AArch64TargetMachine *TM);
+
void LowerAArch64MachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AArch64AsmPrinter &AP);
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 9c2c69a..e49afd6 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -36,7 +36,25 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
include "AArch64Schedule.td"
-def : Processor<"generic", GenericItineraries, [FeatureFPARMv8]>;
+class ProcNoItin<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Processor<"generic", GenericItineraries, [FeatureFPARMv8, FeatureNEON]>;
+
+def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
+ "Cortex-A53 ARM processors",
+ [FeatureFPARMv8,
+ FeatureNEON,
+ FeatureCrypto]>;
+
+def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
+ "Cortex-A57 ARM processors",
+ [FeatureFPARMv8,
+ FeatureNEON,
+ FeatureCrypto]>;
+
+def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
+def : Processor<"cortex-a57", NoItineraries, [ProcA57]>;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -52,15 +70,8 @@ include "AArch64CallingConv.td"
include "AArch64InstrInfo.td"
-def AArch64InstrInfo : InstrInfo;
-
-//===----------------------------------------------------------------------===//
-// Assembly printer
-//===----------------------------------------------------------------------===//
-
-def A64InstPrinter : AsmWriter {
- string AsmWriterClassName = "InstPrinter";
- bit isMCAsmWriter = 1;
+def AArch64InstrInfo : InstrInfo {
+ let noNamedPositionallyEncodedOperands = 1;
}
//===----------------------------------------------------------------------===//
@@ -69,5 +80,4 @@ def A64InstPrinter : AsmWriter {
def AArch64 : Target {
let InstructionSet = AArch64InstrInfo;
- let AssemblyWriters = [A64InstPrinter];
}
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index d59ca56..f0b52d3 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -15,15 +15,15 @@
#define DEBUG_TYPE "asm-printer"
#include "AArch64AsmPrinter.h"
#include "InstPrinter/AArch64InstPrinter.h"
-#include "llvm/DebugInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/Mangler.h"
using namespace llvm;
@@ -114,9 +114,6 @@ bool AArch64AsmPrinter::printSymbolicAddress(const MachineOperand &MO,
case MachineOperand::MO_BlockAddress:
Name = GetBlockAddressSymbol(MO.getBlockAddress())->getName();
break;
- case MachineOperand::MO_ExternalSymbol:
- Name = MO.getSymbolName();
- break;
case MachineOperand::MO_ConstantPoolIndex:
Name = GetCPISymbol(MO.getIndex())->getName();
break;
@@ -238,7 +235,6 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
case MachineOperand::MO_BlockAddress:
case MachineOperand::MO_ConstantPoolIndex:
case MachineOperand::MO_GlobalAddress:
- case MachineOperand::MO_ExternalSymbol:
return printSymbolicAddress(MO, false, "", O);
}
@@ -268,7 +264,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
LowerAArch64MachineInstrToMCInst(MI, TmpInst, *this);
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
}
void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
@@ -300,6 +296,7 @@ bool AArch64AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Force static initialization.
extern "C" void LLVMInitializeAArch64AsmPrinter() {
- RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64Target);
+ RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
+ RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
}
diff --git a/lib/Target/AArch64/AArch64BranchFixupPass.cpp b/lib/Target/AArch64/AArch64BranchFixupPass.cpp
index 11e7f41..c03cdde 100644
--- a/lib/Target/AArch64/AArch64BranchFixupPass.cpp
+++ b/lib/Target/AArch64/AArch64BranchFixupPass.cpp
@@ -16,13 +16,13 @@
#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
using namespace llvm;
STATISTIC(NumSplit, "Number of uncond branches inserted");
@@ -238,10 +238,10 @@ static bool BBHasFallthrough(MachineBasicBlock *MBB) {
// Get the next machine basic block in the function.
MachineFunction::iterator MBBI = MBB;
// Can't fall off end of function.
- if (llvm::next(MBBI) == MBB->getParent()->end())
+ if (std::next(MBBI) == MBB->getParent()->end())
return false;
- MachineBasicBlock *NextBB = llvm::next(MBBI);
+ MachineBasicBlock *NextBB = std::next(MBBI);
for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
E = MBB->succ_end(); I != E; ++I)
if (*I == NextBB)
@@ -528,7 +528,7 @@ AArch64BranchFixup::fixupConditionalBr(ImmBranch &Br) {
++NumCBrFixed;
if (BMI != MI) {
- if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) &&
+ if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
BMI->getOpcode() == AArch64::Bimm) {
// Last MI in the BB is an unconditional branch. We can swap destinations:
// b.eq L1 (temporarily b.ne L1 after first change)
@@ -575,7 +575,7 @@ AArch64BranchFixup::fixupConditionalBr(ImmBranch &Br) {
// b L1
// splitbb/fallthroughbb:
// [old b L2/real continuation]
- MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
+ MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
DEBUG(dbgs() << " Insert B to BB#"
<< MI->getOperand(CondBrMBBOperand).getMBB()->getNumber()
diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td
index a2a9f3f..9fe6aae 100644
--- a/lib/Target/AArch64/AArch64CallingConv.td
+++ b/lib/Target/AArch64/AArch64CallingConv.td
@@ -60,7 +60,7 @@ def CC_A64_APCS : CallingConv<[
// registers. This makes sense because the PCS does not distinguish Short
// Vectors and Floating-point types.
CCIfType<[v1i16, v2i8], CCBitConvertToType<f16>>,
- CCIfType<[v1i32, v4i8, v2i16, v1f32], CCBitConvertToType<f32>>,
+ CCIfType<[v1i32, v4i8, v2i16], CCBitConvertToType<f32>>,
CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64, v1f64], CCBitConvertToType<f64>>,
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCBitConvertToType<f128>>,
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 7318230..b29587a 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -13,8 +13,8 @@
#include "AArch64.h"
#include "AArch64FrameLowering.h"
-#include "AArch64MachineFunctionInfo.h"
#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -92,14 +92,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
if (NeedsFrameMoves && NumInitialBytes) {
// We emit this update even if the CFA is set from a frame pointer later so
// that the CFA is valid in the interim.
- MCSymbol *SPLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL))
- .addSym(SPLabel);
-
MachineLocation Dst(MachineLocation::VirtualFP);
unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true);
- MMI.addFrameInst(
- MCCFIInstruction::createDefCfa(SPLabel, Reg, -NumInitialBytes));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfa(nullptr, Reg, -NumInitialBytes));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
// Otherwise we need to set the frame pointer and/or add a second stack
@@ -129,12 +127,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
- MFI->getStackSize());
if (NeedsFrameMoves) {
- MCSymbol *FPLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL))
- .addSym(FPLabel);
unsigned Reg = MRI->getDwarfRegNum(AArch64::X29, true);
unsigned Offset = MFI->getObjectOffset(X29FrameIdx);
- MMI.addFrameInst(MCCFIInstruction::createDefCfa(FPLabel, Reg, Offset));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfa(nullptr, Reg, Offset));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
FPNeedsSetting = false;
@@ -155,36 +153,29 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
if (!NeedsFrameMoves)
return;
- // Reuse the label if appropriate, so create it in this outer scope.
- MCSymbol *CSLabel = 0;
-
// The rest of the stack adjustment
if (!hasFP(MF) && NumResidualBytes) {
- CSLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL))
- .addSym(CSLabel);
-
MachineLocation Dst(MachineLocation::VirtualFP);
unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true);
unsigned Offset = NumResidualBytes + NumInitialBytes;
- MMI.addFrameInst(MCCFIInstruction::createDefCfa(CSLabel, Reg, -Offset));
+ unsigned CFIIndex =
+ MMI.addFrameInst(MCCFIInstruction::createDefCfa(nullptr, Reg, -Offset));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
// And any callee-saved registers (it's fine to leave them to the end here,
// because the old values are still valid at this point.
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
if (CSI.size()) {
- if (!CSLabel) {
- CSLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL))
- .addSym(CSLabel);
- }
-
for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
E = CSI.end(); I != E; ++I) {
unsigned Offset = MFI->getObjectOffset(I->getFrameIdx());
unsigned Reg = MRI->getDwarfRegNum(I->getReg(), true);
- MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, Reg, Offset));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, Offset));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
}
}
@@ -237,7 +228,7 @@ AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// Delete the pseudo instruction TC_RETURN.
- MachineInstr *NewMI = prior(MBBI);
+ MachineInstr *NewMI = std::prev(MBBI);
MBB.erase(MBBI);
MBBI = NewMI;
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index ef99541..dac4b32 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1113,15 +1113,6 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return CurDAG->SelectNodeTo(Node, AArch64::ADDxxi_lsl0_s, PtrTy,
TFI, CurDAG->getTargetConstant(0, PtrTy));
}
- case ISD::ConstantPool: {
- // Constant pools are fine, just create a Target entry.
- ConstantPoolSDNode *CN = cast<ConstantPoolSDNode>(Node);
- const Constant *C = CN->getConstVal();
- SDValue CP = CurDAG->getTargetConstantPool(C, CN->getValueType(0));
-
- ReplaceUses(SDValue(Node, 0), CP);
- return NULL;
- }
case ISD::Constant: {
SDNode *ResNode = 0;
if (cast<ConstantSDNode>(Node)->getZExtValue() == 0) {
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4fdb667..388973a 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26,17 +26,14 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
- const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
-
- if (Subtarget->isTargetLinux())
- return new AArch64LinuxTargetObjectFile();
- if (Subtarget->isTargetELF())
- return new TargetLoweringObjectFileELF();
- llvm_unreachable("unknown subtarget type");
+ assert (TM.getSubtarget<AArch64Subtarget>().isTargetELF() &&
+ "unknown subtarget type");
+ return new AArch64ElfTargetObjectFile();
}
AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
@@ -64,7 +61,6 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass);
addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
- addRegisterClass(MVT::v1f32, &AArch64::FPR32RegClass);
addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass);
addRegisterClass(MVT::v8i8, &AArch64::FPR64RegClass);
addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass);
@@ -141,6 +137,7 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setOperationAction(ISD::VAARG, MVT::Other, Expand);
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
@@ -155,6 +152,11 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
setOperationAction(ISD::CTPOP, MVT::i64, Expand);
@@ -267,6 +269,11 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+ // i128 shift operation support
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+
// This prevents LLVM trying to compress double constants into a floating
// constant-pool entry and trying to load from there. It's of doubtful benefit
// for A64: we'd need LDR followed by FCVT, I believe.
@@ -285,6 +292,15 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setExceptionSelectorRegister(AArch64::X1);
if (Subtarget->hasNEON()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v1i64, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v16i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Expand);
+
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
@@ -296,7 +312,6 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v1f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
@@ -315,16 +330,20 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i32, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i8, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i16, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
+
setOperationAction(ISD::SETCC, MVT::v8i8, Custom);
setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
setOperationAction(ISD::SETCC, MVT::v4i16, Custom);
@@ -333,7 +352,6 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
setOperationAction(ISD::SETCC, MVT::v1i64, Custom);
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
- setOperationAction(ISD::SETCC, MVT::v1f32, Custom);
setOperationAction(ISD::SETCC, MVT::v2f32, Custom);
setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
setOperationAction(ISD::SETCC, MVT::v1f64, Custom);
@@ -368,7 +386,161 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::v1f64, Legal);
setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v1i8, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v1i16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v1i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v1i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v1i16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v1i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v1i8, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v1i16, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v1i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::v1i8, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v1i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v1i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Custom);
+
+ // Neon does not support vector divide/remainder operations except
+ // floating-point divide.
+ setOperationAction(ISD::SDIV, MVT::v1i8, Expand);
+ setOperationAction(ISD::SDIV, MVT::v8i8, Expand);
+ setOperationAction(ISD::SDIV, MVT::v16i8, Expand);
+ setOperationAction(ISD::SDIV, MVT::v1i16, Expand);
+ setOperationAction(ISD::SDIV, MVT::v4i16, Expand);
+ setOperationAction(ISD::SDIV, MVT::v8i16, Expand);
+ setOperationAction(ISD::SDIV, MVT::v1i32, Expand);
+ setOperationAction(ISD::SDIV, MVT::v2i32, Expand);
+ setOperationAction(ISD::SDIV, MVT::v4i32, Expand);
+ setOperationAction(ISD::SDIV, MVT::v1i64, Expand);
+ setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::UDIV, MVT::v1i8, Expand);
+ setOperationAction(ISD::UDIV, MVT::v8i8, Expand);
+ setOperationAction(ISD::UDIV, MVT::v16i8, Expand);
+ setOperationAction(ISD::UDIV, MVT::v1i16, Expand);
+ setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
+ setOperationAction(ISD::UDIV, MVT::v8i16, Expand);
+ setOperationAction(ISD::UDIV, MVT::v1i32, Expand);
+ setOperationAction(ISD::UDIV, MVT::v2i32, Expand);
+ setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
+ setOperationAction(ISD::UDIV, MVT::v1i64, Expand);
+ setOperationAction(ISD::UDIV, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::SREM, MVT::v1i8, Expand);
+ setOperationAction(ISD::SREM, MVT::v8i8, Expand);
+ setOperationAction(ISD::SREM, MVT::v16i8, Expand);
+ setOperationAction(ISD::SREM, MVT::v1i16, Expand);
+ setOperationAction(ISD::SREM, MVT::v4i16, Expand);
+ setOperationAction(ISD::SREM, MVT::v8i16, Expand);
+ setOperationAction(ISD::SREM, MVT::v1i32, Expand);
+ setOperationAction(ISD::SREM, MVT::v2i32, Expand);
+ setOperationAction(ISD::SREM, MVT::v4i32, Expand);
+ setOperationAction(ISD::SREM, MVT::v1i64, Expand);
+ setOperationAction(ISD::SREM, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::UREM, MVT::v1i8, Expand);
+ setOperationAction(ISD::UREM, MVT::v8i8, Expand);
+ setOperationAction(ISD::UREM, MVT::v16i8, Expand);
+ setOperationAction(ISD::UREM, MVT::v1i16, Expand);
+ setOperationAction(ISD::UREM, MVT::v4i16, Expand);
+ setOperationAction(ISD::UREM, MVT::v8i16, Expand);
+ setOperationAction(ISD::UREM, MVT::v1i32, Expand);
+ setOperationAction(ISD::UREM, MVT::v2i32, Expand);
+ setOperationAction(ISD::UREM, MVT::v4i32, Expand);
+ setOperationAction(ISD::UREM, MVT::v1i64, Expand);
+ setOperationAction(ISD::UREM, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::FREM, MVT::v2f32, Expand);
+ setOperationAction(ISD::FREM, MVT::v4f32, Expand);
+ setOperationAction(ISD::FREM, MVT::v1f64, Expand);
+ setOperationAction(ISD::FREM, MVT::v2f64, Expand);
+
+ setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
+ setOperationAction(ISD::SELECT, MVT::v16i8, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v8i16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2i64, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2f64, Expand);
+
+ setOperationAction(ISD::SELECT_CC, MVT::v8i8, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v16i8, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v4i16, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v8i16, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v2i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v4i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v1i64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v2i64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v2f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v4f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v1f64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v2f64, Custom);
+
+ // Vector ExtLoad and TruncStore are expanded.
+ for (unsigned I = MVT::FIRST_VECTOR_VALUETYPE;
+ I <= MVT::LAST_VECTOR_VALUETYPE; ++I) {
+ MVT VT = (MVT::SimpleValueType) I;
+ setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+ for (unsigned II = MVT::FIRST_VECTOR_VALUETYPE;
+ II <= MVT::LAST_VECTOR_VALUETYPE; ++II) {
+ MVT VT1 = (MVT::SimpleValueType) II;
+ // A TruncStore has two vector types of the same number of elements
+ // and different element sizes.
+ if (VT.getVectorNumElements() == VT1.getVectorNumElements() &&
+ VT.getVectorElementType().getSizeInBits()
+ > VT1.getVectorElementType().getSizeInBits())
+ setTruncStoreAction(VT, VT1, Expand);
+ }
+ }
+
+ // There is no v1i64/v2i64 multiply, expand v1i64/v2i64 to GPR i64 multiply.
+  // FIXME: For a v2i64 multiply, we currently copy each half from VPR to GPR,
+  // do two i64 multiplies and copy the results back to VPR. This could
+  // instead be done with the following three NEON instructions:
+ // pmull v2.1q, v0.1d, v1.1d
+ // pmull2 v3.1q, v0.2d, v1.2d
+ // ins v2.d[1], v3.d[0]
+  // As we cannot currently verify the correctness of this assumption, the
+  // optimization is left for the future.
+ setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+ setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
}
+
+ setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::VSELECT);
}
EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
@@ -462,8 +634,7 @@ AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
const TargetRegisterClass *TRC
@@ -557,8 +728,7 @@ AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
unsigned scratch = MRI.createVirtualRegister(TRC);
@@ -641,8 +811,7 @@ AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
// thisMBB:
@@ -733,8 +902,7 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
MF->insert(It, EndBB);
// Transfer rest of current basic-block to EndBB
- EndBB->splice(EndBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)),
+ EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
MBB->end());
EndBB->transferSuccessorsAndUpdatePHIs(MBB);
@@ -922,8 +1090,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall";
- case AArch64ISD::NEON_BSL:
- return "AArch64ISD::NEON_BSL";
case AArch64ISD::NEON_MOVIMM:
return "AArch64ISD::NEON_MOVIMM";
case AArch64ISD::NEON_MVNIMM:
@@ -1118,7 +1284,8 @@ AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
FuncInfo->setVariadicFPRSize(FPRSaveSize);
}
- int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true);
+ unsigned StackOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), 8);
+ int StackIdx = MFI->CreateFixedObject(8, StackOffset, true);
FuncInfo->setVariadicStackIdx(StackIdx);
FuncInfo->setVariadicGPRIdx(GPRIdx);
@@ -1162,8 +1329,11 @@ AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
int Size = Flags.getByValSize();
unsigned NumRegs = (Size + 7) / 8;
+ uint32_t BEAlign = 0;
+ if (Size < 8 && !getSubtarget()->isLittle())
+ BEAlign = 8-Size;
unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs,
- VA.getLocMemOffset(),
+ VA.getLocMemOffset() + BEAlign,
false);
SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
InVals.push_back(FrameIdxN);
@@ -1197,7 +1367,8 @@ AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
break;
case CCValAssign::SExt:
case CCValAssign::ZExt:
- case CCValAssign::AExt: {
+ case CCValAssign::AExt:
+ case CCValAssign::FPExt: {
unsigned DestSize = VA.getValVT().getSizeInBits();
unsigned DestSubReg;
@@ -1319,6 +1490,12 @@ AArch64TargetLowering::LowerReturn(SDValue Chain,
&RetOps[0], RetOps.size());
}
+unsigned AArch64TargetLowering::getByValTypeAlignment(Type *Ty) const {
+ // This is a new backend. For anything more precise than this a FE should
+ // set an explicit alignment.
+ return 4;
+}
+
SDValue
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
@@ -1412,7 +1589,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
case CCValAssign::Full: break;
case CCValAssign::SExt:
case CCValAssign::ZExt:
- case CCValAssign::AExt: {
+ case CCValAssign::AExt:
+ case CCValAssign::FPExt: {
unsigned SrcSize = VA.getValVT().getSizeInBits();
unsigned SrcSubReg;
@@ -1464,7 +1642,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// loaded before this eventual operation. Otherwise they'll be clobbered.
Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
} else {
- SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset());
+ uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize()*8 :
+ VA.getLocVT().getSizeInBits();
+ OpSize = (OpSize + 7) / 8;
+ uint32_t BEAlign = 0;
+ if (OpSize < 8 && !getSubtarget()->isLittle())
+ BEAlign = 8-OpSize;
+ SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + BEAlign);
DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset());
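The BEAlign adjustments above (in LowerFormalArguments and in LowerCall) exist because, on a big-endian target, an argument narrower than its 8-byte stack slot sits at the high-address end of that slot. A rough standalone sketch of the offset computation, using a hypothetical helper name rather than the backend's API:

    #include <cstdint>

    // Byte offset at which a small argument actually sits inside its 8-byte
    // stack slot. On little-endian it starts at the slot base; on big-endian it
    // is shifted up by (8 - size) so the significant bytes line up.
    uint32_t argOffsetInSlot(uint32_t slotOffset, uint32_t sizeInBytes,
                             bool isLittleEndian) {
      uint32_t beAlign = (!isLittleEndian && sizeInBytes < 8) ? 8 - sizeInBytes : 0;
      return slotOffset + beAlign;
    }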
@@ -2076,9 +2260,89 @@ AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
return LowerF128ToCall(Op, DAG, LC);
}
+static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+ bool IsSigned) {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ SDValue Vec = Op.getOperand(0);
+ EVT OpVT = Vec.getValueType();
+ unsigned Opc = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+
+ if (VT.getVectorNumElements() == 1) {
+ assert(OpVT == MVT::v1f64 && "Unexpected vector type!");
+ if (VT.getSizeInBits() == OpVT.getSizeInBits())
+ return Op;
+ return DAG.UnrollVectorOp(Op.getNode());
+ }
+
+ if (VT.getSizeInBits() > OpVT.getSizeInBits()) {
+ assert(Vec.getValueType() == MVT::v2f32 && VT == MVT::v2i64 &&
+ "Unexpected vector type!");
+ Vec = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Vec);
+ return DAG.getNode(Opc, dl, VT, Vec);
+ } else if (VT.getSizeInBits() < OpVT.getSizeInBits()) {
+ EVT CastVT = EVT::getIntegerVT(*DAG.getContext(),
+ OpVT.getVectorElementType().getSizeInBits());
+ CastVT =
+ EVT::getVectorVT(*DAG.getContext(), CastVT, VT.getVectorNumElements());
+ Vec = DAG.getNode(Opc, dl, CastVT, Vec);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Vec);
+ }
+ return DAG.getNode(Opc, dl, VT, Vec);
+}
+
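For readers unfamiliar with the widening/narrowing dance in LowerVectorFP_TO_INT above, the per-lane effect (shown here for the signed case) matches these scalar conversions. This is a standalone illustrative sketch, not LLVM code:

    #include <cstdint>

    // v2f32 -> v2i64: no single NEON conversion exists, so each f32 lane is
    // first extended to f64 and then converted (FP_EXTEND, then FP_TO_SINT).
    int64_t f32ToI64(float F) {
      return static_cast<int64_t>(static_cast<double>(F));
    }

    // v2f64 -> v2i32: convert at the source width first, then truncate the
    // integer result (FP_TO_SINT, then TRUNCATE).
    int32_t f64ToI32(double D) {
      return static_cast<int32_t>(static_cast<int64_t>(D));
    }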
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+ // We custom lower concat_vectors with 4, 8, or 16 operands that are all the
+ // same operand and of type v1* using the DUP instruction.
+ unsigned NumOps = Op->getNumOperands();
+ if (NumOps == 2) {
+ assert(Op.getValueType().getSizeInBits() == 128 && "unexpected concat");
+ return Op;
+ }
+
+ if (NumOps != 4 && NumOps != 8 && NumOps != 16)
+ return SDValue();
+
+ // Must be a single value for VDUP.
+ SDValue Op0 = Op.getOperand(0);
+ for (unsigned i = 1; i < NumOps; ++i) {
+ SDValue OpN = Op.getOperand(i);
+ if (Op0 != OpN)
+ return SDValue();
+ }
+
+ // Verify the value type.
+ EVT EltVT = Op0.getValueType();
+ switch (NumOps) {
+ default: llvm_unreachable("Unexpected number of operands");
+ case 4:
+ if (EltVT != MVT::v1i16 && EltVT != MVT::v1i32)
+ return SDValue();
+ break;
+ case 8:
+ if (EltVT != MVT::v1i8 && EltVT != MVT::v1i16)
+ return SDValue();
+ break;
+ case 16:
+ if (EltVT != MVT::v1i8)
+ return SDValue();
+ break;
+ }
+
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ // VDUP produces better code for constants.
+ if (Op0->getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Op0->getOperand(0));
+ return DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, Op0,
+ DAG.getConstant(0, MVT::i64));
+}
+
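A concat_vectors whose operands are all the same one-element vector is just a broadcast, which is why the custom lowering above emits NEON_VDUP/NEON_VDUPLANE instead of a chain of inserts. A standalone sketch of the equivalent behaviour, illustrative only:

    #include <array>
    #include <cstdint>

    // Concatenating eight identical v1i16 values is the same as splatting one
    // 16-bit value into every lane, i.e. what a single DUP does.
    std::array<int16_t, 8> concatEightCopies(int16_t X) {
      std::array<int16_t, 8> V;
      V.fill(X);
      return V;
    }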
SDValue
AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
bool IsSigned) const {
+ if (Op.getValueType().isVector())
+ return LowerVectorFP_TO_INT(Op, DAG, IsSigned);
if (Op.getOperand(0).getValueType() != MVT::f128) {
// It's legal except when f128 is involved
return Op;
@@ -2098,6 +2362,9 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) co
MachineFrameInfo *MFI = MF.getFrameInfo();
MFI->setReturnAddressIsTaken(true);
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
EVT VT = Op.getValueType();
SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -2254,6 +2521,36 @@ AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
}
}
+SDValue
+AArch64TargetLowering::LowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT PtrVT = getPointerTy();
+ ConstantPoolSDNode *CN = cast<ConstantPoolSDNode>(Op);
+ const Constant *C = CN->getConstVal();
+
+ switch(getTargetMachine().getCodeModel()) {
+ case CodeModel::Small:
+ // The most efficient code is PC-relative anyway for the small memory model,
+    // so we don't need to worry about the relocation model.
+ return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
+ DAG.getTargetConstantPool(C, PtrVT, 0, 0,
+ AArch64II::MO_NO_FLAG),
+ DAG.getTargetConstantPool(C, PtrVT, 0, 0,
+ AArch64II::MO_LO12),
+ DAG.getConstant(CN->getAlignment(), MVT::i32));
+ case CodeModel::Large:
+ return DAG.getNode(
+ AArch64ISD::WrapperLarge, DL, PtrVT,
+ DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G3),
+ DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC),
+ DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC),
+ DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC));
+ default:
+ llvm_unreachable("Only small and large code models supported now");
+ }
+}
+
SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
SDValue DescAddr,
SDLoc DL,
@@ -2391,9 +2688,42 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
+static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+ bool IsSigned) {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ SDValue Vec = Op.getOperand(0);
+ unsigned Opc = IsSigned ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+
+ if (VT.getVectorNumElements() == 1) {
+ assert(VT == MVT::v1f64 && "Unexpected vector type!");
+ if (VT.getSizeInBits() == Vec.getValueSizeInBits())
+ return Op;
+ return DAG.UnrollVectorOp(Op.getNode());
+ }
+
+ if (VT.getSizeInBits() < Vec.getValueSizeInBits()) {
+ assert(Vec.getValueType() == MVT::v2i64 && VT == MVT::v2f32 &&
+ "Unexpected vector type!");
+ Vec = DAG.getNode(Opc, dl, MVT::v2f64, Vec);
+ return DAG.getNode(ISD::FP_ROUND, dl, VT, Vec, DAG.getIntPtrConstant(0));
+ } else if (VT.getSizeInBits() > Vec.getValueSizeInBits()) {
+ unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ EVT CastVT = EVT::getIntegerVT(*DAG.getContext(),
+ VT.getVectorElementType().getSizeInBits());
+ CastVT =
+ EVT::getVectorVT(*DAG.getContext(), CastVT, VT.getVectorNumElements());
+ Vec = DAG.getNode(CastOpc, dl, CastVT, Vec);
+ }
+
+ return DAG.getNode(Opc, dl, VT, Vec);
+}
+
SDValue
AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
bool IsSigned) const {
+ if (Op.getValueType().isVector())
+ return LowerVectorINT_TO_FP(Op, DAG, IsSigned);
if (Op.getValueType() != MVT::f128) {
// Legal for everything except f128.
return Op;
@@ -2436,62 +2766,6 @@ AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
}
}
-// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
-SDValue
-AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- SDValue IfTrue = Op.getOperand(2);
- SDValue IfFalse = Op.getOperand(3);
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
-
- if (LHS.getValueType() == MVT::f128) {
- // f128 comparisons are lowered to libcalls, but slot in nicely here
- // afterwards.
- softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
-
- // If softenSetCCOperands returned a scalar, we need to compare the result
- // against zero to select between true and false values.
- if (RHS.getNode() == 0) {
- RHS = DAG.getConstant(0, LHS.getValueType());
- CC = ISD::SETNE;
- }
- }
-
- if (LHS.getValueType().isInteger()) {
- SDValue A64cc;
-
- // Integers are handled in a separate function because the combinations of
- // immediates and tests can get hairy and we may want to fiddle things.
- SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
-
- return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
- CmpOp, IfTrue, IfFalse, A64cc);
- }
-
- // Note that some LLVM floating-point CondCodes can't be lowered to a single
- // conditional branch, hence FPCCToA64CC can set a second test, where either
- // passing is sufficient.
- A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
- CondCode = FPCCToA64CC(CC, Alternative);
- SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
- SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
- DAG.getCondCode(CC));
- SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl,
- Op.getValueType(),
- SetCC, IfTrue, IfFalse, A64cc);
-
- if (Alternative != A64CC::Invalid) {
- A64cc = DAG.getConstant(Alternative, MVT::i32);
- A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
- SetCC, IfTrue, A64SELECT_CC, A64cc);
-
- }
-
- return A64SELECT_CC;
-}
-
// (SELECT testbit, iftrue, iffalse)
SDValue
AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -2779,10 +3053,157 @@ AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return A64SELECT_CC;
}
+static SDValue LowerVectorSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue IfTrue = Op.getOperand(2);
+ SDValue IfFalse = Op.getOperand(3);
+ EVT IfTrueVT = IfTrue.getValueType();
+ EVT CondVT = IfTrueVT.changeVectorElementTypeToInteger();
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+ // If LHS & RHS are floating point and IfTrue & IfFalse are vectors, we will
+  // use a NEON compare.
+ if ((LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64)) {
+ EVT EltVT = LHS.getValueType();
+ unsigned EltNum = 128 / EltVT.getSizeInBits();
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), EltVT, EltNum);
+ unsigned SubConstant =
+ (LHS.getValueType() == MVT::f32) ? AArch64::sub_32 :AArch64::sub_64;
+ EVT CEltT = (LHS.getValueType() == MVT::f32) ? MVT::i32 : MVT::i64;
+ EVT CVT = EVT::getVectorVT(*DAG.getContext(), CEltT, EltNum);
+
+ LHS
+ = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
+ VT, DAG.getTargetConstant(0, MVT::i32), LHS,
+ DAG.getTargetConstant(SubConstant, MVT::i32)), 0);
+ RHS
+ = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
+ VT, DAG.getTargetConstant(0, MVT::i32), RHS,
+ DAG.getTargetConstant(SubConstant, MVT::i32)), 0);
+
+ SDValue VSetCC = DAG.getSetCC(dl, CVT, LHS, RHS, CC);
+ SDValue ResCC = LowerVectorSETCC(VSetCC, DAG);
+ if (CEltT.getSizeInBits() < IfTrueVT.getSizeInBits()) {
+ EVT DUPVT =
+ EVT::getVectorVT(*DAG.getContext(), CEltT,
+ IfTrueVT.getSizeInBits() / CEltT.getSizeInBits());
+ ResCC = DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, DUPVT, ResCC,
+ DAG.getConstant(0, MVT::i64, false));
+
+ ResCC = DAG.getNode(ISD::BITCAST, dl, CondVT, ResCC);
+ } else {
+ // FIXME: If IfTrue & IfFalse hold v1i8, v1i16 or v1i32, this function
+ // can't handle them and will hit this assert.
+ assert(CEltT.getSizeInBits() == IfTrueVT.getSizeInBits() &&
+ "Vector of IfTrue & IfFalse is too small.");
+
+ unsigned ExEltNum =
+ EltNum * IfTrueVT.getSizeInBits() / ResCC.getValueSizeInBits();
+ EVT ExVT = EVT::getVectorVT(*DAG.getContext(), CEltT, ExEltNum);
+ ResCC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExVT, ResCC,
+ DAG.getConstant(0, MVT::i64, false));
+ ResCC = DAG.getNode(ISD::BITCAST, dl, CondVT, ResCC);
+ }
+ SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(),
+ ResCC, IfTrue, IfFalse);
+ return VSelect;
+ }
+
+ // Here we handle the case that LHS & RHS are integer and IfTrue & IfFalse are
+ // vectors.
+ A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
+ CondCode = FPCCToA64CC(CC, Alternative);
+ SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
+ SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
+ DAG.getCondCode(CC));
+ EVT SEVT = MVT::i32;
+ if (IfTrue.getValueType().getVectorElementType().getSizeInBits() > 32)
+ SEVT = MVT::i64;
+ SDValue AllOne = DAG.getConstant(-1, SEVT);
+ SDValue AllZero = DAG.getConstant(0, SEVT);
+ SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, SEVT, SetCC,
+ AllOne, AllZero, A64cc);
+
+ if (Alternative != A64CC::Invalid) {
+ A64cc = DAG.getConstant(Alternative, MVT::i32);
+ A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
+ SetCC, AllOne, A64SELECT_CC, A64cc);
+ }
+ SDValue VDup;
+ if (IfTrue.getValueType().getVectorNumElements() == 1)
+ VDup = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, CondVT, A64SELECT_CC);
+ else
+ VDup = DAG.getNode(AArch64ISD::NEON_VDUP, dl, CondVT, A64SELECT_CC);
+ SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(),
+ VDup, IfTrue, IfFalse);
+ return VSelect;
+}
+
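The scalar-compare/vector-result path in LowerVectorSELECT_CC above boils down to: evaluate the comparison once, splat the all-ones/all-zeros result into a lane mask of the selected vectors' width, and blend bitwise. A rough standalone sketch for a v2f32 result and a SETLT condition (illustrative only, not the backend API; names are hypothetical):

    #include <array>
    #include <cstdint>
    #include <cstring>

    std::array<float, 2> selectCCv2f32(float LHS, float RHS,
                                       std::array<float, 2> IfTrue,
                                       std::array<float, 2> IfFalse) {
      // One scalar compare produces the mask lane, which is then splatted.
      uint32_t MaskLane = (LHS < RHS) ? 0xFFFFFFFFu : 0u;
      std::array<float, 2> Out;
      for (int I = 0; I < 2; ++I) {
        uint32_t T, F, R;
        std::memcpy(&T, &IfTrue[I], 4);
        std::memcpy(&F, &IfFalse[I], 4);
        R = (T & MaskLane) | (F & ~MaskLane); // the bitwise blend a VSELECT/BSL does
        std::memcpy(&Out[I], &R, 4);
      }
      return Out;
    }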
+// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
+SDValue
+AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue IfTrue = Op.getOperand(2);
+ SDValue IfFalse = Op.getOperand(3);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+ if (IfTrue.getValueType().isVector())
+ return LowerVectorSELECT_CC(Op, DAG);
+
+ if (LHS.getValueType() == MVT::f128) {
+ // f128 comparisons are lowered to libcalls, but slot in nicely here
+ // afterwards.
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+ // If softenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (RHS.getNode() == 0) {
+ RHS = DAG.getConstant(0, LHS.getValueType());
+ CC = ISD::SETNE;
+ }
+ }
+
+ if (LHS.getValueType().isInteger()) {
+ SDValue A64cc;
+
+ // Integers are handled in a separate function because the combinations of
+ // immediates and tests can get hairy and we may want to fiddle things.
+ SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
+
+ return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), CmpOp,
+ IfTrue, IfFalse, A64cc);
+ }
+
+ // Note that some LLVM floating-point CondCodes can't be lowered to a single
+ // conditional branch, hence FPCCToA64CC can set a second test, where either
+ // passing is sufficient.
+ A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
+ CondCode = FPCCToA64CC(CC, Alternative);
+ SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
+ SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
+ DAG.getCondCode(CC));
+ SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl,
+ Op.getValueType(),
+ SetCC, IfTrue, IfFalse, A64cc);
+
+ if (Alternative != A64CC::Invalid) {
+ A64cc = DAG.getConstant(Alternative, MVT::i32);
+ A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
+ SetCC, IfTrue, A64SELECT_CC, A64cc);
+
+ }
+
+ return A64SELECT_CC;
+}
+
SDValue
AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
- const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
// We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes
// rather than just 8.
@@ -2880,10 +3301,15 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
+ case ISD::SRL_PARTS:
+ case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
+
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::BR_CC: return LowerBR_CC(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
@@ -2893,6 +3319,7 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::BUILD_VECTOR:
return LowerBUILD_VECTOR(Op, DAG, getSubtarget());
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
}
@@ -3413,14 +3840,12 @@ static SDValue PerformORCombine(SDNode *N,
BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
APInt SplatBits1;
if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
- HasAnyUndefs) &&
- !HasAnyUndefs && SplatBits0 == ~SplatBits1) {
- // Canonicalize the vector type to make instruction selection simpler.
- EVT CanonicalVT = VT.is128BitVector() ? MVT::v16i8 : MVT::v8i8;
- SDValue Result = DAG.getNode(AArch64ISD::NEON_BSL, DL, CanonicalVT,
- N0->getOperand(1), N0->getOperand(0),
- N1->getOperand(0));
- return DAG.getNode(ISD::BITCAST, DL, VT, Result);
+ HasAnyUndefs) && !HasAnyUndefs &&
+ SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
+ SplatBits0 == ~SplatBits1) {
+
+ return DAG.getNode(ISD::VSELECT, DL, VT, N0->getOperand(1),
+ N0->getOperand(0), N1->getOperand(0));
}
}
}
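The OR combine above now rewrites the masked-merge idiom directly into a VSELECT whose condition is the splatted mask (previously a NEON_BSL node). The scalar shape of the pattern being matched, as a standalone sketch:

    #include <cstdint>

    // (X & M) | (Y & ~M) picks bits of X where M is set and bits of Y elsewhere;
    // with M a splatted lane mask this is exactly a vector select (BSL on NEON).
    uint32_t bitwiseSelect(uint32_t M, uint32_t X, uint32_t Y) {
      return (X & M) | (Y & ~M);
    }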
@@ -3506,7 +3931,25 @@ static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) {
return (Cnt >= 1 && Cnt <= ElementBits);
}
-/// Checks for immediate versions of vector shifts and lowers them.
+static SDValue GenForSextInreg(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ EVT SrcVT, EVT DestVT, EVT SubRegVT,
+ const int *Mask, SDValue Src) {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Bitcast
+ = DAG.getNode(ISD::BITCAST, SDLoc(N), SrcVT, Src);
+ SDValue Sext
+ = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), DestVT, Bitcast);
+ SDValue ShuffleVec
+ = DAG.getVectorShuffle(DestVT, SDLoc(N), Sext, DAG.getUNDEF(DestVT), Mask);
+ SDValue ExtractSubreg
+ = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N),
+ SubRegVT, ShuffleVec,
+ DAG.getTargetConstant(AArch64::sub_64, MVT::i32)), 0);
+ return ExtractSubreg;
+}
+
+/// Checks for vector shifts and lowers them.
static SDValue PerformShiftCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *ST) {
@@ -3515,6 +3958,51 @@ static SDValue PerformShiftCombine(SDNode *N,
if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64))
return PerformSRACombine(N, DCI);
+  // We're looking for an SRA/SHL pair to help generate the instruction
+ // sshll v0.8h, v0.8b, #0
+  // The instruction SXTL is an alias of this form (SSHLL with a #0 shift).
+ //
+  // For example, for a DAG like the one below,
+ // v2i32 = sra (v2i32 (shl v2i32, 16)), 16
+ // we can transform it into
+ // v2i32 = EXTRACT_SUBREG
+  //        (v4i32 (shuffle_vector
+ // (v4i32 (sext (v4i16 (bitcast v2i32))),
+ // undef, (0, 2, u, u)),
+ // sub_64
+ //
+ // With this transformation we expect to generate "SSHLL + UZIP1"
+ // Sometimes UZIP1 can be optimized away by combining with other context.
+ int64_t ShrCnt, ShlCnt;
+ if (N->getOpcode() == ISD::SRA
+ && (VT == MVT::v2i32 || VT == MVT::v4i16)
+ && isVShiftRImm(N->getOperand(1), VT, ShrCnt)
+ && N->getOperand(0).getOpcode() == ISD::SHL
+ && isVShiftRImm(N->getOperand(0).getOperand(1), VT, ShlCnt)) {
+ SDValue Src = N->getOperand(0).getOperand(0);
+ if (VT == MVT::v2i32 && ShrCnt == 16 && ShlCnt == 16) {
+ // sext_inreg(v2i32, v2i16)
+      // We essentially only care about the mask {0, 2, u, u}
+ int Mask[4] = {0, 2, 4, 6};
+ return GenForSextInreg(N, DCI, MVT::v4i16, MVT::v4i32, MVT::v2i32,
+ Mask, Src);
+ }
+ else if (VT == MVT::v2i32 && ShrCnt == 24 && ShlCnt == 24) {
+ // sext_inreg(v2i16, v2i8)
+      // We essentially only care about the mask {0, u, 4, u, u, u, u, u, u, u, u, u}
+ int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14};
+ return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v2i32,
+ Mask, Src);
+ }
+ else if (VT == MVT::v4i16 && ShrCnt == 8 && ShlCnt == 8) {
+ // sext_inreg(v4i16, v4i8)
+      // We essentially only care about the mask {0, 2, 4, 6, u, u, u, u, u, u, u, u}
+ int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14};
+ return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v4i16,
+ Mask, Src);
+ }
+ }
+
// Nothing to be done for scalar shifts.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!VT.isVector() || !TLI.isTypeLegal(VT))
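The SRA/SHL matching added in the hunk above recognizes the classic sign-extend-in-register idiom. Its scalar form, as a standalone sketch (relies on the usual two's-complement / arithmetic-shift behaviour of real targets):

    #include <cstdint>

    // Shifting left and then arithmetic-shifting right by the same amount keeps
    // only the low bits and sign-extends them: (X << 16) >> 16 on an i32 equals
    // (int32_t)(int16_t)X, which is what SSHLL #0 (alias SXTL) provides per lane.
    int32_t signExtendLow16(uint32_t X) {
      return static_cast<int32_t>(X << 16) >> 16;
    }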
@@ -3796,6 +4284,89 @@ static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return SDValue(N, 0);
}
+// v1i1 setcc ->
+// v1i1 (bitcast (i1 setcc (extract_vector_elt, extract_vector_elt))
+// FIXME: Currently the type legalizer can't handle SETCC having v1i1 as result.
+// If it can legalize "v1i1 SETCC" correctly, no need to combine such SETCC.
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT ResVT = N->getValueType(0);
+
+ if (!ResVT.isVector() || ResVT.getVectorNumElements() != 1 ||
+ ResVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ EVT CmpVT = LHS.getValueType();
+ LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
+ CmpVT.getVectorElementType(), LHS,
+ DAG.getConstant(0, MVT::i64));
+ RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
+ CmpVT.getVectorElementType(), RHS,
+ DAG.getConstant(0, MVT::i64));
+ SDValue SetCC =
+ DAG.getSetCC(SDLoc(N), MVT::i1, LHS, RHS,
+ cast<CondCodeSDNode>(N->getOperand(2))->get());
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), ResVT, SetCC);
+}
+
+// vselect (v1i1 setcc) ->
+// vselect (v1iXX setcc) (XX is the size of the compared operand type)
+// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
+// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
+// such VSELECT.
+static SDValue PerformVSelectCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ EVT CCVT = N0.getValueType();
+
+ if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
+ CCVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ EVT ResVT = N->getValueType(0);
+ EVT CmpVT = N0.getOperand(0).getValueType();
+ // Only combine when the result type is of the same size as the compared
+ // operands.
+ if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
+ return SDValue();
+
+ SDValue IfTrue = N->getOperand(1);
+ SDValue IfFalse = N->getOperand(2);
+ SDValue SetCC =
+ DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
+ N0.getOperand(0), N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
+ IfTrue, IfFalse);
+}
+
+// sign_extend (extract_vector_elt (v1i1 setcc)) ->
+// extract_vector_elt (v1iXX setcc)
+// (XX is the size of the compared operand type)
+static SDValue PerformSignExtendCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue Vec = N0.getOperand(0);
+
+ if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Vec.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ EVT ResVT = N->getValueType(0);
+ EVT CmpVT = Vec.getOperand(0).getValueType();
+ // Only optimize when the result type is of the same size as the element
+ // type of the compared operand.
+ if (ResVT.getSizeInBits() != CmpVT.getVectorElementType().getSizeInBits())
+ return SDValue();
+
+ SDValue Lane = N0.getOperand(1);
+ SDValue SetCC =
+ DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
+ Vec.getOperand(0), Vec.getOperand(1),
+ cast<CondCodeSDNode>(Vec.getOperand(2))->get());
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ResVT,
+ SetCC, Lane);
+}
+
SDValue
AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3807,6 +4378,9 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SRA:
case ISD::SRL:
return PerformShiftCombine(N, DCI, getSubtarget());
+ case ISD::SETCC: return PerformSETCCCombine(N, DCI.DAG);
+ case ISD::VSELECT: return PerformVSelectCombine(N, DCI.DAG);
+ case ISD::SIGN_EXTEND: return PerformSignExtendCombine(N, DCI.DAG);
case ISD::INTRINSIC_WO_CHAIN:
return PerformIntrinsicCombine(N, DCI.DAG);
case AArch64ISD::NEON_VDUPLANE:
@@ -3866,22 +4440,76 @@ AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
return false;
}
+// Check whether a shuffle_vector could be represented as a concat_vector.
+bool AArch64TargetLowering::isConcatVector(SDValue Op, SelectionDAG &DAG,
+ SDValue V0, SDValue V1,
+ const int *Mask,
+ SDValue &Res) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ if (VT.getSizeInBits() != 128)
+ return false;
+ if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
+ VT.getVectorElementType() != V1.getValueType().getVectorElementType())
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+  bool isConcatVector = true;
+ bool splitV0 = false;
+ if (V0.getValueType().getSizeInBits() == 128)
+ splitV0 = true;
+
+ for (int I = 0, E = NumElts / 2; I != E; I++) {
+ if (Mask[I] != I) {
+      isConcatVector = false;
+ break;
+ }
+ }
-// Check whether a Build Vector could be presented as Shuffle Vector. If yes,
-// try to call LowerVECTOR_SHUFFLE to lower it.
+  if (isConcatVector) {
+ int offset = NumElts / 2;
+ for (int I = NumElts / 2, E = NumElts; I != E; I++) {
+ if (Mask[I] != I + splitV0 * offset) {
+        isConcatVector = false;
+ break;
+ }
+ }
+ }
+
+  if (isConcatVector) {
+ EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ NumElts / 2);
+ if (splitV0) {
+ V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
+ DAG.getConstant(0, MVT::i64));
+ }
+ if (V1.getValueType().getSizeInBits() == 128) {
+ V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
+ DAG.getConstant(0, MVT::i64));
+ }
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
+ return true;
+ }
+ return false;
+}
+
+// Check whether a Build Vector could be represented as a Shuffle Vector.
+// The resulting Shuffle Vector may not be legal, so the lengths of its
+// operands and of its result may differ.
bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG,
- SDValue &Res) const {
+ SDValue &V0, SDValue &V1,
+ int *Mask) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
unsigned V0NumElts = 0;
- int Mask[16];
- SDValue V0, V1;
// Check if all elements are extracted from less than 3 vectors.
for (unsigned i = 0; i < NumElts; ++i) {
SDValue Elt = Op.getOperand(i);
- if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Elt.getOperand(0).getValueType().getVectorElementType() !=
+ VT.getVectorElementType())
return false;
if (V0.getNode() == 0) {
@@ -3902,22 +4530,86 @@ bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG,
return false;
}
}
+ return true;
+}
- if (!V1.getNode() && V0NumElts == NumElts * 2) {
- V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
- DAG.getConstant(NumElts, MVT::i64));
- V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
- DAG.getConstant(0, MVT::i64));
- V0NumElts = V0.getValueType().getVectorNumElements();
- }
+/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which return two
+/// i64 values and take a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a quad-shift!");
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+ assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+ DAG.getConstant(VTBits, MVT::i64), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+ DAG.getConstant(VTBits, MVT::i64));
+ SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+ SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+ SDValue Tmp3 = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+
+ SDValue A64cc;
+ SDValue CmpOp = getSelectableIntSetCC(ExtraShAmt,
+ DAG.getConstant(0, MVT::i64),
+ ISD::SETGE, A64cc,
+ DAG, dl);
+
+ SDValue Hi = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
+ DAG.getConstant(0, Tmp3.getValueType()), Tmp3,
+ A64cc);
+ SDValue Lo = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
+ TrueVal, FalseVal, A64cc);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
- if (V1.getNode() && NumElts == V0NumElts &&
- V0NumElts == V1.getValueType().getVectorNumElements()) {
- SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask);
- Res = LowerVECTOR_SHUFFLE(Shuffle, DAG);
- return true;
- } else
- return false;
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
+/// i64 values and take a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a quad-shift!");
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+
+ assert(Op.getOpcode() == ISD::SHL_PARTS);
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+ DAG.getConstant(VTBits, MVT::i64), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+ DAG.getConstant(VTBits, MVT::i64));
+ SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+ SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+ SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue Tmp4 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+
+ SDValue A64cc;
+ SDValue CmpOp = getSelectableIntSetCC(ExtraShAmt,
+ DAG.getConstant(0, MVT::i64),
+ ISD::SETGE, A64cc,
+ DAG, dl);
+
+ SDValue Lo = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
+ DAG.getConstant(0, Tmp4.getValueType()), Tmp4,
+ A64cc);
+ SDValue Hi = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
+ Tmp3, FalseVal, A64cc);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, 2, dl);
}
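LowerShiftLeftParts/LowerShiftRightParts above express an i128 shift on two i64 halves, using SELECT_CC rather than a branch for the amount-greater-than-63 case. A standalone sketch of the left-shift semantics (hypothetical helper, assumes 0 <= Amt < 128):

    #include <cstdint>
    #include <utility>

    // Shift a 128-bit value held as {Lo, Hi} left by Amt.
    std::pair<uint64_t, uint64_t> shl128(uint64_t Lo, uint64_t Hi, unsigned Amt) {
      if (Amt == 0)
        return {Lo, Hi};             // avoid the undefined 64-bit shift below
      if (Amt < 64) {
        uint64_t NewHi = (Hi << Amt) | (Lo >> (64 - Amt));
        return {Lo << Amt, NewHi};   // the "ExtraShAmt < 0" side of the selects
      }
      return {0, Lo << (Amt - 64)};  // the "ExtraShAmt >= 0" side
    }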
// If this is a case we can't handle, return null and let the default
@@ -4027,9 +4719,7 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
if (ValueCounts.size() == 0)
return DAG.getUNDEF(VT);
- // Loads are better lowered with insert_vector_elt.
- // Keep going if we are hitting this case.
- if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
+ if (isOnlyLowElement)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
@@ -4042,10 +4732,60 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// just use DUPLANE. We can only do this if the lane being extracted
// is at a constant index, as the DUP from lane instructions only have
// constant-index forms.
- if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- isa<ConstantSDNode>(Value->getOperand(1))) {
- N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT,
- Value->getOperand(0), Value->getOperand(1));
+ //
+ // If there is a TRUNCATE between EXTRACT_VECTOR_ELT and DUP, we can
+    // remove the TRUNCATE for DUPLANE by updating the source vector to an
+    // appropriate vector type and lane index.
+ //
+    // FIXME: v1i8, v1i16 and v1i32 are currently legal vector types; if they
+    // become illegal, the check below that the source type is at least 64
+    // bits wide will no longer be needed.
+ SDValue V = Value;
+ if (Value->getOpcode() == ISD::TRUNCATE)
+ V = Value->getOperand(0);
+ if (V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(V->getOperand(1)) &&
+ V->getOperand(0).getValueType().getSizeInBits() >= 64) {
+
+ // If the element size of source vector is larger than DUPLANE
+ // element size, we can do transformation by,
+ // 1) bitcasting source register to smaller element vector
+ // 2) mutiplying the lane index by SrcEltSize/ResEltSize
+ // For example, we can lower
+ // "v8i16 vdup_lane(v4i32, 1)"
+ // to be
+ // "v8i16 vdup_lane(v8i16 bitcast(v4i32), 2)".
+ SDValue SrcVec = V->getOperand(0);
+ unsigned SrcEltSize =
+ SrcVec.getValueType().getVectorElementType().getSizeInBits();
+ unsigned ResEltSize = VT.getVectorElementType().getSizeInBits();
+ if (SrcEltSize > ResEltSize) {
+ assert((SrcEltSize % ResEltSize == 0) && "Invalid element size");
+ SDValue BitCast;
+ unsigned SrcSize = SrcVec.getValueType().getSizeInBits();
+ unsigned ResSize = VT.getSizeInBits();
+
+ if (SrcSize > ResSize) {
+ assert((SrcSize % ResSize == 0) && "Invalid vector size");
+ EVT CastVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ SrcSize / ResEltSize);
+ BitCast = DAG.getNode(ISD::BITCAST, DL, CastVT, SrcVec);
+ } else {
+ assert((SrcSize == ResSize) && "Invalid vector size of source vec");
+ BitCast = DAG.getNode(ISD::BITCAST, DL, VT, SrcVec);
+ }
+
+ unsigned LaneIdx = V->getConstantOperandVal(1);
+ SDValue Lane =
+ DAG.getConstant((SrcEltSize / ResEltSize) * LaneIdx, MVT::i64);
+ N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, BitCast, Lane);
+ } else {
+ assert((SrcEltSize == ResEltSize) &&
+ "Invalid element size of source vec");
+ N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, V->getOperand(0),
+ V->getOperand(1));
+ }
} else
N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
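The bitcast-and-rescale trick above only changes how the lane index is counted: the same register viewed with narrower elements has proportionally more lanes. A tiny standalone sketch of the index computation (hypothetical helper name):

    // Lane LaneIdx of a vector with SrcEltBits-wide elements corresponds to lane
    // (SrcEltBits / DstEltBits) * LaneIdx once the register is reinterpreted with
    // DstEltBits-wide elements, e.g. lane 1 of a v4i32 becomes lane 2 of a v8i16.
    unsigned remapDupLane(unsigned SrcEltBits, unsigned DstEltBits,
                          unsigned LaneIdx) {
      return (SrcEltBits / DstEltBits) * LaneIdx; // assumes SrcEltBits % DstEltBits == 0
    }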
@@ -4075,9 +4815,31 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
return SDValue();
// Try to lower this in lowering ShuffleVector way.
- SDValue Shuf;
- if (isKnownShuffleVector(Op, DAG, Shuf))
- return Shuf;
+ SDValue V0, V1;
+ int Mask[16];
+ if (isKnownShuffleVector(Op, DAG, V0, V1, Mask)) {
+ unsigned V0NumElts = V0.getValueType().getVectorNumElements();
+ if (!V1.getNode() && V0NumElts == NumElts * 2) {
+ V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
+ DAG.getConstant(NumElts, MVT::i64));
+ V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
+ DAG.getConstant(0, MVT::i64));
+ V0NumElts = V0.getValueType().getVectorNumElements();
+ }
+
+ if (V1.getNode() && NumElts == V0NumElts &&
+ V0NumElts == V1.getValueType().getVectorNumElements()) {
+ SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask);
+ if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
+ return Shuffle;
+ else
+ return LowerVECTOR_SHUFFLE(Shuffle, DAG);
+ } else {
+ SDValue Res;
+ if (isConcatVector(Op, DAG, V0, V1, Mask, Res))
+ return Res;
+ }
+ }
// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
// know the default expansion would otherwise fall back on something even
@@ -4131,7 +4893,7 @@ static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
// isPermuteMask - Check whether the vector shuffle matches to UZP, ZIP and
// TRN instruction.
-static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) {
+static unsigned isPermuteMask(ArrayRef<int> M, EVT VT, bool isV2undef) {
unsigned NumElts = VT.getVectorNumElements();
if (NumElts < 4)
return 0;
@@ -4140,7 +4902,10 @@ static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) {
// Check UZP1
for (unsigned i = 0; i < NumElts; ++i) {
- if ((unsigned)M[i] != i * 2) {
+ unsigned answer = i * 2;
+ if (isV2undef && answer >= NumElts)
+ answer -= NumElts;
+ if (M[i] != -1 && (unsigned)M[i] != answer) {
ismatch = false;
break;
}
@@ -4151,7 +4916,10 @@ static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) {
// Check UZP2
ismatch = true;
for (unsigned i = 0; i < NumElts; ++i) {
- if ((unsigned)M[i] != i * 2 + 1) {
+ unsigned answer = i * 2 + 1;
+ if (isV2undef && answer >= NumElts)
+ answer -= NumElts;
+ if (M[i] != -1 && (unsigned)M[i] != answer) {
ismatch = false;
break;
}
@@ -4162,7 +4930,10 @@ static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) {
// Check ZIP1
ismatch = true;
for (unsigned i = 0; i < NumElts; ++i) {
- if ((unsigned)M[i] != i / 2 + NumElts * (i % 2)) {
+ unsigned answer = i / 2 + NumElts * (i % 2);
+ if (isV2undef && answer >= NumElts)
+ answer -= NumElts;
+ if (M[i] != -1 && (unsigned)M[i] != answer) {
ismatch = false;
break;
}
@@ -4173,7 +4944,10 @@ static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) {
// Check ZIP2
ismatch = true;
for (unsigned i = 0; i < NumElts; ++i) {
- if ((unsigned)M[i] != (NumElts + i) / 2 + NumElts * (i % 2)) {
+ unsigned answer = (NumElts + i) / 2 + NumElts * (i % 2);
+ if (isV2undef && answer >= NumElts)
+ answer -= NumElts;
+ if (M[i] != -1 && (unsigned)M[i] != answer) {
ismatch = false;
break;
}
@@ -4184,7 +4958,10 @@ static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) {
// Check TRN1
ismatch = true;
for (unsigned i = 0; i < NumElts; ++i) {
- if ((unsigned)M[i] != i + (NumElts - 1) * (i % 2)) {
+ unsigned answer = i + (NumElts - 1) * (i % 2);
+ if (isV2undef && answer >= NumElts)
+ answer -= NumElts;
+ if (M[i] != -1 && (unsigned)M[i] != answer) {
ismatch = false;
break;
}
@@ -4195,7 +4972,10 @@ static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) {
// Check TRN2
ismatch = true;
for (unsigned i = 0; i < NumElts; ++i) {
- if ((unsigned)M[i] != 1 + i + (NumElts - 1) * (i % 2)) {
+ unsigned answer = 1 + i + (NumElts - 1) * (i % 2);
+ if (isV2undef && answer >= NumElts)
+ answer -= NumElts;
+ if (M[i] != -1 && (unsigned)M[i] != answer) {
ismatch = false;
break;
}
@@ -4232,9 +5012,22 @@ AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
if (isREVMask(ShuffleMask, VT, 16))
return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1);
- unsigned ISDNo = isPermuteMask(ShuffleMask, VT);
- if (ISDNo)
- return DAG.getNode(ISDNo, dl, VT, V1, V2);
+ unsigned ISDNo;
+ if (V2.getOpcode() == ISD::UNDEF)
+ ISDNo = isPermuteMask(ShuffleMask, VT, true);
+ else
+ ISDNo = isPermuteMask(ShuffleMask, VT, false);
+
+ if (ISDNo) {
+ if (V2.getOpcode() == ISD::UNDEF)
+ return DAG.getNode(ISDNo, dl, VT, V1, V1);
+ else
+ return DAG.getNode(ISDNo, dl, VT, V1, V2);
+ }
+
+ SDValue Res;
+ if (isConcatVector(Op, DAG, V1, V2, &ShuffleMask[0], Res))
+ return Res;
  // If the elements of the shuffle mask are all the same constant, we can
// transform it into either NEON_VDUP or NEON_VDUPLANE
@@ -4289,22 +5082,28 @@ AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// it into NEON_VEXTRACT.
if (V1EltNum == Length) {
// Check if the shuffle mask is sequential.
- bool IsSequential = true;
- int CurMask = ShuffleMask[0];
- for (int I = 0; I < Length; ++I) {
- if (ShuffleMask[I] != CurMask) {
- IsSequential = false;
- break;
- }
- CurMask++;
+ int SkipUndef = 0;
+ while (ShuffleMask[SkipUndef] == -1) {
+ SkipUndef++;
}
- if (IsSequential) {
- assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect");
- unsigned VecSize = EltSize * V1EltNum;
- unsigned Index = (EltSize/8) * ShuffleMask[0];
- if (VecSize == 64 || VecSize == 128)
- return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2,
- DAG.getConstant(Index, MVT::i64));
+ int CurMask = ShuffleMask[SkipUndef];
+ if (CurMask >= SkipUndef) {
+ bool IsSequential = true;
+ for (int I = SkipUndef; I < Length; ++I) {
+ if (ShuffleMask[I] != -1 && ShuffleMask[I] != CurMask) {
+ IsSequential = false;
+ break;
+ }
+ CurMask++;
+ }
+ if (IsSequential) {
+ assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect");
+ unsigned VecSize = EltSize * V1EltNum;
+ unsigned Index = (EltSize / 8) * (ShuffleMask[SkipUndef] - SkipUndef);
+ if (VecSize == 64 || VecSize == 128)
+ return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2,
+ DAG.getConstant(Index, MVT::i64));
+ }
}
}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 8ad5a79..e946b25 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -18,8 +18,8 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/Target/TargetLowering.h"
namespace llvm {
namespace AArch64ISD {
@@ -113,9 +113,6 @@ namespace AArch64ISD {
// get selected.
WrapperSmall,
- // Vector bitwise select
- NEON_BSL,
-
// Vector move immediate
NEON_MOVIMM,
@@ -224,6 +221,8 @@ public:
const SmallVectorImpl<SDValue> &OutVals,
SDLoc dl, SelectionDAG &DAG) const;
+ virtual unsigned getByValTypeAlignment(Type *Ty) const override;
+
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
@@ -233,7 +232,14 @@ public:
SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
- bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &Res) const;
+ SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+
+ bool isConcatVector(SDValue Op, SelectionDAG &DAG, SDValue V0, SDValue V1,
+ const int *Mask, SDValue &Res) const;
+
+ bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &V0,
+ SDValue &V1, int *Mask) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
const AArch64Subtarget *ST) const;
@@ -309,6 +315,8 @@ public:
SDValue LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL,
SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -341,7 +349,7 @@ public:
getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
- unsigned Intrinsic) const LLVM_OVERRIDE;
+ unsigned Intrinsic) const override;
protected:
std::pair<const TargetRegisterClass*, uint8_t>
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 34f917c..4cc3813 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -346,15 +346,13 @@ class A64I_dp3<bit sf, bits<6> opcode,
dag outs, dag ins, string asmstr,
list<dag> patterns, InstrItinClass itin>
: A64InstRdnm<outs, ins, asmstr, patterns, itin> {
- bits<5> Ra;
-
let Inst{31} = sf;
let Inst{30-29} = opcode{5-4};
let Inst{28-24} = 0b11011;
let Inst{23-21} = opcode{3-1};
// Inherits Rm in 20-16
let Inst{15} = opcode{0};
- let Inst{14-10} = Ra;
+ // {14-10} mostly Ra, but unspecified for SMULH/UMULH
// Inherits Rn in 9-5
// Inherits Rd in 4-0
}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 180110a..afb2034 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -26,7 +26,6 @@
#include "llvm/IR/Function.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
-
#include <algorithm>
#define GET_INSTRINFO_CTOR_DTOR
@@ -133,8 +132,29 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addImm(16);
return;
}
+ } else if (AArch64::FPR8RegClass.contains(DestReg, SrcReg)) {
+ // A copy of two FPR8 registers is implemented as a copy of the containing FPR32 registers
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ unsigned Dst = TRI->getMatchingSuperReg(DestReg, AArch64::sub_8,
+ &AArch64::FPR32RegClass);
+ unsigned Src = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_8,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVss), Dst)
+ .addReg(Src);
+ return;
+ } else if (AArch64::FPR16RegClass.contains(DestReg, SrcReg)) {
+ // A copy of two FPR16 registers is implemented as a copy of the containing FPR32 registers
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ unsigned Dst = TRI->getMatchingSuperReg(DestReg, AArch64::sub_16,
+ &AArch64::FPR32RegClass);
+ unsigned Src = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_16,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVss), Dst)
+ .addReg(Src);
+ return;
} else {
- llvm_unreachable("Unknown register class in copyPhysReg");
+ CopyPhysRegTuple(MBB, I, DL, DestReg, SrcReg);
+ return;
}
// E.g. ORR xDst, xzr, xSrc, lsl #0
@@ -144,6 +164,55 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addImm(0);
}
+void AArch64InstrInfo::CopyPhysRegTuple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ DebugLoc DL, unsigned DestReg,
+ unsigned SrcReg) const {
+ unsigned SubRegs;
+ bool IsQRegs;
+ if (AArch64::DPairRegClass.contains(DestReg, SrcReg)) {
+ SubRegs = 2;
+ IsQRegs = false;
+ } else if (AArch64::DTripleRegClass.contains(DestReg, SrcReg)) {
+ SubRegs = 3;
+ IsQRegs = false;
+ } else if (AArch64::DQuadRegClass.contains(DestReg, SrcReg)) {
+ SubRegs = 4;
+ IsQRegs = false;
+ } else if (AArch64::QPairRegClass.contains(DestReg, SrcReg)) {
+ SubRegs = 2;
+ IsQRegs = true;
+ } else if (AArch64::QTripleRegClass.contains(DestReg, SrcReg)) {
+ SubRegs = 3;
+ IsQRegs = true;
+ } else if (AArch64::QQuadRegClass.contains(DestReg, SrcReg)) {
+ SubRegs = 4;
+ IsQRegs = true;
+ } else
+ llvm_unreachable("Unknown register class");
+
+ unsigned BeginIdx = IsQRegs ? AArch64::qsub_0 : AArch64::dsub_0;
+ int Spacing = 1;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ // Copy register tuples backward when the first Dest reg overlaps
+ // with SrcReg.
+ if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
+ BeginIdx = BeginIdx + (SubRegs - 1);
+ Spacing = -1;
+ }
+
+ unsigned Opc = IsQRegs ? AArch64::ORRvvv_16B : AArch64::ORRvvv_8B;
+ for (unsigned i = 0; i != SubRegs; ++i) {
+ unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
+ unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
+ assert(Dst && Src && "Bad sub-register");
+ BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
+ .addReg(Src)
+ .addReg(Src);
+ }
+ return;
+}
+
/// Does the Opcode represent a conditional branch that we can remove and re-add
/// at the end of a basic block?
static bool isCondBranch(unsigned Opc) {
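
CopyPhysRegTuple above lowers a copy between D- or Q-register tuples into one ORR per sub-register, and it reverses the copy order whenever the first destination sub-register overlaps the source tuple, so no lane is clobbered before it has been read. The same ordering rule reduced to plain arrays, as an illustration only (the real code builds MachineInstrs and queries TRI->regsOverlap):

    #include <cstdio>

    // Copy SubRegs adjacent "sub-registers" from Src to Dst, where the two
    // ranges may overlap (as D1_D2_D3 overlaps D2_D3_D4). If the first
    // destination slot aliases the source range, copy backwards.
    static void copyTuple(int *Dst, const int *Src, unsigned SubRegs) {
      int Begin = 0, Spacing = 1;
      if (Dst >= Src && Dst < Src + SubRegs) {
        Begin = static_cast<int>(SubRegs) - 1;
        Spacing = -1;
      }
      for (unsigned i = 0; i != SubRegs; ++i)
        Dst[Begin + static_cast<int>(i) * Spacing] =
            Src[Begin + static_cast<int>(i) * Spacing];
    }

    int main() {
      int Regs[5] = {10, 11, 12, 13, 14};
      copyTuple(&Regs[1], &Regs[0], 3);          // overlapping tuple copy
      std::printf("%d %d %d\n", Regs[1], Regs[2], Regs[3]);   // 10 11 12
      return 0;
    }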
@@ -418,10 +487,12 @@ AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
default:
llvm_unreachable("Unknown size for regclass");
}
- } else {
- assert((RC->hasType(MVT::f32) || RC->hasType(MVT::f64) ||
- RC->hasType(MVT::f128))
- && "Expected integer or floating type for store");
+ } else if (AArch64::FPR8RegClass.hasSubClassEq(RC)) {
+ StoreOp = AArch64::LSFP8_STR;
+ } else if (AArch64::FPR16RegClass.hasSubClassEq(RC)) {
+ StoreOp = AArch64::LSFP16_STR;
+ } else if (RC->hasType(MVT::f32) || RC->hasType(MVT::f64) ||
+ RC->hasType(MVT::f128)) {
switch (RC->getSize()) {
case 4: StoreOp = AArch64::LSFP32_STR; break;
case 8: StoreOp = AArch64::LSFP64_STR; break;
@@ -429,6 +500,28 @@ AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
default:
llvm_unreachable("Unknown size for regclass");
}
+ } else { // For a super register class with more than one sub-register
+ if (AArch64::DPairRegClass.hasSubClassEq(RC))
+ StoreOp = AArch64::ST1x2_8B;
+ else if (AArch64::DTripleRegClass.hasSubClassEq(RC))
+ StoreOp = AArch64::ST1x3_8B;
+ else if (AArch64::DQuadRegClass.hasSubClassEq(RC))
+ StoreOp = AArch64::ST1x4_8B;
+ else if (AArch64::QPairRegClass.hasSubClassEq(RC))
+ StoreOp = AArch64::ST1x2_16B;
+ else if (AArch64::QTripleRegClass.hasSubClassEq(RC))
+ StoreOp = AArch64::ST1x3_16B;
+ else if (AArch64::QQuadRegClass.hasSubClassEq(RC))
+ StoreOp = AArch64::ST1x4_16B;
+ else
+ llvm_unreachable("Unknown reg class");
+
+ MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(StoreOp));
+ // Vector store has different operands from other store instructions.
+ NewMI.addFrameIndex(FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO);
+ return;
}
MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(StoreOp));
@@ -464,10 +557,12 @@ AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
default:
llvm_unreachable("Unknown size for regclass");
}
- } else {
- assert((RC->hasType(MVT::f32) || RC->hasType(MVT::f64)
- || RC->hasType(MVT::f128))
- && "Expected integer or floating type for store");
+ } else if (AArch64::FPR8RegClass.hasSubClassEq(RC)) {
+ LoadOp = AArch64::LSFP8_LDR;
+ } else if (AArch64::FPR16RegClass.hasSubClassEq(RC)) {
+ LoadOp = AArch64::LSFP16_LDR;
+ } else if (RC->hasType(MVT::f32) || RC->hasType(MVT::f64) ||
+ RC->hasType(MVT::f128)) {
switch (RC->getSize()) {
case 4: LoadOp = AArch64::LSFP32_LDR; break;
case 8: LoadOp = AArch64::LSFP64_LDR; break;
@@ -475,6 +570,27 @@ AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
default:
llvm_unreachable("Unknown size for regclass");
}
+ } else { // For a super register class with more than one sub-register
+ if (AArch64::DPairRegClass.hasSubClassEq(RC))
+ LoadOp = AArch64::LD1x2_8B;
+ else if (AArch64::DTripleRegClass.hasSubClassEq(RC))
+ LoadOp = AArch64::LD1x3_8B;
+ else if (AArch64::DQuadRegClass.hasSubClassEq(RC))
+ LoadOp = AArch64::LD1x4_8B;
+ else if (AArch64::QPairRegClass.hasSubClassEq(RC))
+ LoadOp = AArch64::LD1x2_16B;
+ else if (AArch64::QTripleRegClass.hasSubClassEq(RC))
+ LoadOp = AArch64::LD1x3_16B;
+ else if (AArch64::QQuadRegClass.hasSubClassEq(RC))
+ LoadOp = AArch64::LD1x4_16B;
+ else
+ llvm_unreachable("Unknown reg class");
+
+ MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg);
+ // Vector load has different operands from other load instructions.
+ NewMI.addFrameIndex(FrameIdx)
+ .addMemOperand(MMO);
+ return;
}
MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg);
@@ -513,7 +629,8 @@ void AArch64InstrInfo::getAddressConstraints(const MachineInstr &MI,
int &AccessScale, int &MinOffset,
int &MaxOffset) const {
switch (MI.getOpcode()) {
- default: llvm_unreachable("Unkown load/store kind");
+ default:
+ llvm_unreachable("Unknown load/store kind");
case TargetOpcode::DBG_VALUE:
AccessScale = 1;
MinOffset = INT_MIN;
@@ -572,6 +689,32 @@ void AArch64InstrInfo::getAddressConstraints(const MachineInstr &MI,
MinOffset = -0x40 * AccessScale;
MaxOffset = 0x3f * AccessScale;
return;
+ case AArch64::LD1x2_8B: case AArch64::ST1x2_8B:
+ AccessScale = 16;
+ MinOffset = 0;
+ MaxOffset = 0xfff * AccessScale;
+ return;
+ case AArch64::LD1x3_8B: case AArch64::ST1x3_8B:
+ AccessScale = 24;
+ MinOffset = 0;
+ MaxOffset = 0xfff * AccessScale;
+ return;
+ case AArch64::LD1x4_8B: case AArch64::ST1x4_8B:
+ case AArch64::LD1x2_16B: case AArch64::ST1x2_16B:
+ AccessScale = 32;
+ MinOffset = 0;
+ MaxOffset = 0xfff * AccessScale;
+ return;
+ case AArch64::LD1x3_16B: case AArch64::ST1x3_16B:
+ AccessScale = 48;
+ MinOffset = 0;
+ MaxOffset = 0xfff * AccessScale;
+ return;
+ case AArch64::LD1x4_16B: case AArch64::ST1x4_16B:
+ AccessScale = 64;
+ MinOffset = 0;
+ MaxOffset = 0xfff * AccessScale;
+ return;
}
}
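
The new LD1x*/ST1x* cases above report their addressing limits in the same form as the existing loads and stores: an immediate offset is only usable if it is a multiple of AccessScale and lies inside [MinOffset, MaxOffset]. A small sketch of that legality test as it would typically be applied when folding a frame-index offset (the helper name is hypothetical):

    #include <cstdint>

    // Hypothetical consumer of getAddressConstraints' outputs: is this
    // immediate offset encodable for the instruction that produced them?
    // e.g. LD1x3_8B has AccessScale 24, so 48 is legal but 50 is not.
    static bool isLegalOffset(std::int64_t Offset, int AccessScale,
                              int MinOffset, int MaxOffset) {
      return Offset % AccessScale == 0 &&
             Offset >= MinOffset && Offset <= MaxOffset;
    }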
@@ -587,18 +730,15 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
if (MI.getOpcode() == AArch64::INLINEASM)
return getInlineAsmLength(MI.getOperand(0).getSymbolName(), MAI);
- if (MI.isLabel())
- return 0;
-
switch (MI.getOpcode()) {
case TargetOpcode::BUNDLE:
return getInstBundleLength(MI);
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
- case TargetOpcode::PROLOG_LABEL:
+ case TargetOpcode::CFI_INSTRUCTION:
case TargetOpcode::EH_LABEL:
+ case TargetOpcode::GC_LABEL:
case TargetOpcode::DBG_VALUE:
- return 0;
case AArch64::TLSDESCCALL:
return 0;
default:
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 620ecc9..ad20f9c 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -14,8 +14,8 @@
#ifndef LLVM_TARGET_AARCH64INSTRINFO_H
#define LLVM_TARGET_AARCH64INSTRINFO_H
-#include "llvm/Target/TargetInstrInfo.h"
#include "AArch64RegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "AArch64GenInstrInfo.inc"
@@ -42,6 +42,9 @@ public:
MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const;
+ void CopyPhysRegTuple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg) const;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 23d81fc..7d7a641 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -27,6 +27,15 @@ def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
include "AArch64InstrFormats.td"
//===----------------------------------------------------------------------===//
+// AArch64 specific pattern fragments.
+//
+// An 'fmul' node with a single use.
+def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
+ return N->hasOneUse();
+}]>;
+
+
+//===----------------------------------------------------------------------===//
// Target-specific ISD nodes and profiles
//===----------------------------------------------------------------------===//
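
fmul_su only matches a multiply whose result has a single use. The fused-MAC patterns later in this file are rewritten to use it so that an fmul still needed elsewhere is not folded into an FMADD/FMSUB, which would otherwise force the product to be computed twice. The same policy in miniature, using a toy node type rather than the SelectionDAG API (all names here are illustrative):

    // Toy DAG node: just enough to express "fold the multiply into an FMA
    // only when nothing else reads its result".
    struct Node {
      enum Kind { FMul, FAdd, Other } kind;
      const Node *lhs = nullptr;
      const Node *rhs = nullptr;
      unsigned uses = 0;               // number of nodes reading this value
    };

    // Mirrors fmul_su + the UseFusedMAC patterns: an fadd of a single-use
    // fmul may become one fused multiply-add.
    static bool canFuseIntoFMA(const Node &Add) {
      if (Add.kind != Node::FAdd)
        return false;
      const Node *Mul = Add.rhs;
      return Mul && Mul->kind == Node::FMul && Mul->uses == 1;
    }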
@@ -341,33 +350,39 @@ multiclass addsub_exts<bit sf, bit op, bit S, string asmop,
outs, (ins exts.GPR:$Rn, GPR32:$Rm, UXTB_operand:$Imm3),
!strconcat(asmop, "$Rn, $Rm, $Imm3"),
[(opfrag exts.ty:$Rn, (shl exts.uxtb, UXTB_operand:$Imm3))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def w_uxth : A64I_addsubext<sf, op, S, 0b00, 0b001,
outs, (ins exts.GPR:$Rn, GPR32:$Rm, UXTH_operand:$Imm3),
!strconcat(asmop, "$Rn, $Rm, $Imm3"),
[(opfrag exts.ty:$Rn, (shl exts.uxth, UXTH_operand:$Imm3))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def w_uxtw : A64I_addsubext<sf, op, S, 0b00, 0b010,
outs, (ins exts.GPR:$Rn, GPR32:$Rm, UXTW_operand:$Imm3),
!strconcat(asmop, "$Rn, $Rm, $Imm3"),
[(opfrag exts.ty:$Rn, (shl exts.uxtw, UXTW_operand:$Imm3))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def w_sxtb : A64I_addsubext<sf, op, S, 0b00, 0b100,
outs, (ins exts.GPR:$Rn, GPR32:$Rm, SXTB_operand:$Imm3),
!strconcat(asmop, "$Rn, $Rm, $Imm3"),
[(opfrag exts.ty:$Rn, (shl exts.sxtb, SXTB_operand:$Imm3))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def w_sxth : A64I_addsubext<sf, op, S, 0b00, 0b101,
outs, (ins exts.GPR:$Rn, GPR32:$Rm, SXTH_operand:$Imm3),
!strconcat(asmop, "$Rn, $Rm, $Imm3"),
[(opfrag exts.ty:$Rn, (shl exts.sxth, SXTH_operand:$Imm3))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def w_sxtw : A64I_addsubext<sf, op, S, 0b00, 0b110,
outs, (ins exts.GPR:$Rn, GPR32:$Rm, SXTW_operand:$Imm3),
!strconcat(asmop, "$Rn, $Rm, $Imm3"),
[(opfrag exts.ty:$Rn, (shl exts.sxtw, SXTW_operand:$Imm3))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
}
// These two could be merged in with the above, but their patterns aren't really
@@ -379,30 +394,32 @@ multiclass addsub_xxtx<bit op, bit S, string asmop, SDPatternOperator opfrag,
(ins GPR64xsp:$Rn, GPR64:$Rm, UXTX_operand:$Imm3),
!strconcat(asmop, "$Rn, $Rm, $Imm3"),
[(opfrag i64:$Rn, (shl i64:$Rm, UXTX_operand:$Imm3))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def x_sxtx : A64I_addsubext<0b1, op, S, 0b00, 0b111,
outs,
(ins GPR64xsp:$Rn, GPR64:$Rm, SXTX_operand:$Imm3),
!strconcat(asmop, "$Rn, $Rm, $Imm3"),
[/* No Pattern: same as uxtx */],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
}
multiclass addsub_wxtx<bit op, bit S, string asmop, dag outs> {
def w_uxtx : A64I_addsubext<0b0, op, S, 0b00, 0b011,
- outs,
- (ins GPR32wsp:$Rn, GPR32:$Rm, UXTX_operand:$Imm3),
- !strconcat(asmop, "$Rn, $Rm, $Imm3"),
- [/* No pattern: probably same as uxtw */],
- NoItinerary>;
+ outs, (ins GPR32wsp:$Rn, GPR32:$Rm, UXTX_operand:$Imm3),
+ !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+ [/* No pattern: probably same as uxtw */],
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def w_sxtx : A64I_addsubext<0b0, op, S, 0b00, 0b111,
- outs,
- (ins GPR32wsp:$Rn, GPR32:$Rm, SXTX_operand:$Imm3),
- !strconcat(asmop, "$Rn, $Rm, $Imm3"),
- [/* No Pattern: probably same as uxtw */],
- NoItinerary>;
+ outs, (ins GPR32wsp:$Rn, GPR32:$Rm, SXTX_operand:$Imm3),
+ !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+ [/* No Pattern: probably same as uxtw */],
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
}
class SetRD<RegisterClass RC, SDPatternOperator op>
@@ -446,7 +463,7 @@ defm SUBSww :addsub_exts<0b0, 0b1, 0b1, "subs\t$Rd, ", SetRD<GPR32, subc>,
(outs GPR32:$Rd)>;
-let Rd = 0b11111, isCompare = 1 in {
+let SchedRW = [WriteCMP, ReadCMP, ReadCMP], Rd = 0b11111, isCompare = 1 in {
defm CMNx : addsub_exts<0b1, 0b0, 0b1, "cmn\t", SetNZCV<A64cmn>,
(outs), extends_to_i64>,
addsub_xxtx< 0b0, 0b1, "cmn\t", SetNZCV<A64cmn>, (outs)>;
@@ -648,7 +665,8 @@ multiclass addsubimm_varieties<string prefix, bit sf, bit op, bits<2> shift,
(ins GPRsp:$Rn, imm_operand:$Imm12),
!strconcat(asmop, "\t$Rd, $Rn, $Imm12"),
[(set Ty:$Rd, (add Ty:$Rn, imm_operand:$Imm12))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU]>;
// S variants can read SP but would write to ZR
@@ -657,7 +675,8 @@ multiclass addsubimm_varieties<string prefix, bit sf, bit op, bits<2> shift,
(ins GPRsp:$Rn, imm_operand:$Imm12),
!strconcat(asmop, "s\t$Rd, $Rn, $Imm12"),
[(set Ty:$Rd, (addc Ty:$Rn, imm_operand:$Imm12))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU]> {
let Defs = [NZCV];
}
@@ -669,7 +688,8 @@ multiclass addsubimm_varieties<string prefix, bit sf, bit op, bits<2> shift,
!strconcat(cmpasmop, " $Rn, $Imm12"),
[(set NZCV,
(A64cmp Ty:$Rn, cmp_imm_operand:$Imm12))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteCMP, ReadCMP]> {
let Rd = 0b11111;
let Defs = [NZCV];
let isCompare = 1;
@@ -731,7 +751,7 @@ defm MOVww : MOVsp<GPR32wsp, Rwsp, ADDwwi_lsl0_s>;
// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, NEG, NEGS
//===-------------------------------
-// 1. The "shifed register" operands. Shared with logical insts.
+// 1. The "shifted register" operands. Shared with logical insts.
//===-------------------------------
multiclass shift_operands<string prefix, string form> {
@@ -791,7 +811,8 @@ multiclass addsub_shifts<string prefix, bit sf, bit op, bit s, bit commutable,
[(set GPR:$Rd, (opfrag ty:$Rn, (shl ty:$Rm,
!cast<Operand>("lsl_operand_" # ty):$Imm6))
)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU]>;
def _lsr : A64I_addsubshift<sf, op, s, 0b01,
(outs GPR:$Rd),
@@ -801,7 +822,8 @@ multiclass addsub_shifts<string prefix, bit sf, bit op, bit s, bit commutable,
[(set ty:$Rd, (opfrag ty:$Rn, (srl ty:$Rm,
!cast<Operand>("lsr_operand_" # ty):$Imm6))
)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU]>;
def _asr : A64I_addsubshift<sf, op, s, 0b10,
(outs GPR:$Rd),
@@ -811,7 +833,8 @@ multiclass addsub_shifts<string prefix, bit sf, bit op, bit s, bit commutable,
[(set ty:$Rd, (opfrag ty:$Rn, (sra ty:$Rm,
!cast<Operand>("asr_operand_" # ty):$Imm6))
)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU]>;
}
def _noshift
@@ -897,7 +920,8 @@ multiclass cmp_shifts<string prefix, bit sf, bit op, bit commutable,
[(set NZCV, (opfrag ty:$Rn, (shl ty:$Rm,
!cast<Operand>("lsl_operand_" # ty):$Imm6))
)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteCMP, ReadCMP, ReadCMP]>;
def _lsr : A64I_addsubshift<sf, op, 0b1, 0b01,
(outs),
@@ -907,7 +931,8 @@ multiclass cmp_shifts<string prefix, bit sf, bit op, bit commutable,
[(set NZCV, (opfrag ty:$Rn, (srl ty:$Rm,
!cast<Operand>("lsr_operand_" # ty):$Imm6))
)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteCMP, ReadCMP, ReadCMP]>;
def _asr : A64I_addsubshift<sf, op, 0b1, 0b10,
(outs),
@@ -917,7 +942,8 @@ multiclass cmp_shifts<string prefix, bit sf, bit op, bit commutable,
[(set NZCV, (opfrag ty:$Rn, (sra ty:$Rm,
!cast<Operand>("asr_operand_" # ty):$Imm6))
)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteCMP, ReadCMP, ReadCMP]>;
}
def _noshift
@@ -944,12 +970,14 @@ multiclass A64I_addsubcarrySizes<bit op, bit s, string asmop> {
def www : A64I_addsubcarry<0b0, op, s, 0b000000,
(outs GPR32:$Rd), (ins GPR32:$Rn, GPR32:$Rm),
!strconcat(asmop, "\t$Rd, $Rn, $Rm"),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def xxx : A64I_addsubcarry<0b1, op, s, 0b000000,
(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
!strconcat(asmop, "\t$Rd, $Rn, $Rm"),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
}
}
@@ -1035,14 +1063,16 @@ multiclass A64I_bitfieldSizes<bits<2> opc, string asmop> {
def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
(ins GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS),
!strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteALU, ReadALU]> {
let DecoderMethod = "DecodeBitfieldInstruction";
}
def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
(ins GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS),
!strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteALU, ReadALU]> {
let DecoderMethod = "DecodeBitfieldInstruction";
}
}
@@ -1055,7 +1085,8 @@ defm UBFM : A64I_bitfieldSizes<0b10, "ubfm">;
def BFMwwii :
A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
(ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS),
- "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+ "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
let DecoderMethod = "DecodeBitfieldInstruction";
let Constraints = "$src = $Rd";
}
@@ -1063,7 +1094,8 @@ def BFMwwii :
def BFMxxii :
A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
(ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS),
- "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+ "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
let DecoderMethod = "DecodeBitfieldInstruction";
let Constraints = "$src = $Rd";
}
@@ -1085,7 +1117,8 @@ class A64I_bf_ext<bit sf, bits<2> opc, RegisterClass GPRDest, ValueType dty,
: A64I_bitfield<sf, opc, sf,
(outs GPRDest:$Rd), (ins GPR32:$Rn),
!strconcat(asmop, "\t$Rd, $Rn"),
- [(set dty:$Rd, pattern)], NoItinerary> {
+ [(set dty:$Rd, pattern)], NoItinerary>,
+ Sched<[WriteALU, ReadALU]> {
let ImmR = 0b000000;
let ImmS = imms;
}
@@ -1139,7 +1172,8 @@ multiclass A64I_shift<bits<2> opc, string asmop, SDNode opnode> {
(outs GPR32:$Rd), (ins GPR32:$Rn, bitfield32_imm:$ImmR),
!strconcat(asmop, "\t$Rd, $Rn, $ImmR"),
[(set i32:$Rd, (opnode i32:$Rn, bitfield32_imm:$ImmR))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU]> {
let ImmS = 31;
}
@@ -1147,7 +1181,8 @@ multiclass A64I_shift<bits<2> opc, string asmop, SDNode opnode> {
(outs GPR64:$Rd), (ins GPR64:$Rn, bitfield64_imm:$ImmR),
!strconcat(asmop, "\t$Rd, $Rn, $ImmR"),
[(set i64:$Rd, (opnode i64:$Rn, bitfield64_imm:$ImmR))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU]> {
let ImmS = 63;
}
@@ -1188,7 +1223,8 @@ class A64I_bitfield_lsl<bit sf, RegisterClass GPR, ValueType ty,
: A64I_bitfield<sf, 0b10, sf, (outs GPR:$Rd), (ins GPR:$Rn, operand:$FullImm),
"lsl\t$Rd, $Rn, $FullImm",
[(set ty:$Rd, (shl ty:$Rn, operand:$FullImm))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU]> {
bits<12> FullImm;
let ImmR = FullImm{5-0};
let ImmS = FullImm{11-6};
@@ -1235,7 +1271,8 @@ multiclass A64I_bitfield_extract<bits<2> opc, string asmop, SDNode op> {
(ins GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS),
!strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
[(set i32:$Rd, (op i32:$Rn, imm:$ImmR, imm:$ImmS))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU]> {
// As above, no disassembler allowed.
let isAsmParserOnly = 1;
}
@@ -1244,7 +1281,8 @@ multiclass A64I_bitfield_extract<bits<2> opc, string asmop, SDNode op> {
(ins GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS),
!strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
[(set i64:$Rd, (op i64:$Rn, imm:$ImmR, imm:$ImmS))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU]> {
// As above, no disassembler allowed.
let isAsmParserOnly = 1;
}
@@ -1255,16 +1293,18 @@ defm UBFX : A64I_bitfield_extract<0b10, "ubfx", A64Ubfx>;
// Again, variants based on BFM modify Rd so need it as an input too.
def BFXILwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
- (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS),
- "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+ (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS),
+ "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
// As above, no disassembler allowed.
let isAsmParserOnly = 1;
let Constraints = "$src = $Rd";
}
def BFXILxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
- (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS),
- "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+ (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS),
+ "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
// As above, no disassembler allowed.
let isAsmParserOnly = 1;
let Constraints = "$src = $Rd";
@@ -1344,7 +1384,8 @@ multiclass A64I_bitfield_insert<bits<2> opc, string asmop> {
def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
(ins GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS),
!strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteALU, ReadALU]> {
// As above, no disassembler allowed.
let isAsmParserOnly = 1;
}
@@ -1352,7 +1393,8 @@ multiclass A64I_bitfield_insert<bits<2> opc, string asmop> {
def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
(ins GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS),
!strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteALU, ReadALU]> {
// As above, no disassembler allowed.
let isAsmParserOnly = 1;
}
@@ -1364,7 +1406,8 @@ defm UBFIZ : A64I_bitfield_insert<0b10, "ubfiz">;
def BFIwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
(ins GPR32:$src, GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS),
- "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+ "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
// As above, no disassembler allowed.
let isAsmParserOnly = 1;
let Constraints = "$src = $Rd";
@@ -1372,7 +1415,8 @@ def BFIwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
def BFIxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
(ins GPR64:$src, GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS),
- "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+ "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
// As above, no disassembler allowed.
let isAsmParserOnly = 1;
let Constraints = "$src = $Rd";
@@ -1409,14 +1453,16 @@ multiclass cmpbr_sizes<bit op, string asmop, ImmLeaf SETOP> {
(ins GPR64:$Rt, bcc_target:$Label),
!strconcat(asmop,"\t$Rt, $Label"),
[(A64br_cc (A64cmp i64:$Rt, 0), SETOP, bb:$Label)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteBr, ReadBr]>;
def w : A64I_cmpbr<0b0, op,
(outs),
(ins GPR32:$Rt, bcc_target:$Label),
!strconcat(asmop,"\t$Rt, $Label"),
[(A64br_cc (A64cmp i32:$Rt, 0), SETOP, bb:$Label)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteBr, ReadBr]>;
}
}
@@ -1447,7 +1493,8 @@ def cond_code : Operand<i32>, ImmLeaf<i32, [{
def Bcc : A64I_condbr<0b0, 0b0, (outs),
(ins cond_code:$Cond, bcc_target:$Label),
"b.$Cond $Label", [(A64br_cc NZCV, (i32 imm:$Cond), bb:$Label)],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteBr]> {
let Uses = [NZCV];
let isBranch = 1;
let isTerminator = 1;
@@ -1493,7 +1540,8 @@ class A64I_condcmpimmImpl<bit sf, bit op, RegisterClass GPR, string asmop>
: A64I_condcmpimm<sf, op, 0b0, 0b0, 0b1, (outs),
(ins GPR:$Rn, uimm5:$UImm5, uimm4:$NZCVImm, cond_code_op:$Cond),
!strconcat(asmop, "\t$Rn, $UImm5, $NZCVImm, $Cond"),
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteCMP, ReadCMP]> {
let Defs = [NZCV];
}
@@ -1512,7 +1560,8 @@ class A64I_condcmpregImpl<bit sf, bit op, RegisterClass GPR, string asmop>
(outs),
(ins GPR:$Rn, GPR:$Rm, uimm4:$NZCVImm, cond_code_op:$Cond),
!strconcat(asmop, "\t$Rn, $Rm, $NZCVImm, $Cond"),
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteCMP, ReadCMP, ReadCMP]> {
let Defs = [NZCV];
}
@@ -1559,7 +1608,8 @@ multiclass A64I_condselSizes<bit op, bits<2> op2, string asmop,
(ins GPR32:$Rn, GPR32:$Rm, cond_code_op:$Cond),
!strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"),
[(set i32:$Rd, (select i32:$Rn, i32:$Rm))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteCMP, ReadCMP, ReadCMP]>;
def xxxc : A64I_condsel<0b1, op, 0b0, op2,
@@ -1567,7 +1617,8 @@ multiclass A64I_condselSizes<bit op, bits<2> op2, string asmop,
(ins GPR64:$Rn, GPR64:$Rm, cond_code_op:$Cond),
!strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"),
[(set i64:$Rd, (select i64:$Rn, i64:$Rm))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteCMP, ReadCMP, ReadCMP]>;
}
}
@@ -1677,7 +1728,8 @@ class A64I_dp_1src_impl<bit sf, bits<6> opcode, string asmop,
(outs GPRrc:$Rd),
(ins GPRrc:$Rn),
patterns,
- itin>;
+ itin>,
+ Sched<[WriteALU, ReadALU]>;
multiclass A64I_dp_1src <bits<6> opcode, string asmop> {
let hasSideEffects = 0 in {
@@ -1733,7 +1785,8 @@ class dp_2src_impl<bit sf, bits<6> opcode, string asmop, list<dag> patterns,
(outs GPRsp:$Rd),
(ins GPRsp:$Rn, GPRsp:$Rm),
patterns,
- itin>;
+ itin>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
multiclass dp_2src_crc<bit c, string asmop> {
def B_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 0},
@@ -1745,7 +1798,8 @@ multiclass dp_2src_crc<bit c, string asmop> {
def X_wwx : A64I_dp_2src<0b1, {0, 1, 0, c, 1, 1}, 0b0,
!strconcat(asmop, "x\t$Rd, $Rn, $Rm"),
(outs GPR32:$Rd), (ins GPR32:$Rn, GPR64:$Rm), [],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
}
multiclass dp_2src_zext <bits<6> opcode, string asmop, SDPatternOperator op> {
@@ -1784,13 +1838,17 @@ multiclass dp_2src <bits<6> opcode, string asmop, SDPatternOperator op> {
defm CRC32 : dp_2src_crc<0b0, "crc32">;
defm CRC32C : dp_2src_crc<0b1, "crc32c">;
-defm UDIV : dp_2src<0b000010, "udiv", udiv>;
-defm SDIV : dp_2src<0b000011, "sdiv", sdiv>;
+let SchedRW = [WriteDiv, ReadDiv, ReadDiv] in {
+ defm UDIV : dp_2src<0b000010, "udiv", udiv>;
+ defm SDIV : dp_2src<0b000011, "sdiv", sdiv>;
+}
-defm LSLV : dp_2src_zext<0b001000, "lsl", shl>;
-defm LSRV : dp_2src_zext<0b001001, "lsr", srl>;
-defm ASRV : dp_2src_zext<0b001010, "asr", sra>;
-defm RORV : dp_2src_zext<0b001011, "ror", rotr>;
+let SchedRW = [WriteALUs, ReadALU, ReadALU] in {
+ defm LSLV : dp_2src_zext<0b001000, "lsl", shl>;
+ defm LSRV : dp_2src_zext<0b001001, "lsr", srl>;
+ defm ASRV : dp_2src_zext<0b001010, "asr", sra>;
+ defm RORV : dp_2src_zext<0b001011, "ror", rotr>;
+}
// Extra patterns for an incoming 64-bit value for a 32-bit
// operation. Since the LLVM operations are undefined (as in C) if the
@@ -1823,7 +1881,11 @@ class A64I_dp3_4operand<bit sf, bits<6> opcode, RegisterClass AccReg,
: A64I_dp3<sf, opcode,
(outs AccReg:$Rd), (ins SrcReg:$Rn, SrcReg:$Rm, AccReg:$Ra),
!strconcat(asmop, "\t$Rd, $Rn, $Rm, $Ra"),
- [(set AccTy:$Rd, pattern)], NoItinerary> {
+ [(set AccTy:$Rd, pattern)], NoItinerary>,
+ Sched<[WriteMAC, ReadMAC, ReadMAC, ReadMAC]> {
+ bits<5> Ra;
+ let Inst{14-10} = Ra;
+
RegisterClass AccGPR = AccReg;
RegisterClass SrcGPR = SrcReg;
}
@@ -1853,13 +1915,15 @@ let isCommutable = 1, PostEncoderMethod = "fixMulHigh" in {
(ins GPR64:$Rn, GPR64:$Rm),
"umulh\t$Rd, $Rn, $Rm",
[(set i64:$Rd, (mulhu i64:$Rn, i64:$Rm))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteMAC, ReadMAC, ReadMAC]>;
def SMULHxxx : A64I_dp3<0b1, 0b000100, (outs GPR64:$Rd),
(ins GPR64:$Rn, GPR64:$Rm),
"smulh\t$Rd, $Rn, $Rm",
[(set i64:$Rd, (mulhs i64:$Rn, i64:$Rm))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteMAC, ReadMAC, ReadMAC]>;
}
multiclass A64I_dp3_3operand<string asmop, A64I_dp3_4operand INST,
@@ -1907,7 +1971,8 @@ def uimm16 : Operand<i32> {
class A64I_exceptImpl<bits<3> opc, bits<2> ll, string asmop>
: A64I_exception<opc, 0b000, ll, (outs), (ins uimm16:$UImm16),
- !strconcat(asmop, "\t$UImm16"), [], NoItinerary> {
+ !strconcat(asmop, "\t$UImm16"), [], NoItinerary>,
+ Sched<[WriteBr]> {
let isBranch = 1;
let isTerminator = 1;
}
@@ -1938,14 +2003,16 @@ def EXTRwwwi : A64I_extract<0b0, 0b000, 0b0,
"extr\t$Rd, $Rn, $Rm, $LSB",
[(set i32:$Rd,
(A64Extr i32:$Rn, i32:$Rm, imm:$LSB))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def EXTRxxxi : A64I_extract<0b1, 0b000, 0b1,
(outs GPR64:$Rd),
(ins GPR64:$Rn, GPR64:$Rm, bitfield64_imm:$LSB),
"extr\t$Rd, $Rn, $Rm, $LSB",
[(set i64:$Rd,
(A64Extr i64:$Rn, i64:$Rm, imm:$LSB))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def : InstAlias<"ror $Rd, $Rs, $LSB",
(EXTRwwwi GPR32:$Rd, GPR32:$Rs, GPR32:$Rs, bitfield32_imm:$LSB)>;
@@ -1992,12 +2059,14 @@ def fpz64movi : Operand<i64>,
multiclass A64I_fpcmpSignal<bits<2> type, bit imm, dag ins, dag pattern> {
def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0},
(outs), ins, "fcmp\t$Rn, $Rm", [pattern],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
let Defs = [NZCV];
}
def _sig : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b1, imm, 0b0, 0b0, 0b0},
- (outs), ins, "fcmpe\t$Rn, $Rm", [], NoItinerary> {
+ (outs), ins, "fcmpe\t$Rn, $Rm", [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
let Defs = [NZCV];
}
}
@@ -2026,7 +2095,8 @@ class A64I_fpccmpImpl<bits<2> type, bit op, RegisterClass FPR, string asmop>
(outs),
(ins FPR:$Rn, FPR:$Rm, uimm4:$NZCVImm, cond_code_op:$Cond),
!strconcat(asmop, "\t$Rn, $Rm, $NZCVImm, $Cond"),
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
let Defs = [NZCV];
}
@@ -2044,9 +2114,10 @@ let Uses = [NZCV] in {
def FCSELsssc : A64I_fpcondsel<0b0, 0b0, 0b00, (outs FPR32:$Rd),
(ins FPR32:$Rn, FPR32:$Rm, cond_code_op:$Cond),
"fcsel\t$Rd, $Rn, $Rm, $Cond",
- [(set f32:$Rd,
+ [(set f32:$Rd,
(simple_select f32:$Rn, f32:$Rm))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def FCSELdddc : A64I_fpcondsel<0b0, 0b0, 0b01, (outs FPR64:$Rd),
@@ -2054,7 +2125,8 @@ let Uses = [NZCV] in {
"fcsel\t$Rd, $Rn, $Rm, $Cond",
[(set f64:$Rd,
(simple_select f64:$Rn, f64:$Rm))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
}
//===----------------------------------------------------------------------===//
@@ -2072,18 +2144,22 @@ multiclass A64I_fpdp1sizes<bits<6> opcode, string asmstr,
def ss : A64I_fpdp1<0b0, 0b0, 0b00, opcode, (outs FPR32:$Rd), (ins FPR32:$Rn),
!strconcat(asmstr, "\t$Rd, $Rn"),
[(set f32:$Rd, (opnode f32:$Rn))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def dd : A64I_fpdp1<0b0, 0b0, 0b01, opcode, (outs FPR64:$Rd), (ins FPR64:$Rn),
!strconcat(asmstr, "\t$Rd, $Rn"),
[(set f64:$Rd, (opnode f64:$Rn))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
defm FMOV : A64I_fpdp1sizes<0b000000, "fmov">;
defm FABS : A64I_fpdp1sizes<0b000001, "fabs", fabs>;
defm FNEG : A64I_fpdp1sizes<0b000010, "fneg", fneg>;
-defm FSQRT : A64I_fpdp1sizes<0b000011, "fsqrt", fsqrt>;
+let SchedRW = [WriteFPSqrt, ReadFPSqrt] in {
+ defm FSQRT : A64I_fpdp1sizes<0b000011, "fsqrt", fsqrt>;
+}
defm FRINTN : A64I_fpdp1sizes<0b001000, "frintn">;
defm FRINTP : A64I_fpdp1sizes<0b001001, "frintp", fceil>;
@@ -2112,7 +2188,8 @@ class A64I_fpdp1_fcvt<FCVTRegType DestReg, FCVTRegType SrcReg, SDNode opnode>
{0,0,0,1, DestReg.t1, DestReg.t0},
(outs DestReg.Class:$Rd), (ins SrcReg.Class:$Rn),
"fcvt\t$Rd, $Rn",
- [(set DestReg.VT:$Rd, (opnode SrcReg.VT:$Rn))], NoItinerary>;
+ [(set DestReg.VT:$Rd, (opnode SrcReg.VT:$Rn))], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def FCVTds : A64I_fpdp1_fcvt<FCVT64, FCVT32, fextend>;
def FCVThs : A64I_fpdp1_fcvt<FCVT16, FCVT32, fround>;
@@ -2137,18 +2214,22 @@ multiclass A64I_fpdp2sizes<bits<4> opcode, string asmstr,
(ins FPR32:$Rn, FPR32:$Rm),
!strconcat(asmstr, "\t$Rd, $Rn, $Rm"),
[(set f32:$Rd, (opnode f32:$Rn, f32:$Rm))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def ddd : A64I_fpdp2<0b0, 0b0, 0b01, opcode,
(outs FPR64:$Rd),
(ins FPR64:$Rn, FPR64:$Rm),
!strconcat(asmstr, "\t$Rd, $Rn, $Rm"),
[(set f64:$Rd, (opnode f64:$Rn, f64:$Rm))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
}
let isCommutable = 1 in {
- defm FMUL : A64I_fpdp2sizes<0b0000, "fmul", fmul>;
+ let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
+ defm FMUL : A64I_fpdp2sizes<0b0000, "fmul", fmul>;
+ }
defm FADD : A64I_fpdp2sizes<0b0010, "fadd", fadd>;
// No patterns for these.
@@ -2157,12 +2238,16 @@ let isCommutable = 1 in {
defm FMAXNM : A64I_fpdp2sizes<0b0110, "fmaxnm", FPNoBinop>;
defm FMINNM : A64I_fpdp2sizes<0b0111, "fminnm", FPNoBinop>;
- defm FNMUL : A64I_fpdp2sizes<0b1000, "fnmul",
- PatFrag<(ops node:$lhs, node:$rhs),
- (fneg (fmul node:$lhs, node:$rhs))> >;
+ let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
+ defm FNMUL : A64I_fpdp2sizes<0b1000, "fnmul",
+ PatFrag<(ops node:$lhs, node:$rhs),
+ (fneg (fmul node:$lhs, node:$rhs))> >;
+ }
}
-defm FDIV : A64I_fpdp2sizes<0b0001, "fdiv", fdiv>;
+let SchedRW = [WriteFPDiv, ReadFPDiv, ReadFPDiv] in {
+ defm FDIV : A64I_fpdp2sizes<0b0001, "fdiv", fdiv>;
+}
defm FSUB : A64I_fpdp2sizes<0b0011, "fsub", fsub>;
//===----------------------------------------------------------------------===//
@@ -2172,9 +2257,9 @@ defm FSUB : A64I_fpdp2sizes<0b0011, "fsub", fsub>;
def fmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
(fma (fneg node:$Rn), node:$Rm, node:$Ra)>;
-def fnmadd : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
- (fma node:$Rn, node:$Rm, (fneg node:$Ra))>;
def fnmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
+ (fma node:$Rn, node:$Rm, (fneg node:$Ra))>;
+def fnmadd : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
(fma (fneg node:$Rn), node:$Rm, (fneg node:$Ra))>;
class A64I_fpdp3Impl<string asmop, RegisterClass FPR, ValueType VT,
@@ -2183,7 +2268,8 @@ class A64I_fpdp3Impl<string asmop, RegisterClass FPR, ValueType VT,
(ins FPR:$Rn, FPR:$Rm, FPR:$Ra),
!strconcat(asmop,"\t$Rd, $Rn, $Rm, $Ra"),
[(set VT:$Rd, (fmakind VT:$Rn, VT:$Rm, VT:$Ra))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]>;
def FMADDssss : A64I_fpdp3Impl<"fmadd", FPR32, f32, 0b00, 0b0, 0b0, fma>;
def FMSUBssss : A64I_fpdp3Impl<"fmsub", FPR32, f32, 0b00, 0b0, 0b1, fmsub>;
@@ -2198,22 +2284,22 @@ def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>;
// Extra patterns for when we're allowed to optimise separate multiplication and
// addition.
let Predicates = [HasFPARMv8, UseFusedMAC] in {
-def : Pat<(f32 (fadd FPR32:$Ra, (f32 (fmul FPR32:$Rn, FPR32:$Rm)))),
+def : Pat<(f32 (fadd FPR32:$Ra, (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))),
(FMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f32 (fsub FPR32:$Ra, (f32 (fmul FPR32:$Rn, FPR32:$Rm)))),
+def : Pat<(f32 (fsub FPR32:$Ra, (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))),
(FMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f32 (fsub (f32 (fmul FPR32:$Rn, FPR32:$Rm)), FPR32:$Ra)),
+def : Pat<(f32 (fsub (f32 (fneg FPR32:$Ra)), (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))),
(FNMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f32 (fsub (f32 (fneg FPR32:$Ra)), (f32 (fmul FPR32:$Rn, FPR32:$Rm)))),
+def : Pat<(f32 (fsub (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)), FPR32:$Ra)),
(FNMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f64 (fadd FPR64:$Ra, (f64 (fmul FPR64:$Rn, FPR64:$Rm)))),
+def : Pat<(f64 (fadd FPR64:$Ra, (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))),
(FMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(f64 (fsub FPR64:$Ra, (f64 (fmul FPR64:$Rn, FPR64:$Rm)))),
+def : Pat<(f64 (fsub FPR64:$Ra, (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))),
(FMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(f64 (fsub (f64 (fmul FPR64:$Rn, FPR64:$Rm)), FPR64:$Ra)),
+def : Pat<(f64 (fsub (f64 (fneg FPR64:$Ra)), (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))),
(FNMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(f64 (fsub (f64 (fneg FPR64:$Ra)), (f64 (fmul FPR64:$Rn, FPR64:$Rm)))),
+def : Pat<(f64 (fsub (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)), FPR64:$Ra)),
(FNMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
}
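
The reordered fnmadd/fnmsub fragments and the swapped FNMADD/FNMSUB patterns above make the DAG shapes agree with what the instructions compute: FNMADD produces -(Rn*Rm) - Ra and FNMSUB produces Rn*Rm - Ra, so the (fsub (fneg Ra), (fmul ...)) form belongs to FNMADD and (fsub (fmul ...), Ra) to FNMSUB. A quick numeric check of the identities the fragments encode (plain C++, values chosen to be exactly representable so the comparisons are exact):

    #include <cassert>
    #include <cmath>

    int main() {
      double n = 1.5, m = -2.25, a = 4.0;
      assert(std::fma(n, m, a)   == a + n * m);   // fmadd:  Ra + Rn*Rm
      assert(std::fma(-n, m, a)  == a - n * m);   // fmsub:  Ra - Rn*Rm
      assert(std::fma(-n, m, -a) == -a - n * m);  // fnmadd: -Ra - Rn*Rm
      assert(std::fma(n, m, -a)  == n * m - a);   // fnmsub: Rn*Rm - Ra
      return 0;
    }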
@@ -2262,14 +2348,15 @@ class cvtfix_i64_op<ValueType FloatVT>
// worth going for a multiclass here. Oh well.
class A64I_fptofix<bit sf, bits<2> type, bits<3> opcode,
- RegisterClass GPR, RegisterClass FPR,
- ValueType DstTy, ValueType SrcTy,
+ RegisterClass GPR, RegisterClass FPR,
+ ValueType DstTy, ValueType SrcTy,
Operand scale_op, string asmop, SDNode cvtop>
: A64I_fpfixed<sf, 0b0, type, 0b11, opcode,
(outs GPR:$Rd), (ins FPR:$Rn, scale_op:$Scale),
!strconcat(asmop, "\t$Rd, $Rn, $Scale"),
[(set DstTy:$Rd, (cvtop (fmul SrcTy:$Rn, scale_op:$Scale)))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def FCVTZSwsi : A64I_fptofix<0b0, 0b00, 0b000, GPR32, FPR32, i32, f32,
cvtfix_i32_op<f32>, "fcvtzs", fp_to_sint>;
@@ -2298,7 +2385,8 @@ class A64I_fixtofp<bit sf, bits<2> type, bits<3> opcode,
(outs FPR:$Rd), (ins GPR:$Rn, scale_op:$Scale),
!strconcat(asmop, "\t$Rd, $Rn, $Scale"),
[(set DstTy:$Rd, (fdiv (cvtop SrcTy:$Rn), scale_op:$Scale))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def SCVTFswi : A64I_fixtofp<0b0, 0b00, 0b010, FPR32, GPR32, f32, i32,
cvtfix_i32_op<f32>, "scvtf", sint_to_fp>;
@@ -2325,7 +2413,8 @@ def UCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b011, FPR64, GPR64, f64, i64,
class A64I_fpintI<bit sf, bits<2> type, bits<2> rmode, bits<3> opcode,
RegisterClass DestPR, RegisterClass SrcPR, string asmop>
: A64I_fpint<sf, 0b0, type, rmode, opcode, (outs DestPR:$Rd), (ins SrcPR:$Rn),
- !strconcat(asmop, "\t$Rd, $Rn"), [], NoItinerary>;
+ !strconcat(asmop, "\t$Rd, $Rn"), [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
multiclass A64I_fptointRM<bits<2> rmode, bit o2, string asmop> {
def Sws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 0},
@@ -2411,11 +2500,13 @@ def lane1 : Operand<i32> {
let DecoderMethod = "DecodeFMOVLaneInstruction" in {
def FMOVxv : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b110,
(outs GPR64:$Rd), (ins VPR128:$Rn, lane1:$Lane),
- "fmov\t$Rd, $Rn.d[$Lane]", [], NoItinerary>;
+ "fmov\t$Rd, $Rn.d[$Lane]", [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def FMOVvx : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b111,
(outs VPR128:$Rd), (ins GPR64:$Rn, lane1:$Lane),
- "fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>;
+ "fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
let Predicates = [HasFPARMv8] in {
@@ -2462,7 +2553,8 @@ class A64I_fpimm_impl<bits<2> type, RegisterClass Reg, ValueType VT,
(ins fmov_operand:$Imm8),
"fmov\t$Rd, $Imm8",
[(set VT:$Rd, fmov_operand:$Imm8)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU]>;
def FMOVsi : A64I_fpimm_impl<0b00, FPR32, f32, fmov32_operand>;
def FMOVdi : A64I_fpimm_impl<0b01, FPR64, f64, fmov64_operand>;
@@ -2511,7 +2603,8 @@ defm prefetch : namedimm<"prefetch", "A64PRFM::PRFMMapper">;
class A64I_LDRlitSimple<bits<2> opc, bit v, RegisterClass OutReg,
list<dag> patterns = []>
: A64I_LDRlit<opc, v, (outs OutReg:$Rt), (ins ldrlit_label:$Imm19),
- "ldr\t$Rt, $Imm19", patterns, NoItinerary>;
+ "ldr\t$Rt, $Imm19", patterns, NoItinerary>,
+ Sched<[WriteLd]>;
let mayLoad = 1 in {
def LDRw_lit : A64I_LDRlitSimple<0b00, 0b0, GPR32>;
@@ -2528,17 +2621,18 @@ let mayLoad = 1 in {
def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>;
}
-
def LDRSWx_lit : A64I_LDRlit<0b10, 0b0,
(outs GPR64:$Rt),
(ins ldrlit_label:$Imm19),
"ldrsw\t$Rt, $Imm19",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd]>;
def PRFM_lit : A64I_LDRlit<0b11, 0b0,
(outs), (ins prefetch_op:$Rt, ldrlit_label:$Imm19),
"prfm\t$Rt, $Imm19",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]>;
}
//===----------------------------------------------------------------------===//
@@ -2587,24 +2681,29 @@ class A64I_SRexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
pat, itin> {
let mayStore = 1;
let PostEncoderMethod = "fixLoadStoreExclusive<1,0>";
+ let Constraints = "@earlyclobber $Rs";
}
multiclass A64I_SRex<string asmstr, bits<3> opcode, string prefix> {
def _byte: A64I_SRexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
(outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt]>;
def _hword: A64I_SRexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
(outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
- [],NoItinerary>;
+ [],NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt]>;
def _word: A64I_SRexs_impl<0b10, opcode, asmstr,
(outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt]>;
def _dword: A64I_SRexs_impl<0b11, opcode, asmstr,
(outs GPR32:$Rs), (ins GPR64:$Rt, GPR64xsp0:$Rn),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt]>;
}
defm STXR : A64I_SRex<"stxr", 0b000, "STXR">;
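
The @earlyclobber constraint added to the store-exclusive status register above keeps the register allocator from assigning $Rs the same register as the data or address operands, a combination the architecture treats as unpredictable. The same idea at the C level is the "&" early-clobber marker in extended inline asm; a rough AArch64-only sketch (in real code the stxr follows a matching ldxr that set up the exclusive monitor):

    #include <cstdint>

    // Illustration only: the "&" on the status output plays the same role as
    // @earlyclobber, so 'status' never shares a register with the inputs.
    static bool try_store_exclusive(std::uint64_t *addr, std::uint64_t value) {
      std::uint32_t status;
      asm volatile("stxr %w0, %2, [%1]"
                   : "=&r"(status)
                   : "r"(addr), "r"(value)
                   : "memory");
      return status == 0;              // 0 => the exclusive store succeeded
    }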
@@ -2629,19 +2728,23 @@ class A64I_LRexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
multiclass A64I_LRex<string asmstr, bits<3> opcode> {
def _byte: A64I_LRexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
(outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd]>;
def _hword: A64I_LRexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
(outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd]>;
def _word: A64I_LRexs_impl<0b10, opcode, asmstr,
(outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd]>;
def _dword: A64I_LRexs_impl<0b11, opcode, asmstr,
(outs GPR64:$Rt), (ins GPR64xsp0:$Rn),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd]>;
}
defm LDXR : A64I_LRex<"ldxr", 0b000>;
@@ -2695,22 +2798,26 @@ multiclass A64I_SLex<string asmstr, bits<3> opcode, string prefix> {
def _byte: A64I_SLexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
(outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
[(atomic_store_release_8 i64:$Rn, i32:$Rt)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt]>;
def _hword: A64I_SLexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
(outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
[(atomic_store_release_16 i64:$Rn, i32:$Rt)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt]>;
def _word: A64I_SLexs_impl<0b10, opcode, asmstr,
(outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
[(atomic_store_release_32 i64:$Rn, i32:$Rt)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt]>;
def _dword: A64I_SLexs_impl<0b11, opcode, asmstr,
(outs), (ins GPR64:$Rt, GPR64xsp0:$Rn),
[(atomic_store_release_64 i64:$Rn, i64:$Rt)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt]>;
}
defm STLR : A64I_SLex<"stlr", 0b101, "STLR">;
@@ -2735,12 +2842,14 @@ multiclass A64I_SPex<string asmstr, bits<3> opcode> {
def _word: A64I_SPexs_impl<0b10, opcode, asmstr, (outs),
(ins GPR32:$Rs, GPR32:$Rt, GPR32:$Rt2,
GPR64xsp0:$Rn),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>;
def _dword: A64I_SPexs_impl<0b11, opcode, asmstr, (outs),
(ins GPR32:$Rs, GPR64:$Rt, GPR64:$Rt2,
GPR64xsp0:$Rn),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>;
}
defm STXP : A64I_SPex<"stxp", 0b010>;
@@ -2767,12 +2876,14 @@ multiclass A64I_LPex<string asmstr, bits<3> opcode> {
def _word: A64I_LPexs_impl<0b10, opcode, asmstr,
(outs GPR32:$Rt, GPR32:$Rt2),
(ins GPR64xsp0:$Rn),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]>;
def _dword: A64I_LPexs_impl<0b11, opcode, asmstr,
(outs GPR64:$Rt, GPR64:$Rt2),
(ins GPR64xsp0:$Rn),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]>;
}
defm LDXP : A64I_LPex<"ldxp", 0b010>;
@@ -2986,7 +3097,8 @@ multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
def _STR : A64I_LSunsigimm<size, v, {high_opc, 0b0},
(outs), (ins GPR:$Rt, GPR64xsp:$Rn, params.uimm12:$UImm12),
"str" # asmsuffix # "\t$Rt, [$Rn, $UImm12]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt]> {
let mayStore = 1;
}
def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn]",
@@ -2995,7 +3107,8 @@ multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
def _LDR : A64I_LSunsigimm<size, v, {high_opc, 0b1},
(outs GPR:$Rt), (ins GPR64xsp:$Rn, params.uimm12:$UImm12),
"ldr" # asmsuffix # "\t$Rt, [$Rn, $UImm12]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]> {
let mayLoad = 1;
}
def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn]",
@@ -3007,13 +3120,15 @@ multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
(outs GPR:$Rt),
(ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext),
"ldr" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd, ReadLd]>;
def _Xm_RegOffset_LDR : A64I_LSregoff<size, v, {high_opc, 0b1}, 0b1,
(outs GPR:$Rt),
(ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext),
"ldr" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd, ReadLd]>;
}
def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn, $Rm]",
(!cast<Instruction>(prefix # "_Xm_RegOffset_LDR") GPR:$Rt, GPR64xsp:$Rn,
@@ -3024,13 +3139,15 @@ multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
(outs), (ins GPR:$Rt, GPR64xsp:$Rn, GPR32:$Rm,
params.regextWm:$Ext),
"str" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>;
def _Xm_RegOffset_STR : A64I_LSregoff<size, v, {high_opc, 0b0}, 0b1,
(outs), (ins GPR:$Rt, GPR64xsp:$Rn, GPR64:$Rm,
params.regextXm:$Ext),
"str" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>;
}
def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn, $Rm]",
(!cast<Instruction>(prefix # "_Xm_RegOffset_STR") GPR:$Rt, GPR64xsp:$Rn,
@@ -3040,7 +3157,8 @@ multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
def _STUR : A64I_LSunalimm<size, v, {high_opc, 0b0},
(outs), (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
"stur" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt]> {
let mayStore = 1;
}
def : InstAlias<"stur" # asmsuffix # " $Rt, [$Rn]",
@@ -3049,7 +3167,8 @@ multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
def _LDUR : A64I_LSunalimm<size, v, {high_opc, 0b1},
(outs GPR:$Rt), (ins GPR64xsp:$Rn, simm9:$SImm9),
"ldur" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]> {
let mayLoad = 1;
}
def : InstAlias<"ldur" # asmsuffix # " $Rt, [$Rn]",
@@ -3060,7 +3179,8 @@ multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
(outs GPR64xsp:$Rn_wb),
(ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
"str" # asmsuffix # "\t$Rt, [$Rn], $SImm9",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt]> {
let Constraints = "$Rn = $Rn_wb";
let mayStore = 1;
@@ -3072,7 +3192,8 @@ multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
(outs GPR:$Rt, GPR64xsp:$Rn_wb),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldr" # asmsuffix # "\t$Rt, [$Rn], $SImm9",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]> {
let mayLoad = 1;
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeSingleIndexedInstruction";
@@ -3083,7 +3204,8 @@ multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
(outs GPR64xsp:$Rn_wb),
(ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
"str" # asmsuffix # "\t$Rt, [$Rn, $SImm9]!",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt]> {
let Constraints = "$Rn = $Rn_wb";
let mayStore = 1;
@@ -3095,7 +3217,8 @@ multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
(outs GPR:$Rt, GPR64xsp:$Rn_wb),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]!",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]> {
let mayLoad = 1;
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeSingleIndexedInstruction";
@@ -3155,7 +3278,8 @@ multiclass A64I_LDR_signed<bits<2> size, string asmopcode, AddrParams params,
(outs GPR32:$Rt),
(ins GPR64xsp:$Rn, params.uimm12:$UImm12),
"ldrs" # asmopcode # "\t$Rt, [$Rn, $UImm12]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]> {
let mayLoad = 1;
}
def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]",
@@ -3165,7 +3289,8 @@ multiclass A64I_LDR_signed<bits<2> size, string asmopcode, AddrParams params,
(outs GPR64:$Rt),
(ins GPR64xsp:$Rn, params.uimm12:$UImm12),
"ldrs" # asmopcode # "\t$Rt, [$Rn, $UImm12]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]> {
let mayLoad = 1;
}
def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]",
@@ -3177,25 +3302,29 @@ multiclass A64I_LDR_signed<bits<2> size, string asmopcode, AddrParams params,
(outs GPR32:$Rt),
(ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext),
"ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd, ReadLd]>;
def w_Xm_RegOffset : A64I_LSregoff<size, 0b0, 0b11, 0b1,
(outs GPR32:$Rt),
(ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext),
"ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd, ReadLd]>;
def x_Wm_RegOffset : A64I_LSregoff<size, 0b0, 0b10, 0b0,
(outs GPR64:$Rt),
(ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext),
"ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd, ReadLd]>;
def x_Xm_RegOffset : A64I_LSregoff<size, 0b0, 0b10, 0b1,
(outs GPR64:$Rt),
(ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext),
"ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd, ReadLd]>;
}
def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]",
(!cast<Instruction>(prefix # "w_Xm_RegOffset") GPR32:$Rt, GPR64xsp:$Rn,
@@ -3212,13 +3341,15 @@ multiclass A64I_LDR_signed<bits<2> size, string asmopcode, AddrParams params,
(outs GPR32:$Rt),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldurs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]>;
def x_U : A64I_LSunalimm<size, 0b0, 0b10,
(outs GPR64:$Rt),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldurs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]>;
// Post-indexed
@@ -3226,7 +3357,8 @@ multiclass A64I_LDR_signed<bits<2> size, string asmopcode, AddrParams params,
(outs GPR32:$Rt, GPR64xsp:$Rn_wb),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldrs" # asmopcode # "\t$Rt, [$Rn], $SImm9",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]> {
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeSingleIndexedInstruction";
}
@@ -3235,7 +3367,8 @@ multiclass A64I_LDR_signed<bits<2> size, string asmopcode, AddrParams params,
(outs GPR64:$Rt, GPR64xsp:$Rn_wb),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldrs" # asmopcode # "\t$Rt, [$Rn], $SImm9",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]> {
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeSingleIndexedInstruction";
}
@@ -3245,7 +3378,8 @@ multiclass A64I_LDR_signed<bits<2> size, string asmopcode, AddrParams params,
(outs GPR32:$Rt, GPR64xsp:$Rn_wb),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]!",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]> {
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeSingleIndexedInstruction";
}
@@ -3254,7 +3388,8 @@ multiclass A64I_LDR_signed<bits<2> size, string asmopcode, AddrParams params,
(outs GPR64:$Rt, GPR64xsp:$Rn_wb),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]!",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]> {
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeSingleIndexedInstruction";
}
@@ -3272,7 +3407,8 @@ def LDRSWx
(outs GPR64:$Rt),
(ins GPR64xsp:$Rn, word_uimm12:$UImm12),
"ldrsw\t$Rt, [$Rn, $UImm12]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]> {
let mayLoad = 1;
}
def : InstAlias<"ldrsw $Rt, [$Rn]", (LDRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>;
@@ -3282,13 +3418,15 @@ let mayLoad = 1 in {
(outs GPR64:$Rt),
(ins GPR64xsp:$Rn, GPR32:$Rm, word_Wm_regext:$Ext),
"ldrsw\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd, ReadLd]>;
def LDRSWx_Xm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b1,
(outs GPR64:$Rt),
(ins GPR64xsp:$Rn, GPR64:$Rm, word_Xm_regext:$Ext),
"ldrsw\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd, ReadLd]>;
}
def : InstAlias<"ldrsw $Rt, [$Rn, $Rm]",
(LDRSWx_Xm_RegOffset GPR64:$Rt, GPR64xsp:$Rn, GPR64:$Rm, 2)>;
@@ -3299,7 +3437,8 @@ def LDURSWx
(outs GPR64:$Rt),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldursw\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]> {
let mayLoad = 1;
}
def : InstAlias<"ldursw $Rt, [$Rn]", (LDURSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>;
@@ -3309,7 +3448,8 @@ def LDRSWx_PostInd
(outs GPR64:$Rt, GPR64xsp:$Rn_wb),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldrsw\t$Rt, [$Rn], $SImm9",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]> {
let mayLoad = 1;
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeSingleIndexedInstruction";
@@ -3319,7 +3459,8 @@ def LDRSWx_PreInd : A64I_LSpreind<0b10, 0b0, 0b10,
(outs GPR64:$Rt, GPR64xsp:$Rn_wb),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldrsw\t$Rt, [$Rn, $SImm9]!",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]> {
let mayLoad = 1;
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeSingleIndexedInstruction";
@@ -3332,7 +3473,8 @@ def LDRSWx_PreInd : A64I_LSpreind<0b10, 0b0, 0b10,
def PRFM : A64I_LSunsigimm<0b11, 0b0, 0b10, (outs),
(ins prefetch_op:$Rt, GPR64xsp:$Rn, dword_uimm12:$UImm12),
"prfm\t$Rt, [$Rn, $UImm12]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WritePreLd, ReadPreLd]> {
let mayLoad = 1;
}
def : InstAlias<"prfm $Rt, [$Rn]",
@@ -3343,12 +3485,14 @@ let mayLoad = 1 in {
(ins prefetch_op:$Rt, GPR64xsp:$Rn,
GPR32:$Rm, dword_Wm_regext:$Ext),
"prfm\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WritePreLd, ReadPreLd]>;
def PRFM_Xm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b1, (outs),
(ins prefetch_op:$Rt, GPR64xsp:$Rn,
GPR64:$Rm, dword_Xm_regext:$Ext),
"prfm\t$Rt, [$Rn, $Rm, $Ext]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WritePreLd, ReadPreLd]>;
}
def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
@@ -3359,7 +3503,8 @@ def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
def PRFUM : A64I_LSunalimm<0b11, 0b0, 0b10, (outs),
(ins prefetch_op:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
"prfum\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WritePreLd, ReadPreLd]> {
let mayLoad = 1;
}
def : InstAlias<"prfum $Rt, [$Rn]",
@@ -3379,7 +3524,8 @@ multiclass A64I_LDTRSTTR<bits<2> size, string asmsuffix, RegisterClass GPR,
def _UnPriv_STR : A64I_LSunpriv<size, 0b0, 0b00,
(outs), (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
"sttr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]> {
let mayStore = 1;
}
@@ -3389,7 +3535,8 @@ multiclass A64I_LDTRSTTR<bits<2> size, string asmsuffix, RegisterClass GPR,
def _UnPriv_LDR : A64I_LSunpriv<size, 0b0, 0b01,
(outs GPR:$Rt), (ins GPR64xsp:$Rn, simm9:$SImm9),
"ldtr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]> {
let mayLoad = 1;
}
@@ -3418,13 +3565,15 @@ multiclass A64I_LDTR_signed<bits<2> size, string asmopcode, string prefix> {
(outs GPR32:$Rt),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldtrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]>;
def x : A64I_LSunpriv<size, 0b0, 0b10,
(outs GPR64:$Rt),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldtrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]>;
}
def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]",
@@ -3445,7 +3594,8 @@ def LDTRSWx : A64I_LSunpriv<0b10, 0b0, 0b10,
(outs GPR64:$Rt),
(ins GPR64xsp:$Rn, simm9:$SImm9),
"ldtrsw\t$Rt, [$Rn, $SImm9]",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]> {
let mayLoad = 1;
}
def : InstAlias<"ldtrsw $Rt, [$Rn]", (LDTRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>;
@@ -3507,7 +3657,8 @@ multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg,
Operand simm7, string prefix> {
def _STR : A64I_LSPoffset<opc, v, 0b0, (outs),
(ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
- "stp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> {
+ "stp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
+ Sched<[WriteLd, ReadLd]> {
let mayStore = 1;
let DecoderMethod = "DecodeLDSTPairInstruction";
}
@@ -3518,7 +3669,8 @@ multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg,
def _LDR : A64I_LSPoffset<opc, v, 0b1,
(outs SomeReg:$Rt, SomeReg:$Rt2),
(ins GPR64xsp:$Rn, simm7:$SImm7),
- "ldp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> {
+ "ldp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]> {
let mayLoad = 1;
let DecoderMethod = "DecodeLDSTPairInstruction";
}
@@ -3532,7 +3684,8 @@ multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg,
GPR64xsp:$Rn,
simm7:$SImm7),
"stp\t$Rt, $Rt2, [$Rn], $SImm7",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> {
let mayStore = 1;
let Constraints = "$Rn = $Rn_wb";
@@ -3544,16 +3697,18 @@ multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg,
(outs SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn_wb),
(ins GPR64xsp:$Rn, simm7:$SImm7),
"ldp\t$Rt, $Rt2, [$Rn], $SImm7",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> {
let mayLoad = 1;
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeLDSTPairInstruction";
}
def _PreInd_STR : A64I_LSPpreind<opc, v, 0b0, (outs GPR64xsp:$Rn_wb),
- (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
- "stp\t$Rt, $Rt2, [$Rn, $SImm7]!",
- [], NoItinerary> {
+ (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
+ "stp\t$Rt, $Rt2, [$Rn, $SImm7]!",
+ [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> {
let mayStore = 1;
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeLDSTPairInstruction";
@@ -3563,15 +3718,17 @@ multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg,
(outs SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn_wb),
(ins GPR64xsp:$Rn, simm7:$SImm7),
"ldp\t$Rt, $Rt2, [$Rn, $SImm7]!",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> {
let mayLoad = 1;
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeLDSTPairInstruction";
}
def _NonTemp_STR : A64I_LSPnontemp<opc, v, 0b0, (outs),
- (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
- "stnp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> {
+ (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
+ "stnp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
+ Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> {
let mayStore = 1;
let DecoderMethod = "DecodeLDSTPairInstruction";
}
@@ -3582,7 +3739,8 @@ multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg,
def _NonTemp_LDR : A64I_LSPnontemp<opc, v, 0b1,
(outs SomeReg:$Rt, SomeReg:$Rt2),
(ins GPR64xsp:$Rn, simm7:$SImm7),
- "ldnp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> {
+ "ldnp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]> {
let mayLoad = 1;
let DecoderMethod = "DecodeLDSTPairInstruction";
}
@@ -3607,7 +3765,8 @@ defm LSFPPair128 : A64I_LSPsimple<0b10, 0b1, FPR128, qword_simm7,
def LDPSWx : A64I_LSPoffset<0b01, 0b0, 0b1,
(outs GPR64:$Rt, GPR64:$Rt2),
(ins GPR64xsp:$Rn, word_simm7:$SImm7),
- "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> {
+ "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, ReadLd]> {
let mayLoad = 1;
let DecoderMethod = "DecodeLDSTPairInstruction";
}
@@ -3618,7 +3777,8 @@ def LDPSWx_PostInd : A64I_LSPpostind<0b01, 0b0, 0b1,
(outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb),
(ins GPR64xsp:$Rn, word_simm7:$SImm7),
"ldpsw\t$Rt, $Rt2, [$Rn], $SImm7",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> {
let mayLoad = 1;
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeLDSTPairInstruction";
@@ -3628,7 +3788,8 @@ def LDPSWx_PreInd : A64I_LSPpreind<0b01, 0b0, 0b1,
(outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb),
(ins GPR64xsp:$Rn, word_simm7:$SImm7),
"ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]!",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> {
let mayLoad = 1;
let Constraints = "$Rn = $Rn_wb";
let DecoderMethod = "DecodeLDSTPairInstruction";
@@ -3673,14 +3834,16 @@ multiclass A64I_logimmSizes<bits<2> opc, string asmop, SDNode opnode> {
!strconcat(asmop, "\t$Rd, $Rn, $Imm"),
[(set i32:$Rd,
(opnode i32:$Rn, logical_imm32_operand:$Imm))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU]>;
def xxi : A64I_logicalimm<0b1, opc, (outs GPR64xsp:$Rd),
(ins GPR64:$Rn, logical_imm64_operand:$Imm),
!strconcat(asmop, "\t$Rd, $Rn, $Imm"),
[(set i64:$Rd,
(opnode i64:$Rn, logical_imm64_operand:$Imm))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU]>;
}
defm AND : A64I_logimmSizes<0b00, "and", and>;
@@ -3691,12 +3854,14 @@ let Defs = [NZCV] in {
def ANDSwwi : A64I_logicalimm<0b0, 0b11, (outs GPR32:$Rd),
(ins GPR32:$Rn, logical_imm32_operand:$Imm),
"ands\t$Rd, $Rn, $Imm",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteALU, ReadALU]>;
def ANDSxxi : A64I_logicalimm<0b1, 0b11, (outs GPR64:$Rd),
(ins GPR64:$Rn, logical_imm64_operand:$Imm),
"ands\t$Rd, $Rn, $Imm",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteALU, ReadALU]>;
}
@@ -3741,7 +3906,8 @@ multiclass logical_shifts<string prefix, bit sf, bits<2> opc,
[(set ty:$Rd, (opfrag ty:$Rn, (shl ty:$Rm,
!cast<Operand>("lsl_operand_" # ty):$Imm6))
)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def _lsr : A64I_logicalshift<sf, opc, 0b01, N,
(outs GPR:$Rd),
@@ -3751,7 +3917,8 @@ multiclass logical_shifts<string prefix, bit sf, bits<2> opc,
[(set ty:$Rd, (opfrag ty:$Rn, (srl ty:$Rm,
!cast<Operand>("lsr_operand_" # ty):$Imm6))
)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def _asr : A64I_logicalshift<sf, opc, 0b10, N,
(outs GPR:$Rd),
@@ -3761,7 +3928,8 @@ multiclass logical_shifts<string prefix, bit sf, bits<2> opc,
[(set ty:$Rd, (opfrag ty:$Rn, (sra ty:$Rm,
!cast<Operand>("asr_operand_" # ty):$Imm6))
)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def _ror : A64I_logicalshift<sf, opc, 0b11, N,
(outs GPR:$Rd),
@@ -3771,7 +3939,8 @@ multiclass logical_shifts<string prefix, bit sf, bits<2> opc,
[(set ty:$Rd, (opfrag ty:$Rn, (rotr ty:$Rm,
!cast<Operand>("ror_operand_" # ty):$Imm6))
)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
}
def _noshift
@@ -3826,7 +3995,8 @@ multiclass tst_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
[(set NZCV, (A64setcc (and ty:$Rn, (shl ty:$Rm,
!cast<Operand>("lsl_operand_" # ty):$Imm6)),
0, signed_cond))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def _lsr : A64I_logicalshift<sf, 0b11, 0b01, 0b0,
@@ -3837,7 +4007,8 @@ multiclass tst_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
[(set NZCV, (A64setcc (and ty:$Rn, (srl ty:$Rm,
!cast<Operand>("lsr_operand_" # ty):$Imm6)),
0, signed_cond))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def _asr : A64I_logicalshift<sf, 0b11, 0b10, 0b0,
(outs),
@@ -3847,7 +4018,8 @@ multiclass tst_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
[(set NZCV, (A64setcc (and ty:$Rn, (sra ty:$Rm,
!cast<Operand>("asr_operand_" # ty):$Imm6)),
0, signed_cond))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def _ror : A64I_logicalshift<sf, 0b11, 0b11, 0b0,
(outs),
@@ -3857,7 +4029,8 @@ multiclass tst_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
[(set NZCV, (A64setcc (and ty:$Rn, (rotr ty:$Rm,
!cast<Operand>("ror_operand_" # ty):$Imm6)),
0, signed_cond))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
}
def _noshift : InstAlias<"tst $Rn, $Rm",
@@ -3880,7 +4053,8 @@ multiclass mvn_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
"mvn\t$Rd, $Rm, $Imm6",
[(set ty:$Rd, (not (shl ty:$Rm,
!cast<Operand>("lsl_operand_" # ty):$Imm6)))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def _lsr : A64I_logicalshift<sf, 0b01, 0b01, 0b1,
@@ -3890,7 +4064,8 @@ multiclass mvn_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
"mvn\t$Rd, $Rm, $Imm6",
[(set ty:$Rd, (not (srl ty:$Rm,
!cast<Operand>("lsr_operand_" # ty):$Imm6)))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def _asr : A64I_logicalshift<sf, 0b01, 0b10, 0b1,
(outs GPR:$Rd),
@@ -3899,7 +4074,8 @@ multiclass mvn_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
"mvn\t$Rd, $Rm, $Imm6",
[(set ty:$Rd, (not (sra ty:$Rm,
!cast<Operand>("asr_operand_" # ty):$Imm6)))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
def _ror : A64I_logicalshift<sf, 0b01, 0b11, 0b1,
(outs GPR:$Rd),
@@ -3908,7 +4084,8 @@ multiclass mvn_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
"mvn\t$Rd, $Rm, $Imm6",
[(set ty:$Rd, (not (rotr ty:$Rm,
!cast<Operand>("lsl_operand_" # ty):$Imm6)))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteALU, ReadALU, ReadALU]>;
}
def _noshift : InstAlias<"mvn $Rn, $Rm",
@@ -3963,7 +4140,8 @@ multiclass A64I_movwSizes<bits<2> opc, string asmop, dag ins32bit,
def wii : A64I_movw<0b0, opc, (outs GPR32:$Rd), ins32bit,
!strconcat(asmop, "\t$Rd, $FullImm"),
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteALU]> {
bits<18> FullImm;
let UImm16 = FullImm{15-0};
let Shift = FullImm{17-16};
@@ -3971,7 +4149,8 @@ multiclass A64I_movwSizes<bits<2> opc, string asmop, dag ins32bit,
def xii : A64I_movw<0b1, opc, (outs GPR64:$Rd), ins64bit,
!strconcat(asmop, "\t$Rd, $FullImm"),
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteALU]> {
bits<18> FullImm;
let UImm16 = FullImm{15-0};
let Shift = FullImm{17-16};
@@ -3993,7 +4172,8 @@ let isMoveImm = 1, isReMaterializable = 1,
(ins movz64_imm:$FullImm)>;
}
-let Constraints = "$src = $Rd" in
+let Constraints = "$src = $Rd",
+ SchedRW = [WriteALU, ReadALU] in
defm MOVK : A64I_movwSizes<0b11, "movk",
(ins GPR32:$src, movk32_imm:$FullImm),
(ins GPR64:$src, movk64_imm:$FullImm)>;
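The MOVK change above uses "let SchedRW = [...] in" rather than appending the Sched<> mixin; the two spellings are interchangeable because Sched<> is only a thin wrapper that initializes the same SchedRW field, approximately as defined in TargetSchedule.td (quoted from memory, as a sketch):

    class Sched<list<SchedReadWrite> schedrw> {
      list<SchedReadWrite> SchedRW = schedrw;
    }

The "let ... in" form is the convenient one when a defm expands a multiclass into several defs that all want the same resource list.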
@@ -4079,10 +4259,12 @@ def adrp_label : Operand<i64> {
let hasSideEffects = 0 in {
def ADRxi : A64I_PCADR<0b0, (outs GPR64:$Rd), (ins adr_label:$Label),
- "adr\t$Rd, $Label", [], NoItinerary>;
+ "adr\t$Rd, $Label", [], NoItinerary>,
+ Sched<[WriteALUs]>;
def ADRPxi : A64I_PCADR<0b1, (outs GPR64:$Rd), (ins adrp_label:$Label),
- "adrp\t$Rd, $Label", [], NoItinerary>;
+ "adrp\t$Rd, $Label", [], NoItinerary>,
+ Sched<[WriteALUs]>;
}
//===----------------------------------------------------------------------===//
@@ -4368,14 +4550,16 @@ let isBranch = 1, isTerminator = 1 in {
"tbz\t$Rt, $Imm, $Label",
[(A64br_cc (A64cmp (and i64:$Rt, tstb64_pat:$Imm), 0),
A64eq, bb:$Label)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteBr]>;
def TBNZxii : A64I_TBimm<0b1, (outs),
(ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label),
"tbnz\t$Rt, $Imm, $Label",
[(A64br_cc (A64cmp (and i64:$Rt, tstb64_pat:$Imm), 0),
A64ne, bb:$Label)],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteBr]>;
// Note, these instructions overlap with the above 64-bit patterns. This is
@@ -4387,7 +4571,8 @@ let isBranch = 1, isTerminator = 1 in {
"tbz\t$Rt, $Imm, $Label",
[(A64br_cc (A64cmp (and i32:$Rt, tstb32_pat:$Imm), 0),
A64eq, bb:$Label)],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteBr]> {
let Imm{5} = 0b0;
}
@@ -4396,7 +4581,8 @@ let isBranch = 1, isTerminator = 1 in {
"tbnz\t$Rt, $Imm, $Label",
[(A64br_cc (A64cmp (and i32:$Rt, tstb32_pat:$Imm), 0),
A64ne, bb:$Label)],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteBr]> {
let Imm{5} = 0b0;
}
}
@@ -4431,7 +4617,8 @@ def blimm_target : Operand<i64> {
class A64I_BimmImpl<bit op, string asmop, list<dag> patterns, Operand lbl_type>
: A64I_Bimm<op, (outs), (ins lbl_type:$Label),
!strconcat(asmop, "\t$Label"), patterns,
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteBr]>;
let isBranch = 1 in {
def Bimm : A64I_BimmImpl<0b0, "b", [(br bb:$Label)], bimm_target> {
@@ -4439,10 +4626,12 @@ let isBranch = 1 in {
let isBarrier = 1;
}
- def BLimm : A64I_BimmImpl<0b1, "bl",
- [(AArch64Call tglobaladdr:$Label)], blimm_target> {
- let isCall = 1;
- let Defs = [X30];
+ let SchedRW = [WriteBrL] in {
+ def BLimm : A64I_BimmImpl<0b1, "bl",
+ [(AArch64Call tglobaladdr:$Label)], blimm_target> {
+ let isCall = 1;
+ let Defs = [X30];
+ }
}
}
@@ -4459,7 +4648,8 @@ class A64I_BregImpl<bits<4> opc,
dag outs, dag ins, string asmstr, list<dag> patterns,
InstrItinClass itin = NoItinerary>
: A64I_Breg<opc, 0b11111, 0b000000, 0b00000,
- outs, ins, asmstr, patterns, itin> {
+ outs, ins, asmstr, patterns, itin>,
+ Sched<[WriteBr]> {
let isBranch = 1;
let isIndirectBranch = 1;
}
@@ -4475,11 +4665,13 @@ let isBranch = 1 in {
let isTerminator = 1;
}
- def BLRx : A64I_BregImpl<0b0001, (outs), (ins GPR64:$Rn),
- "blr\t$Rn", [(AArch64Call i64:$Rn)]> {
- let isBarrier = 0;
- let isCall = 1;
- let Defs = [X30];
+ let SchedRW = [WriteBrL] in {
+ def BLRx : A64I_BregImpl<0b0001, (outs), (ins GPR64:$Rn),
+ "blr\t$Rn", [(AArch64Call i64:$Rn)]> {
+ let isBarrier = 0;
+ let isCall = 1;
+ let Defs = [X30];
+ }
}
def RETx : A64I_BregImpl<0b0010, (outs), (ins GPR64:$Rn),
@@ -4539,6 +4731,7 @@ def : ADRP_ADD<A64WrapperSmall, texternalsym>;
def : ADRP_ADD<A64WrapperSmall, tglobaladdr>;
def : ADRP_ADD<A64WrapperSmall, tglobaltlsaddr>;
def : ADRP_ADD<A64WrapperSmall, tjumptable>;
+def : ADRP_ADD<A64WrapperSmall, tconstpool>;
//===----------------------------------------------------------------------===//
// GOT access patterns
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
index fe73a05..0b97e3b 100644
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -14,9 +14,6 @@
//===----------------------------------------------------------------------===//
// NEON-specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def Neon_bsl : SDNode<"AArch64ISD::NEON_BSL", SDTypeProfile<1, 3,
- [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
- SDTCisSameAs<0, 3>]>>;
// (outs Result), (ins Imm, OpCmode)
def SDT_Neon_movi : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
@@ -67,10 +64,49 @@ def Neon_vextract : SDNode<"AArch64ISD::NEON_VEXTRACT", SDTypeProfile<1, 3,
[SDTCisVec<0>, SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>, SDTCisVT<3, i64>]>>;
-def SDT_assertext : SDTypeProfile<1, 1,
- [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 0>]>;
-def assertsext : SDNode<"ISD::AssertSext", SDT_assertext>;
-def assertzext : SDNode<"ISD::AssertZext", SDT_assertext>;
+//===----------------------------------------------------------------------===//
+// Addressing-mode instantiations
+//===----------------------------------------------------------------------===//
+
+multiclass ls_64_pats<dag address, dag Base, dag Offset, ValueType Ty> {
+defm : ls_neutral_pats<LSFP64_LDR, LSFP64_STR, Base,
+ !foreach(decls.pattern, Offset,
+ !subst(OFFSET, dword_uimm12, decls.pattern)),
+ !foreach(decls.pattern, address,
+ !subst(OFFSET, dword_uimm12,
+ !subst(ALIGN, min_align8, decls.pattern))),
+ Ty>;
+}
+
+multiclass ls_128_pats<dag address, dag Base, dag Offset, ValueType Ty> {
+defm : ls_neutral_pats<LSFP128_LDR, LSFP128_STR, Base,
+ !foreach(decls.pattern, Offset,
+ !subst(OFFSET, qword_uimm12, decls.pattern)),
+ !foreach(decls.pattern, address,
+ !subst(OFFSET, qword_uimm12,
+ !subst(ALIGN, min_align16, decls.pattern))),
+ Ty>;
+}
+
+multiclass uimm12_neon_pats<dag address, dag Base, dag Offset> {
+ defm : ls_64_pats<address, Base, Offset, v8i8>;
+ defm : ls_64_pats<address, Base, Offset, v4i16>;
+ defm : ls_64_pats<address, Base, Offset, v2i32>;
+ defm : ls_64_pats<address, Base, Offset, v1i64>;
+ defm : ls_64_pats<address, Base, Offset, v2f32>;
+ defm : ls_64_pats<address, Base, Offset, v1f64>;
+
+ defm : ls_128_pats<address, Base, Offset, v16i8>;
+ defm : ls_128_pats<address, Base, Offset, v8i16>;
+ defm : ls_128_pats<address, Base, Offset, v4i32>;
+ defm : ls_128_pats<address, Base, Offset, v2i64>;
+ defm : ls_128_pats<address, Base, Offset, v4f32>;
+ defm : ls_128_pats<address, Base, Offset, v2f64>;
+}
+
+defm : uimm12_neon_pats<(A64WrapperSmall
+ tconstpool:$Hi, tconstpool:$Lo12, ALIGN),
+ (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>;
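The !foreach/!subst calls above stamp out the generic ls_neutral_pats patterns (defined elsewhere in the backend, not shown in this hunk) once per element width. For the constant-pool address being instantiated here the only placeholder present is ALIGN, so the expansion is roughly:

    // ls_64_pats:  OFFSET -> dword_uimm12, ALIGN -> min_align8
    //   address becomes (A64WrapperSmall tconstpool:$Hi, tconstpool:$Lo12, min_align8)
    // ls_128_pats: OFFSET -> qword_uimm12, ALIGN -> min_align16
    //   address becomes (A64WrapperSmall tconstpool:$Hi, tconstpool:$Lo12, min_align16)
    // ls_neutral_pats then emits LSFP64_LDR/LSFP64_STR (respectively LSFP128_*)
    // selection patterns against that address form for each listed vector type.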
//===----------------------------------------------------------------------===//
// Multiclasses
@@ -86,14 +122,16 @@ multiclass NeonI_3VSame_B_sizes<bit u, bits<2> size, bits<5> opcode,
asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b",
[(set (v8i8 VPR64:$Rd),
(v8i8 (opnode8B (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def _16B : NeonI_3VSame<0b1, u, size, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b",
[(set (v16i8 VPR128:$Rd),
(v16i8 (opnode16B (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
}
}
@@ -107,28 +145,32 @@ multiclass NeonI_3VSame_HS_sizes<bit u, bits<5> opcode,
asmop # "\t$Rd.4h, $Rn.4h, $Rm.4h",
[(set (v4i16 VPR64:$Rd),
(v4i16 (opnode (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def _8H : NeonI_3VSame<0b1, u, 0b01, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
asmop # "\t$Rd.8h, $Rn.8h, $Rm.8h",
[(set (v8i16 VPR128:$Rd),
(v8i16 (opnode (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def _2S : NeonI_3VSame<0b0, u, 0b10, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s",
[(set (v2i32 VPR64:$Rd),
(v2i32 (opnode (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def _4S : NeonI_3VSame<0b1, u, 0b10, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
[(set (v4i32 VPR128:$Rd),
(v4i32 (opnode (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
}
}
multiclass NeonI_3VSame_BHS_sizes<bit u, bits<5> opcode,
@@ -141,14 +183,16 @@ multiclass NeonI_3VSame_BHS_sizes<bit u, bits<5> opcode,
asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b",
[(set (v8i8 VPR64:$Rd),
(v8i8 (opnode (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def _16B : NeonI_3VSame<0b1, u, 0b00, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b",
[(set (v16i8 VPR128:$Rd),
(v16i8 (opnode (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
}
}
@@ -162,16 +206,15 @@ multiclass NeonI_3VSame_BHSD_sizes<bit u, bits<5> opcode,
asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d",
[(set (v2i64 VPR128:$Rd),
(v2i64 (opnode (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
}
}
// Multiclass NeonI_3VSame_SD_sizes: Operand types are floating point types,
// but Result types can be integer or floating point types.
multiclass NeonI_3VSame_SD_sizes<bit u, bit size, bits<5> opcode,
- string asmop, SDPatternOperator opnode2S,
- SDPatternOperator opnode4S,
- SDPatternOperator opnode2D,
+ string asmop, SDPatternOperator opnode,
ValueType ResTy2S, ValueType ResTy4S,
ValueType ResTy2D, bit Commutable = 0> {
let isCommutable = Commutable in {
@@ -179,22 +222,25 @@ multiclass NeonI_3VSame_SD_sizes<bit u, bit size, bits<5> opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s",
[(set (ResTy2S VPR64:$Rd),
- (ResTy2S (opnode2S (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))))],
- NoItinerary>;
+ (ResTy2S (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))))],
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def _4S : NeonI_3VSame<0b1, u, {size, 0b0}, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
[(set (ResTy4S VPR128:$Rd),
- (ResTy4S (opnode4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))))],
- NoItinerary>;
+ (ResTy4S (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))))],
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def _2D : NeonI_3VSame<0b1, u, {size, 0b1}, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d",
[(set (ResTy2D VPR128:$Rd),
- (ResTy2D (opnode2D (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))))],
- NoItinerary>;
+ (ResTy2D (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))))],
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
}
}
@@ -207,25 +253,80 @@ multiclass NeonI_3VSame_SD_sizes<bit u, bit size, bits<5> opcode,
// Vector Add (Integer and Floating-Point)
defm ADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b10000, "add", add, 1>;
-defm FADDvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11010, "fadd", fadd, fadd, fadd,
+defm FADDvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11010, "fadd", fadd,
v2f32, v4f32, v2f64, 1>;
+// Patterns to match add of v1i8/v1i16/v1i32 types
+def : Pat<(v1i8 (add FPR8:$Rn, FPR8:$Rm)),
+ (EXTRACT_SUBREG
+ (ADDvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
+ (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)),
+ sub_8)>;
+def : Pat<(v1i16 (add FPR16:$Rn, FPR16:$Rm)),
+ (EXTRACT_SUBREG
+ (ADDvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
+ (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)),
+ sub_16)>;
+def : Pat<(v1i32 (add FPR32:$Rn, FPR32:$Rm)),
+ (EXTRACT_SUBREG
+ (ADDvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+ (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
+ sub_32)>;
+
// Vector Sub (Integer and Floating-Point)
defm SUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10000, "sub", sub, 0>;
-defm FSUBvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11010, "fsub", fsub, fsub, fsub,
+defm FSUBvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11010, "fsub", fsub,
v2f32, v4f32, v2f64, 0>;
+// Patterns to match sub of v1i8/v1i16/v1i32 types
+def : Pat<(v1i8 (sub FPR8:$Rn, FPR8:$Rm)),
+ (EXTRACT_SUBREG
+ (SUBvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
+ (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)),
+ sub_8)>;
+def : Pat<(v1i16 (sub FPR16:$Rn, FPR16:$Rm)),
+ (EXTRACT_SUBREG
+ (SUBvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
+ (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)),
+ sub_16)>;
+def : Pat<(v1i32 (sub FPR32:$Rn, FPR32:$Rm)),
+ (EXTRACT_SUBREG
+ (SUBvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+ (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
+ sub_32)>;
+
// Vector Multiply (Integer and Floating-Point)
+let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
defm MULvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10011, "mul", mul, 1>;
-defm FMULvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11011, "fmul", fmul, fmul, fmul,
+defm FMULvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11011, "fmul", fmul,
v2f32, v4f32, v2f64, 1>;
+}
+
+// Patterns to match mul of v1i8/v1i16/v1i32 types
+def : Pat<(v1i8 (mul FPR8:$Rn, FPR8:$Rm)),
+ (EXTRACT_SUBREG
+ (MULvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
+ (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)),
+ sub_8)>;
+def : Pat<(v1i16 (mul FPR16:$Rn, FPR16:$Rm)),
+ (EXTRACT_SUBREG
+ (MULvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
+ (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)),
+ sub_16)>;
+def : Pat<(v1i32 (mul FPR32:$Rn, FPR32:$Rm)),
+ (EXTRACT_SUBREG
+ (MULvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+ (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
+ sub_32)>;
// Vector Multiply (Polynomial)
+let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul",
int_arm_neon_vmulp, int_arm_neon_vmulp, 1>;
+}
// Vector Multiply-accumulate and Multiply-subtract (Integer)
@@ -239,7 +340,8 @@ class NeonI_3VSame_Constraint_impl<string asmop, string asmlane,
asmop # "\t$Rd" # asmlane # ", $Rn" # asmlane # ", $Rm" # asmlane,
[(set (OpTy VPRC:$Rd),
(OpTy (opnode (OpTy VPRC:$src), (OpTy VPRC:$Rn), (OpTy VPRC:$Rm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
}
@@ -250,6 +352,7 @@ def Neon_mls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
(sub node:$Ra, (mul node:$Rn, node:$Rm))>;
+let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC] in {
def MLAvvv_8B: NeonI_3VSame_Constraint_impl<"mla", ".8b", VPR64, v8i8,
0b0, 0b0, 0b00, 0b10010, Neon_mla>;
def MLAvvv_16B: NeonI_3VSame_Constraint_impl<"mla", ".16b", VPR128, v16i8,
@@ -275,16 +378,18 @@ def MLSvvv_2S: NeonI_3VSame_Constraint_impl<"mls", ".2s", VPR64, v2i32,
0b0, 0b1, 0b10, 0b10010, Neon_mls>;
def MLSvvv_4S: NeonI_3VSame_Constraint_impl<"mls", ".4s", VPR128, v4i32,
0b1, 0b1, 0b10, 0b10010, Neon_mls>;
+}
// Vector Multiply-accumulate and Multiply-subtract (Floating Point)
def Neon_fmla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
- (fadd node:$Ra, (fmul node:$Rn, node:$Rm))>;
+ (fadd node:$Ra, (fmul_su node:$Rn, node:$Rm))>;
def Neon_fmls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
- (fsub node:$Ra, (fmul node:$Rn, node:$Rm))>;
+ (fsub node:$Ra, (fmul_su node:$Rn, node:$Rm))>;
-let Predicates = [HasNEON, UseFusedMAC] in {
+let Predicates = [HasNEON, UseFusedMAC],
+ SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC] in {
def FMLAvvv_2S: NeonI_3VSame_Constraint_impl<"fmla", ".2s", VPR64, v2f32,
0b0, 0b0, 0b00, 0b11001, Neon_fmla>;
def FMLAvvv_4S: NeonI_3VSame_Constraint_impl<"fmla", ".4s", VPR128, v4f32,
@@ -318,8 +423,10 @@ def : Pat<(v2f64 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)),
// Vector Divide (Floating-Point)
-defm FDIVvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11111, "fdiv", fdiv, fdiv, fdiv,
+let SchedRW = [WriteFPDiv, ReadFPDiv, ReadFPDiv] in {
+defm FDIVvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11111, "fdiv", fdiv,
v2f32, v4f32, v2f64, 0>;
+}
// Vector Bitwise Operations
@@ -407,26 +514,38 @@ defm : Neon_bitwise2V_patterns<Neon_orn8B, Neon_orn16B, ORNvvv_8B, ORNvvv_16B>;
// Vector Bitwise Select
def BSLvvv_8B : NeonI_3VSame_Constraint_impl<"bsl", ".8b", VPR64, v8i8,
- 0b0, 0b1, 0b01, 0b00011, Neon_bsl>;
+ 0b0, 0b1, 0b01, 0b00011, vselect>;
def BSLvvv_16B : NeonI_3VSame_Constraint_impl<"bsl", ".16b", VPR128, v16i8,
- 0b1, 0b1, 0b01, 0b00011, Neon_bsl>;
+ 0b1, 0b1, 0b01, 0b00011, vselect>;
multiclass Neon_bitwise3V_patterns<SDPatternOperator opnode,
Instruction INST8B,
Instruction INST16B> {
// Disassociate type from instruction definition
- def : Pat<(v2i32 (opnode VPR64:$src,VPR64:$Rn, VPR64:$Rm)),
+ def : Pat<(v8i8 (opnode (v8i8 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v2i32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v2f32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
(INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v4i16 (opnode VPR64:$src, VPR64:$Rn, VPR64:$Rm)),
+ def : Pat<(v4i16 (opnode (v4i16 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
(INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v1i64 (opnode VPR64:$src, VPR64:$Rn, VPR64:$Rm)),
+ def : Pat<(v1i64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
(INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
- def : Pat<(v4i32 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)),
+ def : Pat<(v1f64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v16i8 (opnode (v16i8 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
+ (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v4i32 (opnode (v4i32 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
+ (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v8i16 (opnode (v8i16 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
+ (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v2i64 (opnode (v2i64 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
(INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v8i16 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)),
+ def : Pat<(v2f64 (opnode (v2i64 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
(INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
- def : Pat<(v2i64 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)),
+ def : Pat<(v4f32 (opnode (v4i32 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
(INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
// Allow to match BSL instruction pattern with non-constant operand
@@ -495,10 +614,10 @@ multiclass Neon_bitwise3V_patterns<SDPatternOperator opnode,
}
// Additional patterns for bitwise instruction BSL
-defm: Neon_bitwise3V_patterns<Neon_bsl, BSLvvv_8B, BSLvvv_16B>;
+defm: Neon_bitwise3V_patterns<vselect, BSLvvv_8B, BSLvvv_16B>;
def Neon_NoBSLop : PatFrag<(ops node:$src, node:$Rn, node:$Rm),
- (Neon_bsl node:$src, node:$Rn, node:$Rm),
+ (vselect node:$src, node:$Rn, node:$Rm),
[{ (void)N; return false; }]>;
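Since the BSL patterns are now keyed on the generic vselect node, the operand roles are worth spelling out; this summary follows the BSL description in the ARMv8 ISA rather than anything stated in the patch itself:

    // vselect $src, $Rn, $Rm picks each element from $Rn where the mask element
    // in $src is set and from $Rm where it is clear.  BSL computes
    //   Rd = (Rd & Rn) | (~Rd & Rm)
    // bitwise, which agrees with the element-wise select whenever the mask
    // elements are all-ones or all-zeros (as the vector compares produce).  The
    // "$src = $Rd" constraint in NeonI_3VSame_Constraint_impl is what places the
    // mask in Rd before the instruction executes.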
// Vector Bitwise Insert if True
@@ -557,20 +676,16 @@ defm SABDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01110, "sabd", int_arm_neon_vabds,
// Vector Absolute Difference (Floating Point)
defm FABDvvv: NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11010, "fabd",
- int_arm_neon_vabds, int_arm_neon_vabds,
int_arm_neon_vabds, v2f32, v4f32, v2f64, 0>;
// Vector Reciprocal Step (Floating Point)
defm FRECPSvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11111, "frecps",
- int_arm_neon_vrecps, int_arm_neon_vrecps,
int_arm_neon_vrecps,
v2f32, v4f32, v2f64, 0>;
// Vector Reciprocal Square Root Step (Floating Point)
defm FRSQRTSvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11111, "frsqrts",
int_arm_neon_vrsqrts,
- int_arm_neon_vrsqrts,
- int_arm_neon_vrsqrts,
v2f32, v4f32, v2f64, 0>;
// Vector Comparisons
@@ -677,49 +792,56 @@ multiclass NeonI_cmpz_sizes<bit u, bits<5> opcode, string asmop, CondCode CC>
asmop # "\t$Rd.8b, $Rn.8b, $Imm",
[(set (v8i8 VPR64:$Rd),
(v8i8 (Neon_cmpz (v8i8 VPR64:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _16B : NeonI_2VMisc<0b1, u, 0b00, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
asmop # "\t$Rd.16b, $Rn.16b, $Imm",
[(set (v16i8 VPR128:$Rd),
(v16i8 (Neon_cmpz (v16i8 VPR128:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _4H : NeonI_2VMisc<0b0, u, 0b01, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
asmop # "\t$Rd.4h, $Rn.4h, $Imm",
[(set (v4i16 VPR64:$Rd),
(v4i16 (Neon_cmpz (v4i16 VPR64:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _8H : NeonI_2VMisc<0b1, u, 0b01, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
asmop # "\t$Rd.8h, $Rn.8h, $Imm",
[(set (v8i16 VPR128:$Rd),
(v8i16 (Neon_cmpz (v8i16 VPR128:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _2S : NeonI_2VMisc<0b0, u, 0b10, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
asmop # "\t$Rd.2s, $Rn.2s, $Imm",
[(set (v2i32 VPR64:$Rd),
(v2i32 (Neon_cmpz (v2i32 VPR64:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _4S : NeonI_2VMisc<0b1, u, 0b10, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
asmop # "\t$Rd.4s, $Rn.4s, $Imm",
[(set (v4i32 VPR128:$Rd),
(v4i32 (Neon_cmpz (v4i32 VPR128:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _2D : NeonI_2VMisc<0b1, u, 0b11, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
asmop # "\t$Rd.2d, $Rn.2d, $Imm",
[(set (v2i64 VPR128:$Rd),
(v2i64 (Neon_cmpz (v2i64 VPR128:$Rn), (i32 imm:$Imm), CC)))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
// Vector Compare Mask Equal to Zero (Integer)
@@ -742,18 +864,15 @@ defm CMLTvvi : NeonI_cmpz_sizes<0b0, 0b01010, "cmlt", SETLT>;
// Vector Compare Mask Equal (Floating Point)
let isCommutable =1 in {
defm FCMEQvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11100, "fcmeq", Neon_cmeq,
- Neon_cmeq, Neon_cmeq,
v2i32, v4i32, v2i64, 0>;
}
// Vector Compare Mask Greater Than Or Equal (Floating Point)
defm FCMGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11100, "fcmge", Neon_cmge,
- Neon_cmge, Neon_cmge,
v2i32, v4i32, v2i64, 0>;
// Vector Compare Mask Greater Than (Floating Point)
defm FCMGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11100, "fcmgt", Neon_cmgt,
- Neon_cmgt, Neon_cmgt,
v2i32, v4i32, v2i64, 0>;
// Vector Compare Mask Less Than Or Equal (Floating Point)
@@ -768,30 +887,45 @@ def FCMLTvvv_2S : NeonI_compare_aliases<"fcmlt", ".2s", FCMGTvvv_2S, VPR64>;
def FCMLTvvv_4S : NeonI_compare_aliases<"fcmlt", ".4s", FCMGTvvv_4S, VPR128>;
def FCMLTvvv_2D : NeonI_compare_aliases<"fcmlt", ".2d", FCMGTvvv_2D, VPR128>;
+def fpzero_izero_asmoperand : AsmOperandClass {
+ let Name = "FPZeroIZero";
+ let ParserMethod = "ParseFPImm0AndImm0Operand";
+ let DiagnosticType = "FPZero";
+}
+
+def fpzz32 : Operand<f32>,
+ ComplexPattern<f32, 1, "SelectFPZeroOperand", [fpimm]> {
+ let ParserMatchClass = fpzero_izero_asmoperand;
+ let PrintMethod = "printFPZeroOperand";
+ let DecoderMethod = "DecodeFPZeroOperand";
+}
multiclass NeonI_fpcmpz_sizes<bit u, bit size, bits<5> opcode,
string asmop, CondCode CC>
{
def _2S : NeonI_2VMisc<0b0, u, {size, 0b0}, opcode,
- (outs VPR64:$Rd), (ins VPR64:$Rn, fpz32:$FPImm),
+ (outs VPR64:$Rd), (ins VPR64:$Rn, fpzz32:$FPImm),
asmop # "\t$Rd.2s, $Rn.2s, $FPImm",
[(set (v2i32 VPR64:$Rd),
- (v2i32 (Neon_cmpz (v2f32 VPR64:$Rn), (f32 fpimm:$FPImm), CC)))],
- NoItinerary>;
+ (v2i32 (Neon_cmpz (v2f32 VPR64:$Rn), (f32 fpzz32:$FPImm), CC)))],
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _4S : NeonI_2VMisc<0b1, u, {size, 0b0}, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, fpz32:$FPImm),
+ (outs VPR128:$Rd), (ins VPR128:$Rn, fpzz32:$FPImm),
asmop # "\t$Rd.4s, $Rn.4s, $FPImm",
[(set (v4i32 VPR128:$Rd),
- (v4i32 (Neon_cmpz (v4f32 VPR128:$Rn), (f32 fpimm:$FPImm), CC)))],
- NoItinerary>;
+ (v4i32 (Neon_cmpz (v4f32 VPR128:$Rn), (f32 fpzz32:$FPImm), CC)))],
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _2D : NeonI_2VMisc<0b1, u, {size, 0b1}, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, fpz32:$FPImm),
+ (outs VPR128:$Rd), (ins VPR128:$Rn, fpzz32:$FPImm),
asmop # "\t$Rd.2d, $Rn.2d, $FPImm",
[(set (v2i64 VPR128:$Rd),
- (v2i64 (Neon_cmpz (v2f64 VPR128:$Rn), (f32 fpimm:$FPImm), CC)))],
- NoItinerary>;
+ (v2i64 (Neon_cmpz (v2f64 VPR128:$Rn), (f32 fpzz32:$FPImm), CC)))],
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
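The fpzz32 operand introduced above pairs a new asm-operand class (FPZeroIZero, parsed by ParseFPImm0AndImm0Operand) with the existing zero-selection complex pattern; judging by those names, the intent appears to be accepting either spelling of the zero comparand in assembly. A hedged illustration as comments, since the exact accepted syntax is not shown in this hunk:

    // With fpz32 the FP literal form was the accepted spelling:
    //   fcmeq v0.2s, v1.2s, #0.0
    // With fpzz32 the integer spelling should be accepted as well:
    //   fcmeq v0.2s, v1.2s, #0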
// Vector Compare Mask Equal to Zero (Floating Point)
@@ -813,14 +947,12 @@ defm FCMLTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01110, "fcmlt", SETLT>;
// Vector Absolute Compare Mask Greater Than Or Equal (Floating Point)
defm FACGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11101, "facge",
- int_arm_neon_vacged, int_arm_neon_vacgeq,
- int_aarch64_neon_vacgeq,
+ int_arm_neon_vacge,
v2i32, v4i32, v2i64, 0>;
// Vector Absolute Compare Mask Greater Than (Floating Point)
defm FACGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11101, "facgt",
- int_arm_neon_vacgtd, int_arm_neon_vacgtq,
- int_aarch64_neon_vacgtq,
+ int_arm_neon_vacgt,
v2i32, v4i32, v2i64, 0>;
// Vector Absolute Compare Mask Less Than Or Equal (Floating Point)
@@ -899,26 +1031,22 @@ defm UMINvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01101, "umin", int_arm_neon_vminu,
// Vector Maximum (Floating Point)
defm FMAXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11110, "fmax",
- int_arm_neon_vmaxs, int_arm_neon_vmaxs,
- int_arm_neon_vmaxs, v2f32, v4f32, v2f64, 1>;
+ int_arm_neon_vmaxs,
+ v2f32, v4f32, v2f64, 1>;
// Vector Minimum (Floating Point)
defm FMINvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11110, "fmin",
- int_arm_neon_vmins, int_arm_neon_vmins,
- int_arm_neon_vmins, v2f32, v4f32, v2f64, 1>;
+ int_arm_neon_vmins,
+ v2f32, v4f32, v2f64, 1>;
// Vector maxNum (Floating Point) - prefer a number over a quiet NaN)
defm FMAXNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11000, "fmaxnm",
int_aarch64_neon_vmaxnm,
- int_aarch64_neon_vmaxnm,
- int_aarch64_neon_vmaxnm,
v2f32, v4f32, v2f64, 1>;
// Vector minNum (Floating Point) - prefer a number over a quiet NaN)
defm FMINNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11000, "fminnm",
int_aarch64_neon_vminnm,
- int_aarch64_neon_vminnm,
- int_aarch64_neon_vminnm,
v2f32, v4f32, v2f64, 1>;
// Vector Maximum Pairwise (Signed and Unsigned Integer)
@@ -931,26 +1059,20 @@ defm UMINPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10101, "uminp", int_arm_neon_vpmin
// Vector Maximum Pairwise (Floating Point)
defm FMAXPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11110, "fmaxp",
- int_arm_neon_vpmaxs, int_arm_neon_vpmaxs,
int_arm_neon_vpmaxs, v2f32, v4f32, v2f64, 1>;
// Vector Minimum Pairwise (Floating Point)
defm FMINPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11110, "fminp",
- int_arm_neon_vpmins, int_arm_neon_vpmins,
int_arm_neon_vpmins, v2f32, v4f32, v2f64, 1>;
// Vector maxNum Pairwise (Floating Point) - prefer a number over a quiet NaN)
defm FMAXNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11000, "fmaxnmp",
int_aarch64_neon_vpmaxnm,
- int_aarch64_neon_vpmaxnm,
- int_aarch64_neon_vpmaxnm,
v2f32, v4f32, v2f64, 1>;
// Vector minNum Pairwise (Floating Point) - prefer a number over a quiet NaN)
defm FMINNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11000, "fminnmp",
int_aarch64_neon_vpminnm,
- int_aarch64_neon_vpminnm,
- int_aarch64_neon_vpminnm,
v2f32, v4f32, v2f64, 1>;
// Vector Addition Pairwise (Integer)
@@ -959,10 +1081,9 @@ defm ADDP : NeonI_3VSame_BHSD_sizes<0b0, 0b10111, "addp", int_arm_neon_vpadd, 1>
// Vector Addition Pairwise (Floating Point)
defm FADDP : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11010, "faddp",
int_arm_neon_vpadd,
- int_arm_neon_vpadd,
- int_arm_neon_vpadd,
v2f32, v4f32, v2f64, 1>;
+let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
// Vector Saturating Doubling Multiply High
defm SQDMULHvvv : NeonI_3VSame_HS_sizes<0b0, 0b10110, "sqdmulh",
int_arm_neon_vqdmulh, 1>;
@@ -974,9 +1095,22 @@ defm SQRDMULHvvv : NeonI_3VSame_HS_sizes<0b1, 0b10110, "sqrdmulh",
// Vector Multiply Extended (Floating Point)
defm FMULXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11011, "fmulx",
int_aarch64_neon_vmulx,
- int_aarch64_neon_vmulx,
- int_aarch64_neon_vmulx,
v2f32, v4f32, v2f64, 1>;
+}
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// ADDP, SMINP, UMINP, SMAXP, UMAXP having i32 as output
+class Neon_VectorPair_v2i32_pattern<SDPatternOperator opnode, Instruction INST>
+ : Pat<(v1i32 (opnode (v2i32 VPR64:$Rn))),
+ (EXTRACT_SUBREG
+ (v2i32 (INST (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rn))),
+ sub_32)>;
+
+def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_sminv, SMINPvvv_2S>;
+def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_uminv, UMINPvvv_2S>;
+def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_smaxv, SMAXPvvv_2S>;
+def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_umaxv, UMAXPvvv_2S>;
+def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_vaddv, ADDP_2S>;
// Vector Immediate Instructions
@@ -1102,7 +1236,8 @@ multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
[(set (v2i32 VPR64:$Rd),
(v2i32 (opnode (timm:$Imm),
(neon_mov_imm_LSL_operand:$Simm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU]> {
bits<2> Simm;
let cmode = {0b0, Simm{1}, Simm{0}, 0b0};
}
@@ -1115,7 +1250,8 @@ multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
[(set (v4i32 VPR128:$Rd),
(v4i32 (opnode (timm:$Imm),
(neon_mov_imm_LSL_operand:$Simm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU]> {
bits<2> Simm;
let cmode = {0b0, Simm{1}, Simm{0}, 0b0};
}
@@ -1129,7 +1265,8 @@ multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
[(set (v4i16 VPR64:$Rd),
(v4i16 (opnode (timm:$Imm),
(neon_mov_imm_LSLH_operand:$Simm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU]> {
bit Simm;
let cmode = {0b1, 0b0, Simm, 0b0};
}
@@ -1142,7 +1279,8 @@ multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
[(set (v8i16 VPR128:$Rd),
(v8i16 (opnode (timm:$Imm),
(neon_mov_imm_LSLH_operand:$Simm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU]> {
bit Simm;
let cmode = {0b1, 0b0, Simm, 0b0};
}
@@ -1161,9 +1299,10 @@ multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
!strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
[(set (v2i32 VPR64:$Rd),
(v2i32 (opnode (v2i32 VPR64:$src),
- (v2i32 (bitconvert (v2i32 (neonopnode timm:$Imm,
- neon_mov_imm_LSL_operand:$Simm)))))))],
- NoItinerary> {
+ (v2i32 (neonopnode timm:$Imm,
+ neon_mov_imm_LSL_operand:$Simm)))))],
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]> {
bits<2> Simm;
let cmode = {0b0, Simm{1}, Simm{0}, 0b1};
}
@@ -1175,9 +1314,10 @@ multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
!strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
[(set (v4i32 VPR128:$Rd),
(v4i32 (opnode (v4i32 VPR128:$src),
- (v4i32 (bitconvert (v4i32 (neonopnode timm:$Imm,
- neon_mov_imm_LSL_operand:$Simm)))))))],
- NoItinerary> {
+ (v4i32 (neonopnode timm:$Imm,
+ neon_mov_imm_LSL_operand:$Simm)))))],
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]> {
bits<2> Simm;
let cmode = {0b0, Simm{1}, Simm{0}, 0b1};
}
@@ -1190,9 +1330,10 @@ multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
!strconcat(asmop, "\t$Rd.4h, $Imm$Simm"),
[(set (v4i16 VPR64:$Rd),
(v4i16 (opnode (v4i16 VPR64:$src),
- (v4i16 (bitconvert (v4i16 (neonopnode timm:$Imm,
- neon_mov_imm_LSL_operand:$Simm)))))))],
- NoItinerary> {
+ (v4i16 (neonopnode timm:$Imm,
+ neon_mov_imm_LSL_operand:$Simm)))))],
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]> {
bit Simm;
let cmode = {0b1, 0b0, Simm, 0b1};
}
@@ -1204,9 +1345,10 @@ multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
!strconcat(asmop, "\t$Rd.8h, $Imm$Simm"),
[(set (v8i16 VPR128:$Rd),
(v8i16 (opnode (v8i16 VPR128:$src),
- (v8i16 (bitconvert (v8i16 (neonopnode timm:$Imm,
- neon_mov_imm_LSL_operand:$Simm)))))))],
- NoItinerary> {
+ (v8i16 (neonopnode timm:$Imm,
+ neon_mov_imm_LSL_operand:$Simm)))))],
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]> {
bit Simm;
let cmode = {0b1, 0b0, Simm, 0b1};
}
@@ -1225,7 +1367,8 @@ multiclass NeonI_mov_imm_msl_sizes<string asmop, bit op,
[(set (v2i32 VPR64:$Rd),
(v2i32 (opnode (timm:$Imm),
(neon_mov_imm_MSL_operand:$Simm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU]> {
bit Simm;
let cmode = {0b1, 0b1, 0b0, Simm};
}
@@ -1238,7 +1381,8 @@ multiclass NeonI_mov_imm_msl_sizes<string asmop, bit op,
[(set (v4i32 VPR128:$Rd),
(v4i32 (opnode (timm:$Imm),
(neon_mov_imm_MSL_operand:$Simm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU]> {
bit Simm;
let cmode = {0b1, 0b1, 0b0, Simm};
}
@@ -1291,30 +1435,70 @@ def neon_mov_imm_LSLH_transform_operand
return (HasShift && !ShiftOnesIn); }],
neon_mov_imm_LSLH_transform_XFORM>;
-// Transform (and A, (4h Neon_movi 0xff)) -> BIC 4h (A, 0x00, LSL 8)
-// Transform (and A, (4h Neon_movi 0xff LSL #8)) -> BIC 4h (A, 0x00)
+// Transform (and A, (4h Neon_movi 0xff)) -> BIC 4h (A, 0xff, LSL 8)
+// Transform (and A, (4h Neon_movi 0xff LSL #8)) -> BIC 4h (A, 0xff)
def : Pat<(v4i16 (and VPR64:$src,
- (v4i16 (Neon_movi 255, neon_mov_imm_LSLH_transform_operand:$Simm)))),
- (BICvi_lsl_4H VPR64:$src, 0,
+ (v4i16 (Neon_movi 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm)))),
+ (BICvi_lsl_4H VPR64:$src, 255,
neon_mov_imm_LSLH_transform_operand:$Simm)>;
-// Transform (and A, (8h Neon_movi 8h 0xff)) -> BIC 8h (A, 0x00, LSL 8)
-// Transform (and A, (8h Neon_movi 0xff LSL #8)) -> BIC 8h (A, 0x00)
+// Transform (and A, (8h Neon_movi 8h 0xff)) -> BIC 8h (A, 0xff, LSL 8)
+// Transform (and A, (8h Neon_movi 0xff LSL #8)) -> BIC 8h (A, 0xff)
def : Pat<(v8i16 (and VPR128:$src,
- (v8i16 (Neon_movi 255, neon_mov_imm_LSLH_transform_operand:$Simm)))),
- (BICvi_lsl_8H VPR128:$src, 0,
+ (v8i16 (Neon_movi 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm)))),
+ (BICvi_lsl_8H VPR128:$src, 255,
neon_mov_imm_LSLH_transform_operand:$Simm)>;
+def : Pat<(v8i8 (and VPR64:$src,
+ (bitconvert(v4i16 (Neon_movi 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm))))),
+ (BICvi_lsl_4H VPR64:$src, 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm)>;
+def : Pat<(v2i32 (and VPR64:$src,
+ (bitconvert(v4i16 (Neon_movi 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm))))),
+ (BICvi_lsl_4H VPR64:$src, 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm)>;
+def : Pat<(v1i64 (and VPR64:$src,
+ (bitconvert(v4i16 (Neon_movi 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm))))),
+ (BICvi_lsl_4H VPR64:$src, 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm)>;
+
+def : Pat<(v16i8 (and VPR128:$src,
+ (bitconvert(v8i16 (Neon_movi 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm))))),
+ (BICvi_lsl_8H VPR128:$src, 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm)>;
+def : Pat<(v4i32 (and VPR128:$src,
+ (bitconvert(v8i16 (Neon_movi 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm))))),
+ (BICvi_lsl_8H VPR128:$src, 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm)>;
+def : Pat<(v2i64 (and VPR128:$src,
+ (bitconvert(v8i16 (Neon_movi 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm))))),
+ (BICvi_lsl_8H VPR128:$src, 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm)>;
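The corrected transforms keep the byte mask 0xff and move the shift to the other byte of each halfword; a worked example of the equivalence, as comments (register names are illustrative; the previous revision passed immediate 0 to BICvi_lsl_*, which would have cleared no bits at all):

    //   and vD.4h, vA.4h, <movi #0xff>           keeps the low byte of each
    //                                             halfword, i.e. clears 0xff00:
    //     ==>  bic vD.4h, #0xff, lsl #8           (with vA already in vD)
    //   and vD.4h, vA.4h, <movi #0xff, lsl #8>    keeps the high byte, clears 0x00ff:
    //     ==>  bic vD.4h, #0xff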
multiclass Neon_bitwiseVi_patterns<SDPatternOperator opnode,
SDPatternOperator neonopnode,
Instruction INST4H,
- Instruction INST8H> {
+ Instruction INST8H,
+ Instruction INST2S,
+ Instruction INST4S> {
def : Pat<(v8i8 (opnode VPR64:$src,
(bitconvert(v4i16 (neonopnode timm:$Imm,
neon_mov_imm_LSLH_operand:$Simm))))),
(INST4H VPR64:$src, neon_uimm8:$Imm,
neon_mov_imm_LSLH_operand:$Simm)>;
+ def : Pat<(v2i32 (opnode VPR64:$src,
+ (bitconvert(v4i16 (neonopnode timm:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm))))),
+ (INST4H VPR64:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm)>;
def : Pat<(v1i64 (opnode VPR64:$src,
(bitconvert(v4i16 (neonopnode timm:$Imm,
neon_mov_imm_LSLH_operand:$Simm))))),
@@ -1336,13 +1520,47 @@ multiclass Neon_bitwiseVi_patterns<SDPatternOperator opnode,
neon_mov_imm_LSLH_operand:$Simm))))),
(INST8H VPR128:$src, neon_uimm8:$Imm,
neon_mov_imm_LSLH_operand:$Simm)>;
+
+ def : Pat<(v8i8 (opnode VPR64:$src,
+ (bitconvert(v2i32 (neonopnode timm:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm))))),
+ (INST2S VPR64:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm)>;
+ def : Pat<(v4i16 (opnode VPR64:$src,
+ (bitconvert(v2i32 (neonopnode timm:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm))))),
+ (INST2S VPR64:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm)>;
+ def : Pat<(v1i64 (opnode VPR64:$src,
+ (bitconvert(v2i32 (neonopnode timm:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm))))),
+ (INST2S VPR64:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm)>;
+
+ def : Pat<(v16i8 (opnode VPR128:$src,
+ (bitconvert(v4i32 (neonopnode timm:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm))))),
+ (INST4S VPR128:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm)>;
+ def : Pat<(v8i16 (opnode VPR128:$src,
+ (bitconvert(v4i32 (neonopnode timm:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm))))),
+ (INST4S VPR128:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm)>;
+ def : Pat<(v2i64 (opnode VPR128:$src,
+ (bitconvert(v4i32 (neonopnode timm:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm))))),
+ (INST4S VPR128:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm)>;
}
// Additional patterns for Vector Vector Bitwise Bit Clear (AND NOT) - immediate
-defm : Neon_bitwiseVi_patterns<or, Neon_mvni, BICvi_lsl_4H, BICvi_lsl_8H>;
+defm : Neon_bitwiseVi_patterns<and, Neon_mvni, BICvi_lsl_4H, BICvi_lsl_8H,
+ BICvi_lsl_2S, BICvi_lsl_4S>;
// Additional patterns for Vector Bitwise OR - immedidate
-defm : Neon_bitwiseVi_patterns<or, Neon_movi, ORRvi_lsl_4H, ORRvi_lsl_8H>;
+defm : Neon_bitwiseVi_patterns<or, Neon_movi, ORRvi_lsl_4H, ORRvi_lsl_8H,
+ ORRvi_lsl_2S, ORRvi_lsl_4S>;
// Vector Move Immediate Masked
@@ -1391,7 +1609,8 @@ def MOVIvi_8B : NeonI_1VModImm<0b0, 0b0,
"movi\t$Rd.8b, $Imm",
[(set (v8i8 VPR64:$Rd),
(v8i8 (Neon_movi (timm:$Imm), (i32 imm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU]> {
let cmode = 0b1110;
}
@@ -1400,7 +1619,8 @@ def MOVIvi_16B : NeonI_1VModImm<0b1, 0b0,
"movi\t$Rd.16b, $Imm",
[(set (v16i8 VPR128:$Rd),
(v16i8 (Neon_movi (timm:$Imm), (i32 imm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU]> {
let cmode = 0b1110;
}
}
@@ -1412,7 +1632,8 @@ def MOVIvi_2D : NeonI_1VModImm<0b1, 0b1,
"movi\t $Rd.2d, $Imm",
[(set (v2i64 VPR128:$Rd),
(v2i64 (Neon_movi (timm:$Imm), (i32 imm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU]> {
let cmode = 0b1110;
}
}
@@ -1425,7 +1646,8 @@ def MOVIdi : NeonI_1VModImm<0b0, 0b1,
"movi\t $Rd, $Imm",
[(set (v1i64 FPR64:$Rd),
(v1i64 (Neon_movi (timm:$Imm), (i32 imm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU]> {
let cmode = 0b1110;
}
}
@@ -1439,7 +1661,8 @@ class NeonI_FMOV_impl<string asmlane, RegisterOperand VPRC, ValueType OpTy,
"fmov\t$Rd" # asmlane # ", $Imm",
[(set (OpTy VPRC:$Rd),
(OpTy (Neon_fmovi (timm:$Imm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU]> {
let cmode = 0b1111;
}
@@ -1450,10 +1673,6 @@ def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>;
}
// Vector Shift (Immediate)
-// Immediate in [0, 63]
-def imm0_63 : Operand<i32> {
- let ParserMatchClass = uimm6_asmoperand;
-}
// Shift Right/Left Immediate - The immh:immb field of these shifts are encoded
// as follows:
@@ -1522,7 +1741,8 @@ class N2VShift<bit q, bit u, bits<5> opcode, string asmop, string T,
[(set (Ty VPRC:$Rd),
(Ty (OpNode (Ty VPRC:$Rn),
(Ty (Neon_vdup (i32 ImmTy:$Imm))))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
multiclass NeonI_N2VShL<bit u, bits<5> opcode, string asmop> {
// 64-bit vector types.
@@ -1594,12 +1814,73 @@ multiclass NeonI_N2VShR<bit u, bits<5> opcode, string asmop, SDNode OpNode> {
}
// Shift left
+
defm SHLvvi : NeonI_N2VShL<0b0, 0b01010, "shl">;
+// Additional patterns to match vector shift left by immediate.
+// (v1i8/v1i16/v1i32 types)
+def : Pat<(v1i8 (shl (v1i8 FPR8:$Rn),
+ (v1i8 (Neon_vdup (i32 (shl_imm8:$Imm)))))),
+ (EXTRACT_SUBREG
+ (SHLvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
+ shl_imm8:$Imm),
+ sub_8)>;
+def : Pat<(v1i16 (shl (v1i16 FPR16:$Rn),
+ (v1i16 (Neon_vdup (i32 (shl_imm16:$Imm)))))),
+ (EXTRACT_SUBREG
+ (SHLvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
+ shl_imm16:$Imm),
+ sub_16)>;
+def : Pat<(v1i32 (shl (v1i32 FPR32:$Rn),
+ (v1i32 (Neon_vdup (i32 (shl_imm32:$Imm)))))),
+ (EXTRACT_SUBREG
+ (SHLvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+ shl_imm32:$Imm),
+ sub_32)>;
+
// Shift right
defm SSHRvvi : NeonI_N2VShR<0b0, 0b00000, "sshr", sra>;
defm USHRvvi : NeonI_N2VShR<0b1, 0b00000, "ushr", srl>;
+// Additional patterns to match vector shift right by immediate.
+// (v1i8/v1i16/v1i32 types)
+def : Pat<(v1i8 (sra (v1i8 FPR8:$Rn),
+ (v1i8 (Neon_vdup (i32 (shr_imm8:$Imm)))))),
+ (EXTRACT_SUBREG
+ (SSHRvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
+ shr_imm8:$Imm),
+ sub_8)>;
+def : Pat<(v1i16 (sra (v1i16 FPR16:$Rn),
+ (v1i16 (Neon_vdup (i32 (shr_imm16:$Imm)))))),
+ (EXTRACT_SUBREG
+ (SSHRvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
+ shr_imm16:$Imm),
+ sub_16)>;
+def : Pat<(v1i32 (sra (v1i32 FPR32:$Rn),
+ (v1i32 (Neon_vdup (i32 (shr_imm32:$Imm)))))),
+ (EXTRACT_SUBREG
+ (SSHRvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+ shr_imm32:$Imm),
+ sub_32)>;
+def : Pat<(v1i8 (srl (v1i8 FPR8:$Rn),
+ (v1i8 (Neon_vdup (i32 (shr_imm8:$Imm)))))),
+ (EXTRACT_SUBREG
+ (USHRvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
+ shr_imm8:$Imm),
+ sub_8)>;
+def : Pat<(v1i16 (srl (v1i16 FPR16:$Rn),
+ (v1i16 (Neon_vdup (i32 (shr_imm16:$Imm)))))),
+ (EXTRACT_SUBREG
+ (USHRvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
+ shr_imm16:$Imm),
+ sub_16)>;
+def : Pat<(v1i32 (srl (v1i32 FPR32:$Rn),
+ (v1i32 (Neon_vdup (i32 (shr_imm32:$Imm)))))),
+ (EXTRACT_SUBREG
+ (USHRvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+ shr_imm32:$Imm),
+ sub_32)>;
+
def Neon_High16B : PatFrag<(ops node:$in),
(extract_subvector (v16i8 node:$in), (iPTR 8))>;
def Neon_High8H : PatFrag<(ops node:$in),
@@ -1642,7 +1923,8 @@ class N2VShiftLong<bit q, bit u, bits<5> opcode, string asmop, string DestT,
(DestTy (shl
(DestTy (ExtOp (SrcTy VPR64:$Rn))),
(DestTy (Neon_vdup (i32 ImmTy:$Imm))))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
class N2VShiftLongHigh<bit q, bit u, bits<5> opcode, string asmop, string DestT,
string SrcT, ValueType DestTy, ValueType SrcTy,
@@ -1656,7 +1938,8 @@ class N2VShiftLongHigh<bit q, bit u, bits<5> opcode, string asmop, string DestT,
(DestTy (ExtOp
(SrcTy (getTop VPR128:$Rn)))),
(DestTy (Neon_vdup (i32 ImmTy:$Imm))))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
multiclass NeonI_N2VShLL<string prefix, bit u, bits<5> opcode, string asmop,
SDNode ExtOp> {
@@ -1716,6 +1999,38 @@ multiclass NeonI_N2VShLL<string prefix, bit u, bits<5> opcode, string asmop,
defm SSHLLvvi : NeonI_N2VShLL<"SSHLLvvi", 0b0, 0b10100, "sshll", sext>;
defm USHLLvvi : NeonI_N2VShLL<"USHLLvvi", 0b1, 0b10100, "ushll", zext>;
+class NeonI_ext_len_alias<string asmop, string lane, string laneOp,
+ Instruction inst, RegisterOperand VPRC,
+ RegisterOperand VPRCOp>
+ : NeonInstAlias<asmop # "\t$Rd" # lane #", $Rn" # laneOp,
+ (inst VPRC:$Rd, VPRCOp:$Rn, 0), 0b0>;
+
+// Signed integer lengthen (vector) is alias for SSHLL Vd, Vn, #0
+// Signed integer lengthen (vector, second part) is alias for SSHLL2 Vd, Vn, #0
+// FIXME: This is actually the preferred syntax but TableGen can't deal with
+// custom printing of aliases.
+def SXTLvv_8B : NeonI_ext_len_alias<"sxtl", ".8h", ".8b", SSHLLvvi_8B, VPR128, VPR64>;
+def SXTLvv_4H : NeonI_ext_len_alias<"sxtl", ".4s", ".4h", SSHLLvvi_4H, VPR128, VPR64>;
+def SXTLvv_2S : NeonI_ext_len_alias<"sxtl", ".2d", ".2s", SSHLLvvi_2S, VPR128, VPR64>;
+def SXTL2vv_16B : NeonI_ext_len_alias<"sxtl2", ".8h", ".16b", SSHLLvvi_16B, VPR128, VPR128>;
+def SXTL2vv_8H : NeonI_ext_len_alias<"sxtl2", ".4s", ".8h", SSHLLvvi_8H, VPR128, VPR128>;
+def SXTL2vv_4S : NeonI_ext_len_alias<"sxtl2", ".2d", ".4s", SSHLLvvi_4S, VPR128, VPR128>;
+
+// Unsigned integer lengthen (vector) is alias for USHLL Vd, Vn, #0
+// Unsigned integer lengthen (vector, second part) is alias for USHLL2 Vd, Vn, #0
+// FIXME: This is actually the preferred syntax but TableGen can't deal with
+// custom printing of aliases.
+def UXTLvv_8B : NeonI_ext_len_alias<"uxtl", ".8h", ".8b", USHLLvvi_8B, VPR128, VPR64>;
+def UXTLvv_4H : NeonI_ext_len_alias<"uxtl", ".4s", ".4h", USHLLvvi_4H, VPR128, VPR64>;
+def UXTLvv_2S : NeonI_ext_len_alias<"uxtl", ".2d", ".2s", USHLLvvi_2S, VPR128, VPR64>;
+def UXTL2vv_16B : NeonI_ext_len_alias<"uxtl2", ".8h", ".16b", USHLLvvi_16B, VPR128, VPR128>;
+def UXTL2vv_8H : NeonI_ext_len_alias<"uxtl2", ".4s", ".8h", USHLLvvi_8H, VPR128, VPR128>;
+def UXTL2vv_4S : NeonI_ext_len_alias<"uxtl2", ".2d", ".4s", USHLLvvi_4S, VPR128, VPR128>;
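+// For example, "uxtl v0.8h, v1.8b" is accepted and encoded as
+// "ushll v0.8h, v1.8b, #0", and "sxtl2 v0.4s, v1.8h" as "sshll2 v0.4s, v1.8h, #0".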
+
+def : Pat<(v8i16 (anyext (v8i8 VPR64:$Rn))), (USHLLvvi_8B VPR64:$Rn, 0)>;
+def : Pat<(v4i32 (anyext (v4i16 VPR64:$Rn))), (USHLLvvi_4H VPR64:$Rn, 0)>;
+def : Pat<(v2i64 (anyext (v2i32 VPR64:$Rn))), (USHLLvvi_2S VPR64:$Rn, 0)>;
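+// An anyext leaves the high bits of each element undefined, so lowering it as
+// a zero-extending USHLL with a #0 shift count is a valid choice.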
+
// Rounding/Saturating shift
class N2VShift_RQ<bit q, bit u, bits<5> opcode, string asmop, string T,
RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
@@ -1725,7 +2040,8 @@ class N2VShift_RQ<bit q, bit u, bits<5> opcode, string asmop, string T,
asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
[(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$Rn),
(i32 ImmTy:$Imm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
// shift right (vector by immediate)
multiclass NeonI_N2VShR_RQ<bit u, bits<5> opcode, string asmop,
@@ -1828,7 +2144,8 @@ class N2VShiftAdd<bit q, bit u, bits<5> opcode, string asmop, string T,
[(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
(Ty (OpNode (Ty VPRC:$Rn),
(Ty (Neon_vdup (i32 ImmTy:$Imm))))))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
}
@@ -1883,7 +2200,8 @@ class N2VShiftAdd_R<bit q, bit u, bits<5> opcode, string asmop, string T,
asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
[(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
(Ty (OpNode (Ty VPRC:$Rn), (i32 ImmTy:$Imm))))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
}
@@ -1938,7 +2256,8 @@ class N2VShiftIns<bit q, bit u, bits<5> opcode, string asmop, string T,
asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
[(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$src), (Ty VPRC:$Rn),
(i32 ImmTy:$Imm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
}
@@ -2032,14 +2351,16 @@ class N2VShR_Narrow<bit q, bit u, bits<5> opcode, string asmop, string DestT,
: NeonI_2VShiftImm<q, u, opcode,
(outs VPR64:$Rd), (ins VPR128:$Rn, ImmTy:$Imm),
asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
class N2VShR_Narrow_Hi<bit q, bit u, bits<5> opcode, string asmop, string DestT,
string SrcT, Operand ImmTy>
: NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
(ins VPR128:$src, VPR128:$Rn, ImmTy:$Imm),
asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
}
@@ -2198,7 +2519,8 @@ class N2VCvt_Fx<bit q, bit u, bits<5> opcode, string asmop, string T,
asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
[(set (DestTy VPRC:$Rd), (DestTy (IntOp (SrcTy VPRC:$Rn),
(i32 ImmTy:$Imm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
multiclass NeonI_N2VCvt_Fx2fp<bit u, bits<5> opcode, string asmop,
SDPatternOperator IntOp> {
@@ -2276,28 +2598,32 @@ multiclass NeonI_2VAcross_1<bit u, bits<5> opcode,
asmop # "\t$Rd, $Rn.8b",
[(set (v1i16 FPR16:$Rd),
(v1i16 (opnode (v8i8 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _1h16b: NeonI_2VAcross<0b1, u, 0b00, opcode,
(outs FPR16:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd, $Rn.16b",
[(set (v1i16 FPR16:$Rd),
(v1i16 (opnode (v16i8 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _1s4h: NeonI_2VAcross<0b0, u, 0b01, opcode,
(outs FPR32:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd, $Rn.4h",
[(set (v1i32 FPR32:$Rd),
(v1i32 (opnode (v4i16 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _1s8h: NeonI_2VAcross<0b1, u, 0b01, opcode,
(outs FPR32:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd, $Rn.8h",
[(set (v1i32 FPR32:$Rd),
(v1i32 (opnode (v8i16 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
// _1d2s doesn't exist!
@@ -2306,7 +2632,8 @@ multiclass NeonI_2VAcross_1<bit u, bits<5> opcode,
asmop # "\t$Rd, $Rn.4s",
[(set (v1i64 FPR64:$Rd),
(v1i64 (opnode (v4i32 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
defm SADDLV : NeonI_2VAcross_1<0b0, 0b00011, "saddlv", int_aarch64_neon_saddlv>;
@@ -2322,28 +2649,32 @@ multiclass NeonI_2VAcross_2<bit u, bits<5> opcode,
asmop # "\t$Rd, $Rn.8b",
[(set (v1i8 FPR8:$Rd),
(v1i8 (opnode (v8i8 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _1b16b: NeonI_2VAcross<0b1, u, 0b00, opcode,
(outs FPR8:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd, $Rn.16b",
[(set (v1i8 FPR8:$Rd),
(v1i8 (opnode (v16i8 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _1h4h: NeonI_2VAcross<0b0, u, 0b01, opcode,
(outs FPR16:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd, $Rn.4h",
[(set (v1i16 FPR16:$Rd),
(v1i16 (opnode (v4i16 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def _1h8h: NeonI_2VAcross<0b1, u, 0b01, opcode,
(outs FPR16:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd, $Rn.8h",
[(set (v1i16 FPR16:$Rd),
(v1i16 (opnode (v8i16 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
// _1s2s doesn't exist!
@@ -2352,7 +2683,8 @@ multiclass NeonI_2VAcross_2<bit u, bits<5> opcode,
asmop # "\t$Rd, $Rn.4s",
[(set (v1i32 FPR32:$Rd),
(v1i32 (opnode (v4i32 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
defm SMAXV : NeonI_2VAcross_2<0b0, 0b01010, "smaxv", int_aarch64_neon_smaxv>;
@@ -2370,9 +2702,10 @@ multiclass NeonI_2VAcross_3<bit u, bits<5> opcode, bits<2> size,
def _1s4s: NeonI_2VAcross<0b1, u, size, opcode,
(outs FPR32:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd, $Rn.4s",
- [(set (v1f32 FPR32:$Rd),
- (v1f32 (opnode (v4f32 VPR128:$Rn))))],
- NoItinerary>;
+ [(set (f32 FPR32:$Rd),
+ (f32 (opnode (v4f32 VPR128:$Rn))))],
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
defm FMAXNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b00, "fmaxnmv",
@@ -2395,7 +2728,8 @@ class NeonI_Permute<bit q, bits<2> size, bits<3> opcode,
asmop # "\t$Rd." # OpS # ", $Rn." # OpS # ", $Rm." # OpS,
[(set (Ty OpVPR:$Rd),
(Ty (opnode (Ty OpVPR:$Rn), (Ty OpVPR:$Rm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
multiclass NeonI_Perm_pat<bits<3> opcode, string asmop,
SDPatternOperator opnode> {
@@ -2454,7 +2788,8 @@ class NeonI_3VDL<bit q, bit u, bits<2> size, bits<4> opcode,
[(set (ResTy VPR128:$Rd),
(ResTy (opnode (ResTy (ext (OpTy OpVPR:$Rn))),
(ResTy (ext (OpTy OpVPR:$Rm))))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
multiclass NeonI_3VDL_s<bit u, bits<4> opcode,
string asmop, SDPatternOperator opnode,
@@ -2529,7 +2864,8 @@ class NeonI_3VDW<bit q, bit u, bits<2> size, bits<4> opcode,
[(set (ResTy VPR128:$Rd),
(ResTy (opnode (ResTy VPR128:$Rn),
(ResTy (ext (OpTy OpVPR:$Rm))))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
multiclass NeonI_3VDW_s<bit u, bits<4> opcode, string asmop,
SDPatternOperator opnode> {
@@ -2610,7 +2946,8 @@ class NeonI_3VDN_addhn_2Op<bit q, bit u, bits<2> size, bits<4> opcode,
(ResTy (get_hi
(OpTy (opnode (OpTy VPR128:$Rn),
(OpTy VPR128:$Rm))))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
multiclass NeonI_3VDN_addhn_2Op<bit u, bits<4> opcode, string asmop,
SDPatternOperator opnode, bit Commutable = 0> {
@@ -2638,7 +2975,8 @@ class NeonI_3VD_2Op<bit q, bit u, bits<2> size, bits<4> opcode,
asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
[(set (ResTy ResVPR:$Rd),
(ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
// normal narrow pattern
multiclass NeonI_3VDN_2Op<bit u, bits<4> opcode, string asmop,
@@ -2662,7 +3000,8 @@ class NeonI_3VDN_3Op<bit q, bit u, bits<2> size, bits<4> opcode,
: NeonI_3VDiff<q, u, size, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm),
asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
let neverHasSideEffects = 1;
}
@@ -2727,7 +3066,8 @@ class NeonI_3VDL_Ext<bit q, bit u, bits<2> size, bits<4> opcode,
[(set (ResTy VPR128:$Rd),
(ResTy (zext (OpSTy (opnode (OpTy OpVPR:$Rn),
(OpTy OpVPR:$Rm))))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
multiclass NeonI_3VDL_zext<bit u, bits<4> opcode, string asmop,
SDPatternOperator opnode, bit Commutable = 0> {
@@ -2795,7 +3135,8 @@ class NeonI_3VDL_Aba<bit q, bit u, bits<2> size, bits<4> opcode,
(ResTy VPR128:$src),
(ResTy (zext (OpSTy (subop (OpTy OpVPR:$Rn),
(OpTy OpVPR:$Rm))))))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
}
@@ -2835,7 +3176,8 @@ defm UABAL2vvv : NeonI_3VDL2_Aba_v1<0b1, 0b0101, "uabal2", add,
// Long pattern with 2 operands
multiclass NeonI_3VDL_2Op<bit u, bits<4> opcode, string asmop,
SDPatternOperator opnode, bit Commutable = 0> {
- let isCommutable = Commutable in {
+ let isCommutable = Commutable,
+ SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
opnode, VPR128, VPR64, v8i16, v8i8>;
def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
@@ -2857,7 +3199,8 @@ class NeonI_3VDL2_2Op_mull<bit q, bit u, bits<2> size, bits<4> opcode,
asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
[(set (ResTy VPR128:$Rd),
(ResTy (opnode (OpTy VPR128:$Rn), (OpTy VPR128:$Rm))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPMul, ReadFPMul, ReadFPMul]>;
multiclass NeonI_3VDL2_2Op_mull_v1<bit u, bits<4> opcode, string asmop,
string opnode, bit Commutable = 0> {
@@ -2891,7 +3234,8 @@ class NeonI_3VDL_3Op<bit q, bit u, bits<2> size, bits<4> opcode,
(ResTy (opnode
(ResTy VPR128:$src),
(OpTy VPR64:$Rn), (OpTy VPR64:$Rm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
let Constraints = "$src = $Rd";
}
@@ -2939,7 +3283,8 @@ class NeonI_3VDL2_3Op_mlas<bit q, bit u, bits<2> size, bits<4> opcode,
(ResTy (subop
(ResTy VPR128:$src),
(ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
let Constraints = "$src = $Rd";
}
@@ -2991,8 +3336,10 @@ multiclass NeonI_3VDL_v2<bit u, bits<4> opcode, string asmop,
}
}
+let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
defm SQDMULLvvv : NeonI_3VDL_v2<0b0, 0b1101, "sqdmull",
int_arm_neon_vqdmull, 1>;
+}
multiclass NeonI_3VDL2_2Op_mull_v2<bit u, bits<4> opcode, string asmop,
string opnode, bit Commutable = 0> {
@@ -3025,19 +3372,20 @@ defm SQDMLSL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1011, "sqdmlsl2",
int_arm_neon_vqsubs>;
multiclass NeonI_3VDL_v3<bit u, bits<4> opcode, string asmop,
- SDPatternOperator opnode, bit Commutable = 0> {
+ SDPatternOperator opnode_8h8b,
+ SDPatternOperator opnode_1q1d, bit Commutable = 0> {
let isCommutable = Commutable in {
def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
- opnode, VPR128, VPR64, v8i16, v8i8>;
+ opnode_8h8b, VPR128, VPR64, v8i16, v8i8>;
- def _1q1d : NeonI_3VDiff<0b0, u, 0b11, opcode,
- (outs VPR128:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
- asmop # "\t$Rd.1q, $Rn.1d, $Rm.1d",
- [], NoItinerary>;
+ def _1q1d : NeonI_3VD_2Op<0b0, u, 0b11, opcode, asmop, "1q", "1d",
+ opnode_1q1d, VPR128, VPR64, v16i8, v1i64>;
}
}
-defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp, 1>;
+let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in
+defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp,
+ int_aarch64_neon_vmull_p64, 1>;
multiclass NeonI_3VDL2_2Op_mull_v3<bit u, bits<4> opcode, string asmop,
string opnode, bit Commutable = 0> {
@@ -3046,11 +3394,24 @@ multiclass NeonI_3VDL2_2Op_mull_v3<bit u, bits<4> opcode, string asmop,
!cast<PatFrag>(opnode # "_16B"),
v8i16, v16i8>;
- def _1q2d : NeonI_3VDiff<0b1, u, 0b11, opcode,
- (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
- asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d",
- [], NoItinerary>;
+ def _1q2d :
+ NeonI_3VDiff<0b1, u, 0b11, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d",
+ [(set (v16i8 VPR128:$Rd),
+ (v16i8 (int_aarch64_neon_vmull_p64
+ (v1i64 (scalar_to_vector
+ (i64 (vector_extract (v2i64 VPR128:$Rn), 1)))),
+ (v1i64 (scalar_to_vector
+ (i64 (vector_extract (v2i64 VPR128:$Rm), 1)))))))],
+ NoItinerary>,
+ Sched<[WriteFPMul, ReadFPMul, ReadFPMul]>;
}
+
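+  // Match the 64x64 -> 128-bit polynomial multiply of the high doublewords
+  // when they are taken via extract_subvector (the def above handles the
+  // vector_extract form of the same operation).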
+ def : Pat<(v16i8 (int_aarch64_neon_vmull_p64
+ (v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 1))),
+ (v1i64 (extract_subvector (v2i64 VPR128:$Rm), (i64 1))))),
+ (!cast<Instruction>(NAME # "_1q2d") VPR128:$Rn, VPR128:$Rm)>;
}
defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", "NI_pmull_hi",
@@ -3080,7 +3441,8 @@ class NeonI_LDVList<bit q, bits<4> opcode, bits<2> size,
(outs VecList:$Rt), (ins GPR64xsp:$Rn),
asmop # "\t$Rt, [$Rn]",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteVecLd, ReadVecLd]> {
let mayLoad = 1;
let neverHasSideEffects = 1;
}
@@ -3134,7 +3496,8 @@ class NeonI_STVList<bit q, bits<4> opcode, bits<2> size,
(outs), (ins GPR64xsp:$Rn, VecList:$Rt),
asmop # "\t$Rt, [$Rn]",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> {
let mayStore = 1;
let neverHasSideEffects = 1;
}
@@ -3230,6 +3593,21 @@ def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr),
def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr),
(ST1_8B GPR64xsp:$addr, VPR64:$value)>;
+// Match loads/stores of v1i8/v1i16/v1i32 to FPR8/FPR16/FPR32 loads/stores.
+// FIXME: v1i8, v1i16 and v1i32 are legal types for now; if they are made
+// illegal again, these patterns will no longer be needed.
+def : Pat<(v1i8 (load GPR64xsp:$addr)), (LSFP8_LDR $addr, 0)>;
+def : Pat<(v1i16 (load GPR64xsp:$addr)), (LSFP16_LDR $addr, 0)>;
+def : Pat<(v1i32 (load GPR64xsp:$addr)), (LSFP32_LDR $addr, 0)>;
+
+def : Pat<(store (v1i8 FPR8:$value), GPR64xsp:$addr),
+ (LSFP8_STR $value, $addr, 0)>;
+def : Pat<(store (v1i16 FPR16:$value), GPR64xsp:$addr),
+ (LSFP16_STR $value, $addr, 0)>;
+def : Pat<(store (v1i32 FPR32:$value), GPR64xsp:$addr),
+ (LSFP32_STR $value, $addr, 0)>;
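+// The trailing 0 in these patterns is the unsigned immediate offset, so e.g. a
+// v1i8 load is emitted as a plain scalar ldr of a B register rather than an ld1.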
+
+
// End of vector load/store multiple N-element structure (class SIMD lselem)
// The following are post-index vector load/store multiple N-element
@@ -3352,7 +3730,8 @@ multiclass NeonI_LDWB_VList<bit q, bits<4> opcode, bits<2> size,
(ins GPR64xsp:$Rn, ImmTy:$amt),
asmop # "\t$Rt, [$Rn], $amt",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteVecLd, WriteVecLd, ReadVecLd]> {
let Rm = 0b11111;
}
@@ -3361,7 +3740,8 @@ multiclass NeonI_LDWB_VList<bit q, bits<4> opcode, bits<2> size,
(ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
asmop # "\t$Rt, [$Rn], $Rm",
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]>;
}
}
@@ -3435,7 +3815,8 @@ multiclass NeonI_STWB_VList<bit q, bits<4> opcode, bits<2> size,
(ins GPR64xsp:$Rn, ImmTy:$amt, VecList:$Rt),
asmop # "\t$Rt, [$Rn], $amt",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> {
let Rm = 0b11111;
}
@@ -3444,7 +3825,8 @@ multiclass NeonI_STWB_VList<bit q, bits<4> opcode, bits<2> size,
(ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VecList:$Rt),
asmop # "\t$Rt, [$Rn], $Rm",
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteVecSt, ReadVecSt, ReadVecSt, ReadVecSt]>;
}
}
@@ -3548,7 +3930,8 @@ class NeonI_LDN_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
(outs VecList:$Rt), (ins GPR64xsp:$Rn),
asmop # "\t$Rt, [$Rn]",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteVecLd, ReadVecLd]> {
let mayLoad = 1;
let neverHasSideEffects = 1;
}
@@ -3609,12 +3992,16 @@ def : LD1R_pattern<v2f32, f32, load, LD1R_2S>;
def : LD1R_pattern<v4i32, i32, load, LD1R_4S>;
def : LD1R_pattern<v4f32, f32, load, LD1R_4S>;
-def : LD1R_pattern<v1i64, i64, load, LD1R_1D>;
-def : LD1R_pattern<v1f64, f64, load, LD1R_1D>;
-
def : LD1R_pattern<v2i64, i64, load, LD1R_2D>;
def : LD1R_pattern<v2f64, f64, load, LD1R_2D>;
+class LD1R_pattern_v1 <ValueType VTy, ValueType DTy, PatFrag LoadOp,
+ Instruction INST>
+ : Pat<(VTy (scalar_to_vector (DTy (LoadOp GPR64xsp:$Rn)))),
+ (VTy (INST GPR64xsp:$Rn))>;
+
+def : LD1R_pattern_v1<v1i64, i64, load, LD1R_1D>;
+def : LD1R_pattern_v1<v1f64, f64, load, LD1R_1D>;
multiclass VectorList_Bare_BHSD<string PREFIX, int Count,
RegisterClass RegList> {
@@ -3638,7 +4025,8 @@ class NeonI_LDN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
(ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane),
asmop # "\t$Rt[$lane], [$Rn]",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteVecLd, ReadVecLd, ReadVecLd]> {
let mayLoad = 1;
let neverHasSideEffects = 1;
let hasExtraDefRegAllocReq = 1;
@@ -3723,7 +4111,8 @@ class NeonI_STN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
(outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane),
asmop # "\t$Rt[$lane], [$Rn]",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> {
let mayStore = 1;
let neverHasSideEffects = 1;
let hasExtraDefRegAllocReq = 1;
@@ -3815,16 +4204,18 @@ multiclass NeonI_LDN_WB_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
(ins GPR64xsp:$Rn, ImmTy:$amt),
asmop # "\t$Rt, [$Rn], $amt",
[],
- NoItinerary> {
- let Rm = 0b11111;
- }
+ NoItinerary>,
+ Sched<[WriteVecLd, WriteVecLd, ReadVecLd]> {
+ let Rm = 0b11111;
+ }
def _register : NeonI_LdOne_Dup_Post<q, r, opcode, size,
(outs VecList:$Rt, GPR64xsp:$wb),
(ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
asmop # "\t$Rt, [$Rn], $Rm",
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]>;
}
}
@@ -3888,7 +4279,8 @@ let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1,
VList:$src, ImmOp:$lane),
asmop # "\t$Rt[$lane], [$Rn], $amt",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]> {
let Rm = 0b11111;
}
@@ -3900,7 +4292,8 @@ let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1,
VList:$src, ImmOp:$lane),
asmop # "\t$Rt[$lane], [$Rn], $Rm",
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd, ReadVecLd]>;
}
multiclass LD_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
@@ -3988,7 +4381,8 @@ let mayStore = 1, neverHasSideEffects = 1,
VList:$Rt, ImmOp:$lane),
asmop # "\t$Rt[$lane], [$Rn], $amt",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> {
let Rm = 0b11111;
}
@@ -4000,7 +4394,8 @@ let mayStore = 1, neverHasSideEffects = 1,
ImmOp:$lane),
asmop # "\t$Rt[$lane], [$Rn], $Rm",
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteVecSt, ReadVecSt, ReadVecSt, ReadVecSt]>;
}
multiclass ST_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
@@ -4088,7 +4483,8 @@ class NeonI_Scalar3Same_size<bit u, bits<2> size, bits<5> opcode, string asmop,
(outs FPRC:$Rd), (ins FPRC:$Rn, FPRC:$Rm),
!strconcat(asmop, "\t$Rd, $Rn, $Rm"),
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
class NeonI_Scalar3Same_D_size<bit u, bits<5> opcode, string asmop>
: NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>;
@@ -4133,19 +4529,12 @@ multiclass Neon_Scalar3Same_BHSD_size_patterns<SDPatternOperator opnode,
: Neon_Scalar3Same_D_size_patterns<opnode, INSTD> {
def: Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
(INSTB FPR8:$Rn, FPR8:$Rm)>;
-
def: Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
(INSTH FPR16:$Rn, FPR16:$Rm)>;
-
def: Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
(INSTS FPR32:$Rn, FPR32:$Rm)>;
}
-class Neon_Scalar3Same_cmp_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD>
- : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
- (INSTD FPR64:$Rn, FPR64:$Rm)>;
-
multiclass Neon_Scalar3Same_HS_size_patterns<SDPatternOperator opnode,
Instruction INSTH,
Instruction INSTS> {
@@ -4156,20 +4545,12 @@ multiclass Neon_Scalar3Same_HS_size_patterns<SDPatternOperator opnode,
}
multiclass Neon_Scalar3Same_SD_size_patterns<SDPatternOperator opnode,
- Instruction INSTS,
- Instruction INSTD> {
- def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))),
- (INSTS FPR32:$Rn, FPR32:$Rm)>;
- def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
- (INSTD FPR64:$Rn, FPR64:$Rm)>;
-}
-
-multiclass Neon_Scalar3Same_cmp_SD_size_patterns<SDPatternOperator opnode,
- Instruction INSTS,
- Instruction INSTD> {
- def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))),
+ ValueType SResTy, ValueType STy,
+ Instruction INSTS, ValueType DResTy,
+ ValueType DTy, Instruction INSTD> {
+ def : Pat<(SResTy (opnode (STy FPR32:$Rn), (STy FPR32:$Rm))),
(INSTS FPR32:$Rn, FPR32:$Rm)>;
- def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ def : Pat<(DResTy (opnode (DTy FPR64:$Rn), (DTy FPR64:$Rm))),
(INSTD FPR64:$Rn, FPR64:$Rm)>;
}
@@ -4186,7 +4567,8 @@ class NeonI_Scalar3Diff_size<bit u, bits<2> size, bits<4> opcode, string asmop,
(outs FPRCD:$Rd), (ins FPRCS:$Rn, FPRCS:$Rm),
!strconcat(asmop, "\t$Rd, $Rn, $Rm"),
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
multiclass NeonI_Scalar3Diff_HS_size<bit u, bits<4> opcode, string asmop> {
def shh : NeonI_Scalar3Diff_size<u, 0b01, opcode, asmop, FPR32, FPR16>;
@@ -4199,12 +4581,14 @@ multiclass NeonI_Scalar3Diff_ml_HS_size<bit u, bits<4> opcode, string asmop> {
(outs FPR32:$Rd), (ins FPR32:$Src, FPR16:$Rn, FPR16:$Rm),
!strconcat(asmop, "\t$Rd, $Rn, $Rm"),
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]>;
def dss : NeonI_Scalar3Diff<u, 0b10, opcode,
(outs FPR64:$Rd), (ins FPR64:$Src, FPR32:$Rn, FPR32:$Rm),
!strconcat(asmop, "\t$Rd, $Rn, $Rm"),
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]>;
}
}
@@ -4234,7 +4618,8 @@ class NeonI_Scalar2SameMisc_size<bit u, bits<2> size, bits<5> opcode, string asm
(outs FPRCD:$Rd), (ins FPRCS:$Rn),
!strconcat(asmop, "\t$Rd, $Rn"),
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
multiclass NeonI_Scalar2SameMisc_SD_size<bit u, bit size_high, bits<5> opcode,
string asmop> {
@@ -4271,7 +4656,8 @@ class NeonI_Scalar2SameMisc_accum_size<bit u, bits<2> size, bits<5> opcode,
(outs FPRC:$Rd), (ins FPRC:$Src, FPRC:$Rn),
!strconcat(asmop, "\t$Rd, $Rn"),
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
multiclass NeonI_Scalar2SameMisc_accum_BHSD_size<bit u, bits<5> opcode,
string asmop> {
@@ -4286,56 +4672,68 @@ multiclass NeonI_Scalar2SameMisc_accum_BHSD_size<bit u, bits<5> opcode,
class Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<SDPatternOperator opnode,
Instruction INSTD>
- : Pat<(v1f32 (opnode (v1f64 FPR64:$Rn))),
+ : Pat<(f32 (opnode (f64 FPR64:$Rn))),
(INSTD FPR64:$Rn)>;
multiclass Neon_Scalar2SameMisc_fcvt_SD_size_patterns<SDPatternOperator opnode,
Instruction INSTS,
Instruction INSTD> {
- def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn))),
+ def : Pat<(v1i32 (opnode (f32 FPR32:$Rn))),
(INSTS FPR32:$Rn)>;
- def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))),
+ def : Pat<(v1i64 (opnode (f64 FPR64:$Rn))),
(INSTD FPR64:$Rn)>;
}
-multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns<SDPatternOperator Sopnode,
- SDPatternOperator Dopnode,
+class Neon_Scalar2SameMisc_vcvt_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD>
+ : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))),
+ (INSTD FPR64:$Rn)>;
+
+multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns<SDPatternOperator opnode,
Instruction INSTS,
Instruction INSTD> {
- def : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn))),
+ def : Pat<(f32 (opnode (v1i32 FPR32:$Rn))),
(INSTS FPR32:$Rn)>;
- def : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn))),
+ def : Pat<(f64 (opnode (v1i64 FPR64:$Rn))),
(INSTD FPR64:$Rn)>;
}
multiclass Neon_Scalar2SameMisc_SD_size_patterns<SDPatternOperator opnode,
Instruction INSTS,
Instruction INSTD> {
- def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn))),
+ def : Pat<(f32 (opnode (f32 FPR32:$Rn))),
(INSTS FPR32:$Rn)>;
- def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))),
+ def : Pat<(f64 (opnode (f64 FPR64:$Rn))),
(INSTD FPR64:$Rn)>;
}
+class Neon_Scalar2SameMisc_V1_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD>
+ : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))),
+ (INSTD FPR64:$Rn)>;
+
class NeonI_Scalar2SameMisc_cmpz_D_size<bit u, bits<5> opcode, string asmop>
: NeonI_Scalar2SameMisc<u, 0b11, opcode,
(outs FPR64:$Rd), (ins FPR64:$Rn, neon_uimm0:$Imm),
!strconcat(asmop, "\t$Rd, $Rn, $Imm"),
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
multiclass NeonI_Scalar2SameMisc_cmpz_SD_size<bit u, bits<5> opcode,
string asmop> {
def ssi : NeonI_Scalar2SameMisc<u, 0b10, opcode,
- (outs FPR32:$Rd), (ins FPR32:$Rn, fpz32:$FPImm),
+ (outs FPR32:$Rd), (ins FPR32:$Rn, fpzz32:$FPImm),
!strconcat(asmop, "\t$Rd, $Rn, $FPImm"),
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def ddi : NeonI_Scalar2SameMisc<u, 0b11, opcode,
- (outs FPR64:$Rd), (ins FPR64:$Rn, fpz32:$FPImm),
+ (outs FPR64:$Rd), (ins FPR64:$Rn, fpzz32:$FPImm),
!strconcat(asmop, "\t$Rd, $Rn, $FPImm"),
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
class Neon_Scalar2SameMisc_cmpz_D_size_patterns<SDPatternOperator opnode,
@@ -4351,14 +4749,15 @@ class Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<CondCode CC,
(INSTD FPR64:$Rn, neon_uimm0:$Imm)>;
multiclass Neon_Scalar2SameMisc_cmpz_SD_size_patterns<SDPatternOperator opnode,
+ CondCode CC,
Instruction INSTS,
Instruction INSTD> {
- def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn),
- (v1f32 (scalar_to_vector (f32 fpz32:$FPImm))))),
- (INSTS FPR32:$Rn, fpz32:$FPImm)>;
- def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn),
- (v1f32 (scalar_to_vector (f32 fpz32:$FPImm))))),
- (INSTD FPR64:$Rn, fpz32:$FPImm)>;
+ def : Pat<(v1i32 (opnode (f32 FPR32:$Rn), (f32 fpzz32:$FPImm))),
+ (INSTS FPR32:$Rn, fpzz32:$FPImm)>;
+ def : Pat<(v1i64 (opnode (f64 FPR64:$Rn), (f32 fpzz32:$FPImm))),
+ (INSTD FPR64:$Rn, fpzz32:$FPImm)>;
+ def : Pat<(v1i64 (Neon_cmpz (v1f64 FPR64:$Rn), (f32 fpzz32:$FPImm), CC)),
+ (INSTD FPR64:$Rn, fpzz32:$FPImm)>;
}
multiclass Neon_Scalar2SameMisc_D_size_patterns<SDPatternOperator opnode,
@@ -4418,7 +4817,8 @@ class NeonI_ScalarShiftImm_size<bit u, bits<5> opcode, string asmop,
: NeonI_ScalarShiftImm<u, opcode,
(outs FPRC:$Rd), (ins FPRC:$Rn, ImmTy:$Imm),
!strconcat(asmop, "\t$Rd, $Rn, $Imm"),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
multiclass NeonI_ScalarShiftRightImm_D_size<bit u, bits<5> opcode,
string asmop> {
@@ -4483,7 +4883,8 @@ class NeonI_ScalarShiftRightImm_accum_D_size<bit u, bits<5> opcode, string asmop
(outs FPR64:$Rd),
(ins FPR64:$Src, FPR64:$Rn, shr_imm64:$Imm),
!strconcat(asmop, "\t$Rd, $Rn, $Imm"),
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
bits<6> Imm;
let Inst{22} = 0b1; // immh:immb = 1xxxxxx
let Inst{21-16} = Imm;
@@ -4495,7 +4896,8 @@ class NeonI_ScalarShiftLeftImm_accum_D_size<bit u, bits<5> opcode, string asmop>
(outs FPR64:$Rd),
(ins FPR64:$Src, FPR64:$Rn, shl_imm64:$Imm),
!strconcat(asmop, "\t$Rd, $Rn, $Imm"),
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
bits<6> Imm;
let Inst{22} = 0b1; // immh:immb = 1xxxxxx
let Inst{21-16} = Imm;
@@ -4508,7 +4910,8 @@ class NeonI_ScalarShiftImm_narrow_size<bit u, bits<5> opcode, string asmop,
: NeonI_ScalarShiftImm<u, opcode,
(outs FPRCD:$Rd), (ins FPRCS:$Rn, ImmTy:$Imm),
!strconcat(asmop, "\t$Rd, $Rn, $Imm"),
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
multiclass NeonI_ScalarShiftImm_narrow_HSD_size<bit u, bits<5> opcode,
string asmop> {
@@ -4557,8 +4960,14 @@ multiclass Neon_ScalarShiftLImm_D_size_patterns<SDPatternOperator opnode,
(INSTD FPR64:$Rn, imm:$Imm)>;
}
-class Neon_ScalarShiftImm_arm_D_size_patterns<SDPatternOperator opnode,
- Instruction INSTD>
+class Neon_ScalarShiftLImm_V1_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD>
+ : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
+ (v1i64 (Neon_vdup (i32 shl_imm64:$Imm))))),
+ (INSTD FPR64:$Rn, imm:$Imm)>;
+
+class Neon_ScalarShiftRImm_V1_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD>
: Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
(v1i64 (Neon_vdup (i32 shr_imm64:$Imm))))),
(INSTD FPR64:$Rn, imm:$Imm)>;
@@ -4602,23 +5011,21 @@ multiclass Neon_ScalarShiftImm_narrow_HSD_size_patterns<
(INSTD FPR64:$Rn, imm:$Imm)>;
}
-multiclass Neon_ScalarShiftImm_scvtf_SD_size_patterns<SDPatternOperator Sopnode,
- SDPatternOperator Dopnode,
+multiclass Neon_ScalarShiftImm_scvtf_SD_size_patterns<SDPatternOperator opnode,
Instruction INSTS,
Instruction INSTD> {
- def ssi : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
+ def ssi : Pat<(f32 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
(INSTS FPR32:$Rn, imm:$Imm)>;
- def ddi : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+ def ddi : Pat<(f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
(INSTD FPR64:$Rn, imm:$Imm)>;
}
-multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns<SDPatternOperator Sopnode,
- SDPatternOperator Dopnode,
+multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns<SDPatternOperator opnode,
Instruction INSTS,
Instruction INSTD> {
- def ssi : Pat<(v1i32 (Sopnode (v1f32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
+ def ssi : Pat<(v1i32 (opnode (f32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
(INSTS FPR32:$Rn, imm:$Imm)>;
- def ddi : Pat<(v1i64 (Dopnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+ def ddi : Pat<(v1i64 (opnode (f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
(INSTD FPR64:$Rn, imm:$Imm)>;
}
@@ -4626,13 +5033,13 @@ multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns<SDPatternOperator Sopnode,
defm SSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00000, "sshr">;
defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrds_n, SSHRddi>;
// Pattern to match llvm.arm.* intrinsic.
-def : Neon_ScalarShiftImm_arm_D_size_patterns<sra, SSHRddi>;
+def : Neon_ScalarShiftRImm_V1_D_size_patterns<sra, SSHRddi>;
// Scalar Unsigned Shift Right (Immediate)
defm USHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00000, "ushr">;
defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrdu_n, USHRddi>;
// Pattern to match llvm.arm.* intrinsic.
-def : Neon_ScalarShiftImm_arm_D_size_patterns<srl, USHRddi>;
+def : Neon_ScalarShiftRImm_V1_D_size_patterns<srl, USHRddi>;
// Scalar Signed Rounding Shift Right (Immediate)
defm SRSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00100, "srshr">;
@@ -4666,7 +5073,7 @@ def : Neon_ScalarShiftRImm_accum_D_size_patterns
defm SHL : NeonI_ScalarShiftLeftImm_D_size<0b0, 0b01010, "shl">;
defm : Neon_ScalarShiftLImm_D_size_patterns<int_aarch64_neon_vshld_n, SHLddi>;
// Pattern to match llvm.arm.* intrinsic.
-def : Neon_ScalarShiftImm_arm_D_size_patterns<shl, SHLddi>;
+def : Neon_ScalarShiftLImm_V1_D_size_patterns<shl, SHLddi>;
// Signed Saturating Shift Left (Immediate)
defm SQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b0, 0b01110, "sqshl">;
@@ -4738,26 +5145,22 @@ defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrun,
// Scalar Signed Fixed-point Convert To Floating-Point (Immediate)
defm SCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11100, "scvtf">;
-defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtf32_n_s32,
- int_aarch64_neon_vcvtf64_n_s64,
+defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtfxs2fp_n,
SCVTF_Nssi, SCVTF_Nddi>;
// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate)
defm UCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11100, "ucvtf">;
-defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtf32_n_u32,
- int_aarch64_neon_vcvtf64_n_u64,
+defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtfxu2fp_n,
UCVTF_Nssi, UCVTF_Nddi>;
// Scalar Floating-point Convert To Signed Fixed-point (Immediate)
defm FCVTZS_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11111, "fcvtzs">;
-defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvts_n_s32_f32,
- int_aarch64_neon_vcvtd_n_s64_f64,
+defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvtfp2fxs_n,
FCVTZS_Nssi, FCVTZS_Nddi>;
// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate)
defm FCVTZU_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11111, "fcvtzu">;
-defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvts_n_u32_f32,
- int_aarch64_neon_vcvtd_n_u64_f64,
+defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvtfp2fxu_n,
FCVTZU_Nssi, FCVTZU_Nddi>;
// Patterns For Convert Instructions Between v1f64 and v1i64
@@ -4822,10 +5225,13 @@ defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubu, UQSUBbbb,
UQSUBhhh, UQSUBsss, UQSUBddd>;
// Scalar Integer Saturating Doubling Multiply Half High
+let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in
defm SQDMULH : NeonI_Scalar3Same_HS_sizes<0b0, 0b10110, "sqdmulh", 1>;
// Scalar Integer Saturating Rounding Doubling Multiply Half High
+let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
defm SQRDMULH : NeonI_Scalar3Same_HS_sizes<0b1, 0b10110, "sqrdmulh", 1>;
+}
// Patterns to match llvm.arm.* intrinsic for
// Scalar Integer Saturating Doubling Multiply Half High and
@@ -4835,23 +5241,24 @@ defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqdmulh, SQDMULHhhh,
defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqrdmulh, SQRDMULHhhh,
SQRDMULHsss>;
+let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in {
// Scalar Floating-point Multiply Extended
defm FMULX : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11011, "fmulx", 1>;
+}
// Scalar Floating-point Reciprocal Step
defm FRECPS : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11111, "frecps", 0>;
+defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vrecps, f32, f32,
+ FRECPSsss, f64, f64, FRECPSddd>;
+def : Pat<(v1f64 (int_arm_neon_vrecps (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FRECPSddd FPR64:$Rn, FPR64:$Rm)>;
// Scalar Floating-point Reciprocal Square Root Step
defm FRSQRTS : NeonI_Scalar3Same_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", 0>;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Floating-point Reciprocal Step and
-// Scalar Floating-point Reciprocal Square Root Step
-defm : Neon_Scalar3Same_SD_size_patterns<int_arm_neon_vrecps, FRECPSsss,
- FRECPSddd>;
-defm : Neon_Scalar3Same_SD_size_patterns<int_arm_neon_vrsqrts, FRSQRTSsss,
- FRSQRTSddd>;
-
+defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vrsqrts, f32, f32,
+ FRSQRTSsss, f64, f64, FRSQRTSddd>;
+def : Pat<(v1f64 (int_arm_neon_vrsqrts (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FRSQRTSddd FPR64:$Rn, FPR64:$Rm)>;
def : Pat<(v1f64 (fsqrt (v1f64 FPR64:$Rn))), (FSQRTdd FPR64:$Rn)>;
// Patterns to match llvm.aarch64.* intrinsic for
@@ -4866,7 +5273,9 @@ multiclass Neon_Scalar3Same_MULX_SD_size_patterns<SDPatternOperator opnode,
}
defm : Neon_Scalar3Same_MULX_SD_size_patterns<int_aarch64_neon_vmulx,
- FMULXsss,FMULXddd>;
+ FMULXsss, FMULXddd>;
+def : Pat<(v1f64 (int_aarch64_neon_vmulx (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMULXddd FPR64:$Rn, FPR64:$Rm)>;
// Scalar Integer Shift Left (Signed, Unsigned)
def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">;
@@ -4928,31 +5337,35 @@ defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshlu, UQRSHLbbb,
defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshifts, SQRSHLddd>;
defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshiftu, UQRSHLddd>;
+let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC] in {
// Signed Saturating Doubling Multiply-Add Long
defm SQDMLAL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1001, "sqdmlal">;
+}
defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlal,
SQDMLALshh, SQDMLALdss>;
// Signed Saturating Doubling Multiply-Subtract Long
+let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC] in {
defm SQDMLSL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1011, "sqdmlsl">;
+}
defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlsl,
SQDMLSLshh, SQDMLSLdss>;
// Signed Saturating Doubling Multiply Long
+let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in {
defm SQDMULL : NeonI_Scalar3Diff_HS_size<0b0, 0b1101, "sqdmull">;
+}
defm : Neon_Scalar3Diff_HS_size_patterns<int_arm_neon_vqdmull,
SQDMULLshh, SQDMULLdss>;
// Scalar Signed Integer Convert To Floating-point
defm SCVTF : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11101, "scvtf">;
-defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtf32_s32,
- int_aarch64_neon_vcvtf64_s64,
+defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtint2fps,
SCVTFss, SCVTFdd>;
// Scalar Unsigned Integer Convert To Floating-point
defm UCVTF : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11101, "ucvtf">;
-defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtf32_u32,
- int_aarch64_neon_vcvtf64_u64,
+defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtint2fpu,
UCVTFss, UCVTFdd>;
// Scalar Floating-point Converts
@@ -4963,42 +5376,54 @@ def : Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<int_aarch64_neon_fcvtxn,
defm FCVTNS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11010, "fcvtns">;
defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtns,
FCVTNSss, FCVTNSdd>;
+def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtns, FCVTNSdd>;
defm FCVTNU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11010, "fcvtnu">;
defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtnu,
FCVTNUss, FCVTNUdd>;
+def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtnu, FCVTNUdd>;
defm FCVTMS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11011, "fcvtms">;
defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtms,
FCVTMSss, FCVTMSdd>;
+def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtms, FCVTMSdd>;
defm FCVTMU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11011, "fcvtmu">;
defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtmu,
FCVTMUss, FCVTMUdd>;
+def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtmu, FCVTMUdd>;
defm FCVTAS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11100, "fcvtas">;
defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtas,
FCVTASss, FCVTASdd>;
+def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtas, FCVTASdd>;
defm FCVTAU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11100, "fcvtau">;
defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtau,
FCVTAUss, FCVTAUdd>;
+def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtau, FCVTAUdd>;
defm FCVTPS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11010, "fcvtps">;
defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtps,
FCVTPSss, FCVTPSdd>;
+def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtps, FCVTPSdd>;
defm FCVTPU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11010, "fcvtpu">;
defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtpu,
FCVTPUss, FCVTPUdd>;
+def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtpu, FCVTPUdd>;
defm FCVTZS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11011, "fcvtzs">;
defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzs,
FCVTZSss, FCVTZSdd>;
+def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_aarch64_neon_vcvtzs,
+ FCVTZSdd>;
defm FCVTZU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11011, "fcvtzu">;
defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzu,
FCVTZUss, FCVTZUdd>;
+def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_aarch64_neon_vcvtzu,
+ FCVTZUdd>;
// Patterns For Convert Instructions Between v1f64 and v1i64
class Neon_Scalar2SameMisc_cvtf_v1f64_pattern<SDPatternOperator opnode,
@@ -5017,8 +5442,10 @@ def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_uint, FCVTZUdd>;
// Scalar Floating-point Reciprocal Estimate
defm FRECPE : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11101, "frecpe">;
-defm : Neon_Scalar2SameMisc_SD_size_patterns<int_arm_neon_vrecpe,
+defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrecpe,
FRECPEss, FRECPEdd>;
+def : Neon_Scalar2SameMisc_V1_D_size_patterns<int_arm_neon_vrecpe,
+ FRECPEdd>;
// Scalar Floating-point Reciprocal Exponent
defm FRECPX : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11111, "frecpx">;
@@ -5027,8 +5454,10 @@ defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrecpx,
// Scalar Floating-point Reciprocal Square Root Estimate
defm FRSQRTE: NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11101, "frsqrte">;
-defm : Neon_Scalar2SameMisc_SD_size_patterns<int_arm_neon_vrsqrte,
- FRSQRTEss, FRSQRTEdd>;
+defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrsqrte,
+ FRSQRTEss, FRSQRTEdd>;
+def : Neon_Scalar2SameMisc_V1_D_size_patterns<int_arm_neon_vrsqrte,
+ FRSQRTEdd>;
// Scalar Floating-point Round
class Neon_ScalarFloatRound_pattern<SDPatternOperator opnode, Instruction INST>
@@ -5046,7 +5475,7 @@ def : Neon_ScalarFloatRound_pattern<int_aarch64_neon_frintn, FRINTNdd>;
// Scalar Compare Bitwise Equal
def CMEQddd: NeonI_Scalar3Same_D_size<0b1, 0b10001, "cmeq">;
-def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vceq, CMEQddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vceq, CMEQddd>;
class Neon_Scalar3Same_cmp_D_size_v1_patterns<SDPatternOperator opnode,
Instruction INSTD,
@@ -5058,28 +5487,28 @@ def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMEQddd, SETEQ>;
// Scalar Compare Signed Greater Than Or Equal
def CMGEddd: NeonI_Scalar3Same_D_size<0b0, 0b00111, "cmge">;
-def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vcge, CMGEddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vcge, CMGEddd>;
def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGEddd, SETGE>;
// Scalar Compare Unsigned Higher Or Same
def CMHSddd: NeonI_Scalar3Same_D_size<0b1, 0b00111, "cmhs">;
-def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vchs, CMHSddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vchs, CMHSddd>;
def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHSddd, SETUGE>;
// Scalar Compare Unsigned Higher
def CMHIddd: NeonI_Scalar3Same_D_size<0b1, 0b00110, "cmhi">;
-def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vchi, CMHIddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vchi, CMHIddd>;
def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHIddd, SETUGT>;
// Scalar Compare Signed Greater Than
def CMGTddd: NeonI_Scalar3Same_D_size<0b0, 0b00110, "cmgt">;
-def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vcgt, CMGTddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vcgt, CMGTddd>;
def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGTddd, SETGT>;
// Scalar Compare Bitwise Test Bits
def CMTSTddd: NeonI_Scalar3Same_D_size<0b0, 0b10001, "cmtst">;
-def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vtstd, CMTSTddd>;
-def : Neon_Scalar3Same_cmp_D_size_patterns<Neon_tst, CMTSTddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vtstd, CMTSTddd>;
+defm : Neon_Scalar3Same_D_size_patterns<Neon_tst, CMTSTddd>;
// Scalar Compare Bitwise Equal To Zero
def CMEQddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01001, "cmeq">;
@@ -5115,63 +5544,65 @@ def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLT, CMLTddi>;
// Scalar Floating-point Compare Mask Equal
defm FCMEQ: NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11100, "fcmeq">;
-defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vceq,
- FCMEQsss, FCMEQddd>;
+defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fceq, v1i32, f32,
+ FCMEQsss, v1i64, f64, FCMEQddd>;
def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETEQ, FCMEQddd>;
// Scalar Floating-point Compare Mask Equal To Zero
defm FCMEQZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01101, "fcmeq">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vceq,
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fceq, SETEQ,
FCMEQZssi, FCMEQZddi>;
-def : Pat<(v1i64 (Neon_cmpz (v1f64 FPR64:$Rn), (f32 fpz32:$FPImm), SETEQ)),
- (FCMEQZddi FPR64:$Rn, fpz32:$FPImm)>;
// Scalar Floating-point Compare Mask Greater Than Or Equal
defm FCMGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11100, "fcmge">;
-defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcge,
- FCMGEsss, FCMGEddd>;
+defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fcge, v1i32, f32,
+ FCMGEsss, v1i64, f64, FCMGEddd>;
def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGE, FCMGEddd>;
// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero
defm FCMGEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01100, "fcmge">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcge,
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fcge, SETGE,
FCMGEZssi, FCMGEZddi>;
// Scalar Floating-point Compare Mask Greater Than
defm FCMGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11100, "fcmgt">;
-defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcgt,
- FCMGTsss, FCMGTddd>;
+defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fcgt, v1i32, f32,
+ FCMGTsss, v1i64, f64, FCMGTddd>;
def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGT, FCMGTddd>;
// Scalar Floating-point Compare Mask Greater Than Zero
defm FCMGTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01100, "fcmgt">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcgt,
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fcgt, SETGT,
FCMGTZssi, FCMGTZddi>;
// Scalar Floating-point Compare Mask Less Than Or Equal To Zero
defm FCMLEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01101, "fcmle">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vclez,
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fclez, SETLE,
FCMLEZssi, FCMLEZddi>;
// Scalar Floating-point Compare Mask Less Than Zero
defm FCMLTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01110, "fcmlt">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcltz,
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fcltz, SETLT,
FCMLTZssi, FCMLTZddi>;
// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal
defm FACGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11101, "facge">;
-defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcage,
- FACGEsss, FACGEddd>;
+defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fcage, v1i32, f32,
+ FACGEsss, v1i64, f64, FACGEddd>;
+def : Pat<(v1i64 (int_arm_neon_vacge (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FACGEddd FPR64:$Rn, FPR64:$Rm)>;
// Scalar Floating-point Absolute Compare Mask Greater Than
defm FACGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11101, "facgt">;
-defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcagt,
- FACGTsss, FACGTddd>;
+defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fcagt, v1i32, f32,
+ FACGTsss, v1i64, f64, FACGTddd>;
+def : Pat<(v1i64 (int_arm_neon_vacgt (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FACGTddd FPR64:$Rn, FPR64:$Rm)>;
-// Scakar Floating-point Absolute Difference
+// Scalar Floating-point Absolute Difference
defm FABD: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11010, "fabd">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vabd,
- FABDsss, FABDddd>;
+defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vabd, f32, f32,
+ FABDsss, f64, f64, FABDddd>;
// Scalar Absolute Value
defm ABS : NeonI_Scalar2SameMisc_D_size<0b0, 0b01011, "abs">;
@@ -5251,7 +5682,8 @@ multiclass NeonI_ScalarPair_D_sizes<bit u, bit size, bits<5> opcode,
(outs FPR64:$Rd), (ins VPR128:$Rn),
!strconcat(asmop, "\t$Rd, $Rn.2d"),
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
}
@@ -5263,7 +5695,8 @@ multiclass NeonI_ScalarPair_SD_sizes<bit u, bit size, bits<5> opcode,
(outs FPR32:$Rd), (ins VPR64:$Rn),
!strconcat(asmop, "\t$Rd, $Rn.2s"),
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
}
@@ -5293,54 +5726,38 @@ defm FMAXNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01100, "fmaxnmp", 0>;
// Scalar Reduce minNum Pairwise (Floating Point)
defm FMINNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01100, "fminnmp", 0>;
-multiclass Neon_ScalarPair_SD_size_patterns<SDPatternOperator opnodeS,
- SDPatternOperator opnodeD,
+multiclass Neon_ScalarPair_SD_size_patterns<SDPatternOperator opnode,
Instruction INSTS,
Instruction INSTD> {
- def : Pat<(v1f32 (opnodeS (v2f32 VPR64:$Rn))),
+ def : Pat<(f32 (opnode (v2f32 VPR64:$Rn))),
(INSTS VPR64:$Rn)>;
- def : Pat<(v1f64 (opnodeD (v2f64 VPR128:$Rn))),
+ def : Pat<(f64 (opnode (v2f64 VPR128:$Rn))),
(INSTD VPR128:$Rn)>;
}
// Patterns to match llvm.aarch64.* intrinsic for
// Scalar Reduce Add, Max, Min, MaxiNum, MinNum Pairwise (Floating Point)
defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfadd,
- int_aarch64_neon_vpfaddq, FADDPvv_S_2S, FADDPvv_D_2D>;
+ FADDPvv_S_2S, FADDPvv_D_2D>;
defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmax,
- int_aarch64_neon_vpmaxq, FMAXPvv_S_2S, FMAXPvv_D_2D>;
+ FMAXPvv_S_2S, FMAXPvv_D_2D>;
defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmin,
- int_aarch64_neon_vpminq, FMINPvv_S_2S, FMINPvv_D_2D>;
+ FMINPvv_S_2S, FMINPvv_D_2D>;
defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfmaxnm,
- int_aarch64_neon_vpfmaxnmq, FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>;
+ FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>;
defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfminnm,
- int_aarch64_neon_vpfminnmq, FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
+ FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vaddv,
- int_aarch64_neon_vaddv, FADDPvv_S_2S, FADDPvv_D_2D>;
-
-def : Pat<(v1f32 (int_aarch64_neon_vaddv (v4f32 VPR128:$Rn))),
+def : Pat<(f32 (int_aarch64_neon_vpfadd (v4f32 VPR128:$Rn))),
(FADDPvv_S_2S (v2f32
(EXTRACT_SUBREG
(v4f32 (FADDP_4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rn))),
sub_64)))>;
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vmaxv,
- int_aarch64_neon_vmaxv, FMAXPvv_S_2S, FMAXPvv_D_2D>;
-
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vminv,
- int_aarch64_neon_vminv, FMINPvv_S_2S, FMINPvv_D_2D>;
-
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vmaxnmv,
- int_aarch64_neon_vmaxnmv, FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>;
-
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vminnmv,
- int_aarch64_neon_vminnmv, FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
-
// Scalar by element Arithmetic
class NeonI_ScalarXIndexedElemArith<string asmop, bits<4> opcode,
@@ -5352,7 +5769,8 @@ class NeonI_ScalarXIndexedElemArith<string asmop, bits<4> opcode,
(ins OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPMul, ReadFPMul, ReadFPMul]> {
bits<3> Imm;
bits<5> MRm;
}
@@ -5369,7 +5787,8 @@ class NeonI_ScalarXIndexedElemArith_Constraint_Impl<string asmop, bits<4> opcode
(ins ResFPR:$src, OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
let Constraints = "$src = $Rd";
bits<3> Imm;
bits<5> MRm;
@@ -5447,7 +5866,6 @@ defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx,
FMULXddv_2D, f64, FPR64, v2f64, neon_uimm1_bare,
v1f64, v2f64, neon_uimm0_bare>;
-
// Scalar Floating Point fused multiply-add (scalar, by element)
def FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
@@ -5594,12 +6012,21 @@ multiclass Neon_ScalarXIndexedElem_MUL_Patterns<
(ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))))),
(ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
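+  // Also accept the lane operand when it is presented directly as a
+  // one-element extract_subvector rather than as scalar_to_vector of a
+  // vector_extract.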
+ def : Pat<(ResTy (opnode (OpVTy FPRC:$Rn),
+ (OpVTy (extract_subvector (VecOpTy VPRC:$MRm), OpImm:$Imm)))),
+ (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
+
//swapped operands
def : Pat<(ResTy (opnode
(OpVTy (scalar_to_vector
(ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))),
(OpVTy FPRC:$Rn))),
(ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
+
+ def : Pat<(ResTy (opnode
+ (OpVTy (extract_subvector (VecOpTy VPRC:$MRm), OpImm:$Imm)),
+ (OpVTy FPRC:$Rn))),
+ (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
}
@@ -5691,6 +6118,13 @@ multiclass Neon_ScalarXIndexedElem_MLAL_Patterns<
(ResTy (INST (ResTy ResFPRC:$Ra),
(OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
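+  // As above, also accept the multiplied lane as a direct one-element
+  // extract_subvector.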
+ def : Pat<(ResTy (opnode
+ (ResTy ResFPRC:$Ra),
+ (ResTy (coreopnode (OpTy FPRC:$Rn),
+ (OpTy (extract_subvector (OpVTy VPRC:$MRm), OpImm:$Imm)))))),
+ (ResTy (INST (ResTy ResFPRC:$Ra),
+ (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
+
// swapped operands
def : Pat<(ResTy (opnode
(ResTy ResFPRC:$Ra),
@@ -5700,6 +6134,14 @@ multiclass Neon_ScalarXIndexedElem_MLAL_Patterns<
(OpTy FPRC:$Rn))))),
(ResTy (INST (ResTy ResFPRC:$Ra),
(OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
+
+ def : Pat<(ResTy (opnode
+ (ResTy ResFPRC:$Ra),
+ (ResTy (coreopnode
+ (OpTy (extract_subvector (OpVTy VPRC:$MRm), OpImm:$Imm)),
+ (OpTy FPRC:$Rn))))),
+ (ResTy (INST (ResTy ResFPRC:$Ra),
+ (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
}
// Patterns for Scalar Signed saturating
@@ -5732,38 +6174,6 @@ defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
int_arm_neon_vqdmull, SQDMLSLdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32,
i32, VPR128Lo, neon_uimm2_bare>;
-// Scalar general arithmetic operation
-class Neon_Scalar_GeneralMath2D_pattern<SDPatternOperator opnode,
- Instruction INST>
- : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
-
-class Neon_Scalar_GeneralMath3D_pattern<SDPatternOperator opnode,
- Instruction INST>
- : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
- (INST FPR64:$Rn, FPR64:$Rm)>;
-
-class Neon_Scalar_GeneralMath4D_pattern<SDPatternOperator opnode,
- Instruction INST>
- : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm),
- (v1f64 FPR64:$Ra))),
- (INST FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-
-def : Neon_Scalar_GeneralMath3D_pattern<fadd, FADDddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<fmul, FMULddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<fsub, FSUBddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<fdiv, FDIVddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vabds, FABDddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmaxs, FMAXddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmins, FMINddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vmaxnm, FMAXNMddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vminnm, FMINNMddd>;
-
-def : Neon_Scalar_GeneralMath2D_pattern<fabs, FABSdd>;
-def : Neon_Scalar_GeneralMath2D_pattern<fneg, FNEGdd>;
-
-def : Neon_Scalar_GeneralMath4D_pattern<fma, FMADDdddd>;
-def : Neon_Scalar_GeneralMath4D_pattern<fmsub, FMSUBdddd>;
-
// Scalar Signed saturating doubling multiply returning
// high half (scalar, by element)
def SQDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
@@ -5850,6 +6260,38 @@ defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
SQRDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32, i32,
VPR128Lo, neon_uimm2_bare>;
+// Scalar general arithmetic operation
+class Neon_Scalar_GeneralMath2D_pattern<SDPatternOperator opnode,
+ Instruction INST>
+ : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+class Neon_Scalar_GeneralMath3D_pattern<SDPatternOperator opnode,
+ Instruction INST>
+ : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (INST FPR64:$Rn, FPR64:$Rm)>;
+
+class Neon_Scalar_GeneralMath4D_pattern<SDPatternOperator opnode,
+ Instruction INST>
+ : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm),
+ (v1f64 FPR64:$Ra))),
+ (INST FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+def : Neon_Scalar_GeneralMath3D_pattern<fadd, FADDddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<fmul, FMULddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<fsub, FSUBddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<fdiv, FDIVddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vabds, FABDddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmaxs, FMAXddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmins, FMINddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vmaxnm, FMAXNMddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vminnm, FMINNMddd>;
+
+def : Neon_Scalar_GeneralMath2D_pattern<fabs, FABSdd>;
+def : Neon_Scalar_GeneralMath2D_pattern<fneg, FNEGdd>;
+
+def : Neon_Scalar_GeneralMath4D_pattern<fma, FMADDdddd>;
+def : Neon_Scalar_GeneralMath4D_pattern<fmsub, FMSUBdddd>;
+
// Scalar Copy - DUP element to scalar
class NeonI_Scalar_DUP<string asmop, string asmlane,
RegisterClass ResRC, RegisterOperand VPRC,
@@ -5857,7 +6299,8 @@ class NeonI_Scalar_DUP<string asmop, string asmlane,
: NeonI_ScalarCopy<(outs ResRC:$Rd), (ins VPRC:$Rn, OpImm:$Imm),
asmop # "\t$Rd, $Rn." # asmlane # "[$Imm]",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]> {
bits<4> Imm;
}
@@ -5874,23 +6317,28 @@ def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> {
let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
}
-multiclass NeonI_Scalar_DUP_Elt_pattern<Instruction DUPI, ValueType ResTy,
- ValueType OpTy, Operand OpImm,
- ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
- def : Pat<(ResTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)),
- (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
+def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 0)),
+ (f32 (EXTRACT_SUBREG (v4f32 VPR128:$Rn), sub_32))>;
+def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 1)),
+ (f32 (DUPsv_S (v4f32 VPR128:$Rn), 1))>;
+def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 2)),
+ (f32 (DUPsv_S (v4f32 VPR128:$Rn), 2))>;
+def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 3)),
+ (f32 (DUPsv_S (v4f32 VPR128:$Rn), 3))>;
- def : Pat<(ResTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)),
- (ResTy (DUPI
- (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
- OpNImm:$Imm))>;
-}
+def : Pat<(f64 (vector_extract (v2f64 VPR128:$Rn), 0)),
+ (f64 (EXTRACT_SUBREG (v2f64 VPR128:$Rn), sub_64))>;
+def : Pat<(f64 (vector_extract (v2f64 VPR128:$Rn), 1)),
+ (f64 (DUPdv_D (v2f64 VPR128:$Rn), 1))>;
+
+def : Pat<(f32 (vector_extract (v2f32 VPR64:$Rn), 0)),
+ (f32 (EXTRACT_SUBREG (v2f32 VPR64:$Rn), sub_32))>;
+def : Pat<(f32 (vector_extract (v2f32 VPR64:$Rn), 1)),
+ (f32 (DUPsv_S (v4f32 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ 1))>;
-// Patterns for vector extract of FP data using scalar DUP instructions
-defm : NeonI_Scalar_DUP_Elt_pattern<DUPsv_S, f32,
- v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
-defm : NeonI_Scalar_DUP_Elt_pattern<DUPdv_D, f64,
- v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+def : Pat<(f64 (vector_extract (v1f64 VPR64:$Rn), 0)),
+ (f64 (EXTRACT_SUBREG (v1f64 VPR64:$Rn), sub_64))>;
multiclass NeonI_Scalar_DUP_Ext_Vec_pattern<Instruction DUPI,
ValueType ResTy, ValueType OpTy,Operand OpLImm,
@@ -5961,12 +6409,6 @@ defm : NeonI_Scalar_DUP_Copy_pattern1<DUPhv_H,
defm : NeonI_Scalar_DUP_Copy_pattern1<DUPbv_B,
v1i8, v16i8, i32, neon_uimm4_bare,
v8i8, v16i8, neon_uimm3_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern1<DUPdv_D,
- v1f64, v2f64, f64, neon_uimm1_bare,
- v1f64, v2f64, neon_uimm0_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern1<DUPsv_S,
- v1f32, v4f32, f32, neon_uimm2_bare,
- v2f32, v4f32, neon_uimm1_bare>;
defm : NeonI_Scalar_DUP_Copy_pattern2<DUPdv_D,
v1i64, v2i64, i64, neon_uimm1_bare,
v1i64, v2i64, neon_uimm0_bare>;
@@ -5979,12 +6421,6 @@ defm : NeonI_Scalar_DUP_Copy_pattern2<DUPhv_H,
defm : NeonI_Scalar_DUP_Copy_pattern2<DUPbv_B,
v1i8, v16i8, i32, neon_uimm4_bare,
v8i8, v16i8, neon_uimm3_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern2<DUPdv_D,
- v1f64, v2f64, f64, neon_uimm1_bare,
- v1f64, v2f64, neon_uimm0_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern2<DUPsv_S,
- v1f32, v4f32, f32, neon_uimm2_bare,
- v2f32, v4f32, neon_uimm1_bare>;
multiclass NeonI_Scalar_DUP_alias<string asmop, string asmlane,
Instruction DUPI, Operand OpImm,
@@ -6016,6 +6452,101 @@ defm : NeonI_SDUP<Neon_Low2D, Neon_High2D, v1i64, v2i64>;
defm : NeonI_SDUP<Neon_Low4float, Neon_High4float, v2f32, v4f32>;
defm : NeonI_SDUP<Neon_Low2double, Neon_High2double, v1f64, v2f64>;
+// The following is for sext/zext from v1xx to v1xx
+multiclass NeonI_ext<string prefix, SDNode ExtOp> {
+ // v1i32 -> v1i64
+ def : Pat<(v1i64 (ExtOp (v1i32 FPR32:$Rn))),
+ (EXTRACT_SUBREG
+ (v2i64 (!cast<Instruction>(prefix # "_2S")
+ (v2i32 (SUBREG_TO_REG (i64 0), $Rn, sub_32)), 0)),
+ sub_64)>;
+
+ // v1i16 -> v1i32
+ def : Pat<(v1i32 (ExtOp (v1i16 FPR16:$Rn))),
+ (EXTRACT_SUBREG
+ (v4i32 (!cast<Instruction>(prefix # "_4H")
+ (v4i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)), 0)),
+ sub_32)>;
+
+ // v1i8 -> v1i16
+ def : Pat<(v1i16 (ExtOp (v1i8 FPR8:$Rn))),
+ (EXTRACT_SUBREG
+ (v8i16 (!cast<Instruction>(prefix # "_8B")
+ (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)),
+ sub_16)>;
+}
+
+defm NeonI_zext : NeonI_ext<"USHLLvvi", zext>;
+defm NeonI_sext : NeonI_ext<"SSHLLvvi", sext>;
+
+// zext v1i8 -> v1i32
+def : Pat<(v1i32 (zext (v1i8 FPR8:$Rn))),
+ (v1i32 (EXTRACT_SUBREG
+ (v1i64 (SUBREG_TO_REG (i64 0),
+ (v1i8 (DUPbv_B
+ (v16i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)),
+ 0)),
+ sub_8)),
+ sub_32))>;
+
+// zext v1i8 -> v1i64
+def : Pat<(v1i64 (zext (v1i8 FPR8:$Rn))),
+ (v1i64 (SUBREG_TO_REG (i64 0),
+ (v1i8 (DUPbv_B
+ (v16i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)),
+ 0)),
+ sub_8))>;
+
+// zext v1i16 -> v1i64
+def : Pat<(v1i64 (zext (v1i16 FPR16:$Rn))),
+ (v1i64 (SUBREG_TO_REG (i64 0),
+ (v1i16 (DUPhv_H
+ (v8i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)),
+ 0)),
+ sub_16))>;
+
+// sext v1i8 -> v1i32
+def : Pat<(v1i32 (sext (v1i8 FPR8:$Rn))),
+ (EXTRACT_SUBREG
+ (v4i32 (SSHLLvvi_4H
+ (v4i16 (SUBREG_TO_REG (i64 0),
+ (v1i16 (EXTRACT_SUBREG
+ (v8i16 (SSHLLvvi_8B
+ (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)),
+ sub_16)),
+ sub_16)), 0)),
+ sub_32)>;
+
+// sext v1i8 -> v1i64
+def : Pat<(v1i64 (sext (v1i8 FPR8:$Rn))),
+ (EXTRACT_SUBREG
+ (v2i64 (SSHLLvvi_2S
+ (v2i32 (SUBREG_TO_REG (i64 0),
+ (v1i32 (EXTRACT_SUBREG
+ (v4i32 (SSHLLvvi_4H
+ (v4i16 (SUBREG_TO_REG (i64 0),
+ (v1i16 (EXTRACT_SUBREG
+ (v8i16 (SSHLLvvi_8B
+ (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)),
+ sub_16)),
+ sub_16)), 0)),
+ sub_32)),
+ sub_32)), 0)),
+ sub_64)>;
+
+
+// sext v1i16 -> v1i64
+def : Pat<(v1i64 (sext (v1i16 FPR16:$Rn))),
+ (EXTRACT_SUBREG
+ (v2i64 (SSHLLvvi_2S
+ (v2i32 (SUBREG_TO_REG (i64 0),
+ (v1i32 (EXTRACT_SUBREG
+ (v4i32 (SSHLLvvi_4H
+ (v4i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)), 0)),
+ sub_32)),
+ sub_32)), 0)),
+ sub_64)>;
+
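The sext v1i8 -> v1i64 and v1i16 -> v1i64 patterns above widen in stages because each SSHLL step only doubles the element width. A purely illustrative C++ analogue of that staged widening (not LLVM code, just the per-lane arithmetic) is:

    #include <cassert>
    #include <cstdint>

    // Staged sign extension, mirroring the chained SSHLL-by-0 steps above.
    int64_t sextByStages(int8_t x) {
      int16_t w16 = x;   // 8 -> 16 bits (SSHLL .8b, #0)
      int32_t w32 = w16; // 16 -> 32 bits (SSHLL .4h, #0)
      int64_t w64 = w32; // 32 -> 64 bits (SSHLL .2s, #0)
      return w64;
    }

    int main() {
      assert(sextByStages(-5) == -5);   // same result as a single direct sext
      assert(sextByStages(127) == 127);
      return 0;
    }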
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
@@ -6047,6 +6578,20 @@ def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v1f64 VPR64:$src))), (v1i64 VPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1f64 VPR64:$src))), (v2f32 VPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1f64 VPR64:$src))), (v2i32 VPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1f64 VPR64:$src))), (v4i16 VPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v1f64 VPR64:$src))), (v8i8 VPR64:$src)>;
+def : Pat<(f64 (bitconvert (v1f64 VPR64:$src))), (f64 VPR64:$src)>;
+
+def : Pat<(v1f64 (bitconvert (v1i64 VPR64:$src))), (v1f64 VPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v2f32 VPR64:$src))), (v1f64 VPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v2i32 VPR64:$src))), (v1f64 VPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4i16 VPR64:$src))), (v1f64 VPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v8i8 VPR64:$src))), (v1f64 VPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (f64 VPR64:$src))), (v1f64 VPR64:$src)>;
+
// ..and 128-bit vector bitcasts...
def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (v2f64 VPR128:$src)>;
@@ -6089,7 +6634,6 @@ def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>;
def : Pat<(f16 (bitconvert (v1i16 FPR16:$src))), (f16 FPR16:$src)>;
def : Pat<(f32 (bitconvert (v1i32 FPR32:$src))), (f32 FPR32:$src)>;
def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
-def : Pat<(f32 (bitconvert (v1f32 FPR32:$src))), (f32 FPR32:$src)>;
def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(i64 (bitconvert (v1i64 FPR64:$src))), (FMOVxd $src)>;
@@ -6121,7 +6665,6 @@ def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), (f128 VPR128:$src)>;
def : Pat<(v1i16 (bitconvert (f16 FPR16:$src))), (v1i16 FPR16:$src)>;
def : Pat<(v1i32 (bitconvert (f32 FPR32:$src))), (v1i32 FPR32:$src)>;
def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
-def : Pat<(v1f32 (bitconvert (f32 FPR32:$src))), (v1f32 FPR32:$src)>;
def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
@@ -6168,7 +6711,8 @@ class NeonI_Extract<bit q, bits<2> op2, string asmop,
asmop # "\t$Rd." # OpS # ", $Rn." # OpS #
", $Rm." # OpS # ", $Index",
[],
- NoItinerary>{
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>{
bits<4> Index;
}
@@ -6209,7 +6753,8 @@ class NI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
(outs OpVPR:$Rd), (ins VecList:$Rn, OpVPR:$Rm),
asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
[],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
// The vectors in look up table are always 16b
multiclass NI_TBL_pat<bits<2> len, bit op, string asmop, string List> {
@@ -6225,7 +6770,7 @@ defm TBL2 : NI_TBL_pat<0b01, 0b0, "tbl", "VPair">;
defm TBL3 : NI_TBL_pat<0b10, 0b0, "tbl", "VTriple">;
defm TBL4 : NI_TBL_pat<0b11, 0b0, "tbl", "VQuad">;
-// Table lookup extention
+// Table lookup extension
class NI_TBX<bit q, bits<2> op2, bits<2> len, bit op,
string asmop, string OpS, RegisterOperand OpVPR,
RegisterOperand VecList>
@@ -6233,7 +6778,8 @@ class NI_TBX<bit q, bits<2> op2, bits<2> len, bit op,
(outs OpVPR:$Rd), (ins OpVPR:$src, VecList:$Rn, OpVPR:$Rm),
asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
}
@@ -6261,7 +6807,8 @@ class NeonI_INS_main<string asmop, string Res, ValueType ResTy,
(ResTy VPR128:$src),
(OpTy OpGPR:$Rn),
(OpImm:$Imm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
bits<4> Imm;
let Constraints = "$src = $Rd";
}
@@ -6319,7 +6866,8 @@ class NeonI_INS_element<string asmop, string Res, Operand ResImm>
ResImm:$Immd, ResImm:$Immn),
asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
bits<4> Immd;
bits<4> Immn;
@@ -6463,7 +7011,8 @@ class NeonI_SMOV<string asmop, string Res, bit Q,
(ResTy (vector_extract
(OpTy VPR128:$Rn), (OpImm:$Imm))),
eleTy)))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]> {
bits<4> Imm;
}
@@ -6557,7 +7106,8 @@ class NeonI_UMOV<string asmop, string Res, bit Q,
[(set (ResTy ResGPR:$Rd),
(ResTy (vector_extract
(OpTy VPR128:$Rn), (OpImm:$Imm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]> {
bits<4> Imm;
}
@@ -6654,9 +7204,6 @@ def : Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))),
def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))),
(f64 FPR64:$Rn)>;
-def : Pat<(f32 (vector_extract (v1f32 FPR32:$Rn), (i64 0))),
- (f32 FPR32:$Rn)>;
-
def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)),
(v1i8 (EXTRACT_SUBREG (v16i8
(INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))),
@@ -6673,17 +7220,41 @@ def : Pat<(v1i32 (scalar_to_vector GPR32:$src)),
def : Pat<(v1i64 (scalar_to_vector GPR64:$src)),
(FMOVdx $src)>;
-def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))),
- (v1f32 FPR32:$Rn)>;
+def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
+ (v8i8 (EXTRACT_SUBREG (v16i8
+ (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))),
+ sub_64))>;
+
+def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
+ (v4i16 (EXTRACT_SUBREG (v8i16
+ (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))),
+ sub_64))>;
+
+def : Pat<(v2i32 (scalar_to_vector GPR32:$Rn)),
+ (v2i32 (EXTRACT_SUBREG (v16i8
+ (INSsw (v4i32 (IMPLICIT_DEF)), $Rn, (i64 0))),
+ sub_64))>;
+
+def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
+ (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))>;
+
+def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
+ (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))>;
+
+def : Pat<(v4i32 (scalar_to_vector GPR32:$Rn)),
+ (INSsw (v4i32 (IMPLICIT_DEF)), $Rn, (i64 0))>;
+
+def : Pat<(v2i64 (scalar_to_vector GPR64:$Rn)),
+ (INSdx (v2i64 (IMPLICIT_DEF)), $Rn, (i64 0))>;
+
+def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
+ (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)>;
+def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
+ (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)>;
+
def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))),
(v1f64 FPR64:$Rn)>;
-// begin ANDROID-CHANGED-3-14-2014
-// duplicate symbol error if this is not commented out
-//def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
-// (FMOVdd $src)>;
-// end ANDROID-CHANGED-3-14-2014
-
def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)),
(f64 FPR64:$src), sub_64)>;
@@ -6694,7 +7265,8 @@ class NeonI_DUP_Elt<bit Q, string asmop, string rdlane, string rnlane,
(ins VPR128:$Rn, OpImm:$Imm),
asmop # "\t$Rd" # rdlane # ", $Rn" # rnlane # "[$Imm]",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]> {
bits<4> Imm;
}
@@ -6779,6 +7351,20 @@ def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))),
(SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64),
(i64 0)))>;
+multiclass NeonI_DUP_pattern<Instruction DUPELT, ValueType ResTy,
+ ValueType OpTy, RegisterClass OpRC,
+ Operand OpNImm, SubRegIndex SubIndex> {
+def : Pat<(ResTy (Neon_vduplane (OpTy OpRC:$Rn), OpNImm:$Imm)),
+ (ResTy (DUPELT
+ (SUBREG_TO_REG (i64 0), OpRC:$Rn, SubIndex), OpNImm:$Imm))>;
+}
+
+defm : NeonI_DUP_pattern<DUPELT4h, v4i16, v1i16, FPR16, neon_uimm2_bare,sub_16>;
+defm : NeonI_DUP_pattern<DUPELT4s, v4i32, v1i32, FPR32, neon_uimm2_bare,sub_32>;
+defm : NeonI_DUP_pattern<DUPELT8b, v8i8, v1i8, FPR8, neon_uimm3_bare, sub_8>;
+defm : NeonI_DUP_pattern<DUPELT8h, v8i16, v1i16, FPR16, neon_uimm3_bare,sub_16>;
+defm : NeonI_DUP_pattern<DUPELT16b, v16i8, v1i8, FPR8, neon_uimm4_bare, sub_8>;
+
class NeonI_DUP<bit Q, string asmop, string rdlane,
RegisterOperand ResVPR, ValueType ResTy,
RegisterClass OpGPR, ValueType OpTy>
@@ -6786,7 +7372,8 @@ class NeonI_DUP<bit Q, string asmop, string rdlane,
asmop # "\t$Rd" # rdlane # ", $Rn",
[(set (ResTy ResVPR:$Rd),
(ResTy (Neon_vdup (OpTy OpGPR:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> {
let Inst{20-16} = 0b00001;
@@ -6846,6 +7433,19 @@ defm : Concat_Vector_Pattern<v2i64, v1i64>;
defm : Concat_Vector_Pattern<v4f32, v2f32>;
defm : Concat_Vector_Pattern<v2f64, v1f64>;
+def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), undef)),
+ (v2i32 (SUBREG_TO_REG(i64 0), $Rn, sub_32))>;
+def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+ (EXTRACT_SUBREG
+ (v4i32 (INSELs
+ (v4i32 (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)),
+ (v4i32 (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
+ (i64 1),
+ (i64 0))),
+ sub_64)>;
+def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rn))),
+ (DUPELT2s (v4i32 (SUBREG_TO_REG(i64 0), $Rn, sub_32)), 0)>;
+
//patterns for EXTRACT_SUBVECTOR
def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))),
(v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
@@ -6874,7 +7474,8 @@ class NI_2VE<bit q, bit u, bits<2> size, bits<4> opcode,
asmop # "\t$Rd." # ResS # ", $Rn." # OpS #
", $Re." # EleOpS # "[$Index]",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
bits<3> Index;
bits<5> Re;
@@ -6973,7 +7574,8 @@ class NI_2VE_2op<bit q, bit u, bits<2> size, bits<4> opcode,
asmop # "\t$Rd." # ResS # ", $Rn." # OpS #
", $Re." # EleOpS # "[$Index]",
[],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
bits<3> Index;
bits<5> Re;
}
@@ -7012,9 +7614,11 @@ multiclass NI_2VE_v1_2op<bit u, bits<4> opcode, string asmop> {
}
}
+let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">;
defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">;
defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">;
+}
// Pattern for lane in 128-bit vector
class NI_2VE_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
@@ -7087,8 +7691,10 @@ multiclass NI_2VE_v2_2op<bit u, bits<4> opcode, string asmop> {
}
}
+let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">;
defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">;
+}
class NI_2VE_mul_lane_2d<Instruction INST, Operand OpImm, SDPatternOperator op,
RegisterOperand OpVPR, RegisterOperand EleOpVPR,
@@ -7172,7 +7778,7 @@ class NI_2VEswap_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
ValueType ResTy, ValueType OpTy,
SDPatternOperator coreop>
: Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))),
- (ResTy ResVPR:$src), (ResTy ResVPR:$Rn))),
+ (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
(INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>;
// Pattern for lane 0
@@ -7396,14 +8002,14 @@ multiclass NI_2VE_v3_2op<bit u, bits<4> opcode, string asmop> {
}
}
+let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">;
defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">;
defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">;
+}
def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
(FMOVdd $src)>;
-def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$src))),
- (FMOVss $src)>;
// Pattern for lane in 128-bit vector
class NI_2VEL2_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
@@ -7615,7 +8221,8 @@ class NeonI_REV<string asmop, string Res, bits<2> size, bit Q, bit U,
asmop # "\t$Rd." # Res # ", $Rn." # Res,
[(set (ResTy ResVPR:$Rd),
(ResTy (Neon_Rev (ResTy ResVPR:$Rn))))],
- NoItinerary> ;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def REV64_16b : NeonI_REV<"rev64", "16b", 0b00, 0b1, 0b0, 0b00000, VPR128,
v16i8, Neon_rev64>;
@@ -7654,42 +8261,48 @@ multiclass NeonI_PairwiseAdd<string asmop, bit U, bits<5> opcode,
asmop # "\t$Rd.8h, $Rn.16b",
[(set (v8i16 VPR128:$Rd),
(v8i16 (Neon_Padd (v16i8 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.4h, $Rn.8b",
[(set (v4i16 VPR64:$Rd),
(v4i16 (Neon_Padd (v8i8 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.4s, $Rn.8h",
[(set (v4i32 VPR128:$Rd),
(v4i32 (Neon_Padd (v8i16 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.2s, $Rn.4h",
[(set (v2i32 VPR64:$Rd),
(v2i32 (Neon_Padd (v4i16 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.2d, $Rn.4s",
[(set (v2i64 VPR128:$Rd),
(v2i64 (Neon_Padd (v4i32 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.1d, $Rn.2s",
[(set (v1i64 VPR64:$Rd),
(v1i64 (Neon_Padd (v2i32 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
defm SADDLP : NeonI_PairwiseAdd<"saddlp", 0b0, 0b00010,
@@ -7697,6 +8310,11 @@ defm SADDLP : NeonI_PairwiseAdd<"saddlp", 0b0, 0b00010,
defm UADDLP : NeonI_PairwiseAdd<"uaddlp", 0b1, 0b00010,
int_arm_neon_vpaddlu>;
+def : Pat<(v1i64 (int_aarch64_neon_saddlv (v2i32 VPR64:$Rn))),
+ (SADDLP2s1d $Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_uaddlv (v2i32 VPR64:$Rn))),
+ (UADDLP2s1d $Rn)>;
+
multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode,
SDPatternOperator Neon_Padd> {
let Constraints = "$src = $Rd" in {
@@ -7706,7 +8324,8 @@ multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode,
[(set (v8i16 VPR128:$Rd),
(v8i16 (Neon_Padd
(v8i16 VPR128:$src), (v16i8 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
(outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
@@ -7714,7 +8333,8 @@ multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode,
[(set (v4i16 VPR64:$Rd),
(v4i16 (Neon_Padd
(v4i16 VPR64:$src), (v8i8 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
@@ -7722,7 +8342,8 @@ multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode,
[(set (v4i32 VPR128:$Rd),
(v4i32 (Neon_Padd
(v4i32 VPR128:$src), (v8i16 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
(outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
@@ -7730,7 +8351,8 @@ multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode,
[(set (v2i32 VPR64:$Rd),
(v2i32 (Neon_Padd
(v2i32 VPR64:$src), (v4i16 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
@@ -7738,7 +8360,8 @@ multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode,
[(set (v2i64 VPR128:$Rd),
(v2i64 (Neon_Padd
(v2i64 VPR128:$src), (v4i32 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode,
(outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
@@ -7746,7 +8369,8 @@ multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode,
[(set (v1i64 VPR64:$Rd),
(v1i64 (Neon_Padd
(v1i64 VPR64:$src), (v2i32 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
}
}
@@ -7759,37 +8383,44 @@ multiclass NeonI_2VMisc_BHSDsize_1Arg<string asmop, bit U, bits<5> opcode> {
def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.16b, $Rn.16b",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.8h, $Rn.8h",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.4s, $Rn.4s",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.2d, $Rn.2d",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.8b, $Rn.8b",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.4h, $Rn.4h",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.2s, $Rn.2s",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
defm SQABS : NeonI_2VMisc_BHSDsize_1Arg<"sqabs", 0b0, 0b00111>;
@@ -7859,37 +8490,44 @@ multiclass NeonI_2VMisc_BHSDsize_2Args<string asmop, bit U, bits<5> opcode> {
def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
asmop # "\t$Rd.16b, $Rn.16b",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
asmop # "\t$Rd.8h, $Rn.8h",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
asmop # "\t$Rd.4s, $Rn.4s",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
asmop # "\t$Rd.2d, $Rn.2d",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
(outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
asmop # "\t$Rd.8b, $Rn.8b",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
(outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
asmop # "\t$Rd.4h, $Rn.4h",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
(outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
asmop # "\t$Rd.2s, $Rn.2s",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
}
}
@@ -7937,42 +8575,48 @@ multiclass NeonI_2VMisc_BHSsizes<string asmop, bit U,
asmop # "\t$Rd.16b, $Rn.16b",
[(set (v16i8 VPR128:$Rd),
(v16i8 (Neon_Op (v16i8 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 8h : NeonI_2VMisc<0b1, U, 0b01, 0b00100,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.8h, $Rn.8h",
[(set (v8i16 VPR128:$Rd),
(v8i16 (Neon_Op (v8i16 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 4s : NeonI_2VMisc<0b1, U, 0b10, 0b00100,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.4s, $Rn.4s",
[(set (v4i32 VPR128:$Rd),
(v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 8b : NeonI_2VMisc<0b0, U, 0b00, 0b00100,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.8b, $Rn.8b",
[(set (v8i8 VPR64:$Rd),
(v8i8 (Neon_Op (v8i8 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 4h : NeonI_2VMisc<0b0, U, 0b01, 0b00100,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.4h, $Rn.4h",
[(set (v4i16 VPR64:$Rd),
(v4i16 (Neon_Op (v4i16 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2s : NeonI_2VMisc<0b0, U, 0b10, 0b00100,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.2s, $Rn.2s",
[(set (v2i32 VPR64:$Rd),
(v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
defm CLS : NeonI_2VMisc_BHSsizes<"cls", 0b0, int_arm_neon_vcls>;
@@ -7983,12 +8627,14 @@ multiclass NeonI_2VMisc_Bsize<string asmop, bit U, bits<2> size,
def 16b : NeonI_2VMisc<0b1, U, size, Opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.16b, $Rn.16b",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 8b : NeonI_2VMisc<0b0, U, size, Opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.8b, $Rn.8b",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
defm CNT : NeonI_2VMisc_Bsize<"cnt", 0b0, 0b00, 0b00101>;
@@ -8046,21 +8692,24 @@ multiclass NeonI_2VMisc_SDsizes<string asmop, bit U, bits<5> opcode,
asmop # "\t$Rd.4s, $Rn.4s",
[(set (v4f32 VPR128:$Rd),
(v4f32 (Neon_Op (v4f32 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.2d, $Rn.2d",
[(set (v2f64 VPR128:$Rd),
(v2f64 (Neon_Op (v2f64 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.2s, $Rn.2s",
[(set (v2f32 VPR64:$Rd),
(v2f32 (Neon_Op (v2f32 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
defm FABS : NeonI_2VMisc_SDsizes<"fabs", 0b0, 0b01111, fabs>;
@@ -8070,33 +8719,39 @@ multiclass NeonI_2VMisc_HSD_Narrow<string asmop, bit U, bits<5> opcode> {
def 8h8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
(outs VPR64:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.8b, $Rn.8h",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 4s4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
(outs VPR64:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.4h, $Rn.4s",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2d2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
(outs VPR64:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.2s, $Rn.2d",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
let Constraints = "$Rd = $src" in {
def 8h16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
asmop # "2\t$Rd.16b, $Rn.8h",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 4s8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
asmop # "2\t$Rd.8h, $Rn.4s",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 2d4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
asmop # "2\t$Rd.4s, $Rn.2d",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
}
}
@@ -8149,37 +8804,43 @@ multiclass NeonI_2VMisc_SHIFT<string asmop, bit U, bits<5> opcode> {
(outs VPR128:$Rd),
(ins VPR64:$Rn, uimm_exact8:$Imm),
asmop # "\t$Rd.8h, $Rn.8b, $Imm",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 4h4s : NeonI_2VMisc<0b0, U, 0b01, opcode,
(outs VPR128:$Rd),
(ins VPR64:$Rn, uimm_exact16:$Imm),
asmop # "\t$Rd.4s, $Rn.4h, $Imm",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2s2d : NeonI_2VMisc<0b0, U, 0b10, opcode,
(outs VPR128:$Rd),
(ins VPR64:$Rn, uimm_exact32:$Imm),
asmop # "\t$Rd.2d, $Rn.2s, $Imm",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
(outs VPR128:$Rd),
(ins VPR128:$Rn, uimm_exact8:$Imm),
asmop # "2\t$Rd.8h, $Rn.16b, $Imm",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
(outs VPR128:$Rd),
(ins VPR128:$Rn, uimm_exact16:$Imm),
asmop # "2\t$Rd.4s, $Rn.8h, $Imm",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
(outs VPR128:$Rd),
(ins VPR128:$Rn, uimm_exact32:$Imm),
asmop # "2\t$Rd.2d, $Rn.4s, $Imm",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
}
@@ -8227,23 +8888,27 @@ multiclass NeonI_2VMisc_SD_Narrow<string asmop, bit U, bits<5> opcode> {
def 4s4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
(outs VPR64:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.4h, $Rn.4s",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
(outs VPR64:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.2s, $Rn.2d",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
let Constraints = "$src = $Rd" in {
def 4s8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
asmop # "2\t$Rd.8h, $Rn.4s",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
asmop # "2\t$Rd.4s, $Rn.2d",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
}
}
@@ -8281,21 +8946,23 @@ multiclass NeonI_2VMisc_D_Narrow<string asmop, string prefix, bit U,
def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
(outs VPR64:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.2s, $Rn.2d",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
asmop # "2\t$Rd.4s, $Rn.2d",
- [], NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
}
- def : Pat<(v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))),
+ def : Pat<(v2f32 (int_aarch64_neon_vcvtxn (v2f64 VPR128:$Rn))),
(!cast<Instruction>(prefix # "2d2s") VPR128:$Rn)>;
def : Pat<(v4f32 (concat_vectors
(v2f32 VPR64:$src),
- (v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))))),
+ (v2f32 (int_aarch64_neon_vcvtxn (v2f64 VPR128:$Rn))))),
(!cast<Instruction>(prefix # "2d4s")
(v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
VPR128:$Rn)>;
@@ -8310,22 +8977,26 @@ multiclass NeonI_2VMisc_HS_Extend<string asmop, bit U, bits<5> opcode> {
def 4h4s : NeonI_2VMisc<0b0, U, 0b00, opcode,
(outs VPR128:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.4s, $Rn.4h",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2s2d : NeonI_2VMisc<0b0, U, 0b01, opcode,
(outs VPR128:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.2d, $Rn.2s",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 8h4s : NeonI_2VMisc<0b1, U, 0b00, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "2\t$Rd.4s, $Rn.8h",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 4s2d : NeonI_2VMisc<0b1, U, 0b01, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "2\t$Rd.2d, $Rn.4s",
- [], NoItinerary>;
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
defm FCVTL : NeonI_2VMisc_HS_Extend<"fcvtl", 0b0, 0b10111>;
@@ -8361,21 +9032,24 @@ multiclass NeonI_2VMisc_SD_Conv<string asmop, bit Size, bit U, bits<5> opcode,
asmop # "\t$Rd.4s, $Rn.4s",
[(set (ResTy4s VPR128:$Rd),
(ResTy4s (Neon_Op (OpTy4s VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2d : NeonI_2VMisc<0b1, U, {Size, 0b1}, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn),
asmop # "\t$Rd.2d, $Rn.2d",
[(set (ResTy2d VPR128:$Rd),
(ResTy2d (Neon_Op (OpTy2d VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.2s, $Rn.2s",
[(set (ResTy2s VPR64:$Rd),
(ResTy2s (Neon_Op (OpTy2s VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
multiclass NeonI_2VMisc_fp_to_int<string asmop, bit Size, bit U,
@@ -8385,23 +9059,23 @@ multiclass NeonI_2VMisc_fp_to_int<string asmop, bit Size, bit U,
}
defm FCVTNS : NeonI_2VMisc_fp_to_int<"fcvtns", 0b0, 0b0, 0b11010,
- int_aarch64_neon_fcvtns>;
+ int_arm_neon_vcvtns>;
defm FCVTNU : NeonI_2VMisc_fp_to_int<"fcvtnu", 0b0, 0b1, 0b11010,
- int_aarch64_neon_fcvtnu>;
+ int_arm_neon_vcvtnu>;
defm FCVTPS : NeonI_2VMisc_fp_to_int<"fcvtps", 0b1, 0b0, 0b11010,
- int_aarch64_neon_fcvtps>;
+ int_arm_neon_vcvtps>;
defm FCVTPU : NeonI_2VMisc_fp_to_int<"fcvtpu", 0b1, 0b1, 0b11010,
- int_aarch64_neon_fcvtpu>;
+ int_arm_neon_vcvtpu>;
defm FCVTMS : NeonI_2VMisc_fp_to_int<"fcvtms", 0b0, 0b0, 0b11011,
- int_aarch64_neon_fcvtms>;
+ int_arm_neon_vcvtms>;
defm FCVTMU : NeonI_2VMisc_fp_to_int<"fcvtmu", 0b0, 0b1, 0b11011,
- int_aarch64_neon_fcvtmu>;
+ int_arm_neon_vcvtmu>;
defm FCVTZS : NeonI_2VMisc_fp_to_int<"fcvtzs", 0b1, 0b0, 0b11011, fp_to_sint>;
defm FCVTZU : NeonI_2VMisc_fp_to_int<"fcvtzu", 0b1, 0b1, 0b11011, fp_to_uint>;
defm FCVTAS : NeonI_2VMisc_fp_to_int<"fcvtas", 0b0, 0b0, 0b11100,
- int_aarch64_neon_fcvtas>;
+ int_arm_neon_vcvtas>;
defm FCVTAU : NeonI_2VMisc_fp_to_int<"fcvtau", 0b0, 0b1, 0b11100,
- int_aarch64_neon_fcvtau>;
+ int_arm_neon_vcvtau>;
multiclass NeonI_2VMisc_int_to_fp<string asmop, bit Size, bit U,
bits<5> opcode, SDPatternOperator Neon_Op> {
@@ -8430,7 +9104,9 @@ defm FRECPE : NeonI_2VMisc_fp_to_fp<"frecpe", 0b1, 0b0, 0b11101,
int_arm_neon_vrecpe>;
defm FRSQRTE : NeonI_2VMisc_fp_to_fp<"frsqrte", 0b1, 0b1, 0b11101,
int_arm_neon_vrsqrte>;
+let SchedRW = [WriteFPSqrt, ReadFPSqrt] in {
defm FSQRT : NeonI_2VMisc_fp_to_fp<"fsqrt", 0b1, 0b1, 0b11111, fsqrt>;
+}
multiclass NeonI_2VMisc_S_Conv<string asmop, bit Size, bit U,
bits<5> opcode, SDPatternOperator Neon_Op> {
@@ -8439,14 +9115,16 @@ multiclass NeonI_2VMisc_S_Conv<string asmop, bit Size, bit U,
asmop # "\t$Rd.4s, $Rn.4s",
[(set (v4i32 VPR128:$Rd),
(v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn),
asmop # "\t$Rd.2s, $Rn.2s",
[(set (v2i32 VPR64:$Rd),
(v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
}
defm URECPE : NeonI_2VMisc_S_Conv<"urecpe", 0b1, 0b0, 0b11100,
@@ -8463,7 +9141,8 @@ class NeonI_Cryptoaes_2v<bits<2> size, bits<5> opcode,
[(set (v16i8 VPR128:$Rd),
(v16i8 (opnode (v16i8 VPR128:$src),
(v16i8 VPR128:$Rn))))],
- NoItinerary>{
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
let Predicates = [HasNEON, HasCrypto];
}
@@ -8478,7 +9157,8 @@ class NeonI_Cryptoaes<bits<2> size, bits<5> opcode,
asmop # "\t$Rd.16b, $Rn.16b",
[(set (v16i8 VPR128:$Rd),
(v16i8 (opnode (v16i8 VPR128:$Rn))))],
- NoItinerary>;
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]>;
def AESMC : NeonI_Cryptoaes<0b00, 0b00110, "aesmc", int_arm_neon_aesmc>;
def AESIMC : NeonI_Cryptoaes<0b00, 0b00111, "aesimc", int_arm_neon_aesimc>;
@@ -8491,7 +9171,8 @@ class NeonI_Cryptosha_vv<bits<2> size, bits<5> opcode,
[(set (v4i32 VPR128:$Rd),
(v4i32 (opnode (v4i32 VPR128:$src),
(v4i32 VPR128:$Rn))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
let Predicates = [HasNEON, HasCrypto];
}
@@ -8506,13 +9187,16 @@ class NeonI_Cryptosha_ss<bits<2> size, bits<5> opcode,
: NeonI_Crypto_SHA<size, opcode,
(outs FPR32:$Rd), (ins FPR32:$Rn),
asmop # "\t$Rd, $Rn",
- [(set (v1i32 FPR32:$Rd),
- (v1i32 (opnode (v1i32 FPR32:$Rn))))],
- NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU]> {
let Predicates = [HasNEON, HasCrypto];
+ let hasSideEffects = 0;
}
def SHA1H : NeonI_Cryptosha_ss<0b00, 0b00000, "sha1h", int_arm_neon_sha1h>;
+def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)),
+ (COPY_TO_REGCLASS (SHA1H (COPY_TO_REGCLASS i32:$Rn, FPR32)), GPR32)>;
+
class NeonI_Cryptosha3_vvv<bits<2> size, bits<3> opcode, string asmop,
SDPatternOperator opnode>
@@ -8524,7 +9208,8 @@ class NeonI_Cryptosha3_vvv<bits<2> size, bits<3> opcode, string asmop,
(v4i32 (opnode (v4i32 VPR128:$src),
(v4i32 VPR128:$Rn),
(v4i32 VPR128:$Rm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
let Predicates = [HasNEON, HasCrypto];
}
@@ -8544,7 +9229,8 @@ class NeonI_Cryptosha3_qqv<bits<2> size, bits<3> opcode, string asmop,
(v4i32 (opnode (v4i32 FPR128:$src),
(v4i32 FPR128:$Rn),
(v4i32 VPR128:$Rm))))],
- NoItinerary> {
+ NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
let Predicates = [HasNEON, HasCrypto];
}
@@ -8554,29 +9240,145 @@ def SHA256H : NeonI_Cryptosha3_qqv<0b00, 0b100, "sha256h",
def SHA256H2 : NeonI_Cryptosha3_qqv<0b00, 0b101, "sha256h2",
int_arm_neon_sha256h2>;
-class NeonI_Cryptosha3_qsv<bits<2> size, bits<3> opcode, string asmop,
- SDPatternOperator opnode>
+class NeonI_Cryptosha3_qsv<bits<2> size, bits<3> opcode, string asmop>
: NeonI_Crypto_3VSHA<size, opcode,
(outs FPR128:$Rd),
(ins FPR128:$src, FPR32:$Rn, VPR128:$Rm),
asmop # "\t$Rd, $Rn, $Rm.4s",
- [(set (v4i32 FPR128:$Rd),
- (v4i32 (opnode (v4i32 FPR128:$src),
- (v1i32 FPR32:$Rn),
- (v4i32 VPR128:$Rm))))],
- NoItinerary> {
+ [], NoItinerary>,
+ Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
let Constraints = "$src = $Rd";
+ let hasSideEffects = 0;
let Predicates = [HasNEON, HasCrypto];
}
-def SHA1C : NeonI_Cryptosha3_qsv<0b00, 0b000, "sha1c", int_aarch64_neon_sha1c>;
-def SHA1P : NeonI_Cryptosha3_qsv<0b00, 0b001, "sha1p", int_aarch64_neon_sha1p>;
-def SHA1M : NeonI_Cryptosha3_qsv<0b00, 0b010, "sha1m", int_aarch64_neon_sha1m>;
+def SHA1C : NeonI_Cryptosha3_qsv<0b00, 0b000, "sha1c">;
+def SHA1P : NeonI_Cryptosha3_qsv<0b00, 0b001, "sha1p">;
+def SHA1M : NeonI_Cryptosha3_qsv<0b00, 0b010, "sha1m">;
+
+def : Pat<(int_arm_neon_sha1c v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk),
+ (SHA1C v4i32:$hash_abcd,
+ (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>;
+def : Pat<(int_arm_neon_sha1m v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk),
+ (SHA1M v4i32:$hash_abcd,
+ (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>;
+def : Pat<(int_arm_neon_sha1p v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk),
+ (SHA1P v4i32:$hash_abcd,
+ (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>;
+
+// Additional patterns to match shl to USHL.
+def : Pat<(v8i8 (shl (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
+ (USHLvvv_8B $Rn, $Rm)>;
+def : Pat<(v4i16 (shl (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
+ (USHLvvv_4H $Rn, $Rm)>;
+def : Pat<(v2i32 (shl (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
+ (USHLvvv_2S $Rn, $Rm)>;
+def : Pat<(v1i64 (shl (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
+ (USHLddd $Rn, $Rm)>;
+def : Pat<(v16i8 (shl (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
+ (USHLvvv_16B $Rn, $Rm)>;
+def : Pat<(v8i16 (shl (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
+ (USHLvvv_8H $Rn, $Rm)>;
+def : Pat<(v4i32 (shl (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
+ (USHLvvv_4S $Rn, $Rm)>;
+def : Pat<(v2i64 (shl (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
+ (USHLvvv_2D $Rn, $Rm)>;
+
+def : Pat<(v1i8 (shl (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
+ (EXTRACT_SUBREG
+ (USHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
+ (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)),
+ sub_8)>;
+def : Pat<(v1i16 (shl (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+ (EXTRACT_SUBREG
+ (USHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
+ (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)),
+ sub_16)>;
+def : Pat<(v1i32 (shl (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+ (EXTRACT_SUBREG
+ (USHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+ (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
+ sub_32)>;
+
+// Additional patterns to match sra, srl.
+// For a vector right shift by a vector, the shift amounts given to SSHL/USHL
+// must be negative, so negate the vector of shift amounts first.
+def : Pat<(v8i8 (srl (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
+ (USHLvvv_8B $Rn, (NEG8b $Rm))>;
+def : Pat<(v4i16 (srl (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
+ (USHLvvv_4H $Rn, (NEG4h $Rm))>;
+def : Pat<(v2i32 (srl (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
+ (USHLvvv_2S $Rn, (NEG2s $Rm))>;
+def : Pat<(v1i64 (srl (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
+ (USHLddd $Rn, (NEGdd $Rm))>;
+def : Pat<(v16i8 (srl (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
+ (USHLvvv_16B $Rn, (NEG16b $Rm))>;
+def : Pat<(v8i16 (srl (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
+ (USHLvvv_8H $Rn, (NEG8h $Rm))>;
+def : Pat<(v4i32 (srl (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
+ (USHLvvv_4S $Rn, (NEG4s $Rm))>;
+def : Pat<(v2i64 (srl (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
+ (USHLvvv_2D $Rn, (NEG2d $Rm))>;
+
+def : Pat<(v1i8 (srl (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
+ (EXTRACT_SUBREG
+ (USHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
+ (NEG8b (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8))),
+ sub_8)>;
+def : Pat<(v1i16 (srl (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+ (EXTRACT_SUBREG
+ (USHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
+ (NEG4h (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16))),
+ sub_16)>;
+def : Pat<(v1i32 (srl (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+ (EXTRACT_SUBREG
+ (USHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+ (NEG2s (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32))),
+ sub_32)>;
+
+def : Pat<(v8i8 (sra (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
+ (SSHLvvv_8B $Rn, (NEG8b $Rm))>;
+def : Pat<(v4i16 (sra (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
+ (SSHLvvv_4H $Rn, (NEG4h $Rm))>;
+def : Pat<(v2i32 (sra (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
+ (SSHLvvv_2S $Rn, (NEG2s $Rm))>;
+def : Pat<(v1i64 (sra (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
+ (SSHLddd $Rn, (NEGdd $Rm))>;
+def : Pat<(v16i8 (sra (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
+ (SSHLvvv_16B $Rn, (NEG16b $Rm))>;
+def : Pat<(v8i16 (sra (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
+ (SSHLvvv_8H $Rn, (NEG8h $Rm))>;
+def : Pat<(v4i32 (sra (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
+ (SSHLvvv_4S $Rn, (NEG4s $Rm))>;
+def : Pat<(v2i64 (sra (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
+ (SSHLvvv_2D $Rn, (NEG2d $Rm))>;
+
+def : Pat<(v1i8 (sra (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
+ (EXTRACT_SUBREG
+ (SSHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
+ (NEG8b (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8))),
+ sub_8)>;
+def : Pat<(v1i16 (sra (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+ (EXTRACT_SUBREG
+ (SSHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
+ (NEG4h (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16))),
+ sub_16)>;
+def : Pat<(v1i32 (sra (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+ (EXTRACT_SUBREG
+ (SSHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+ (NEG2s (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32))),
+ sub_32)>;
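As a rough sanity check on the negate-then-shift trick above: SSHL/USHL shift left for positive amounts and right for negative ones, so a right shift by s becomes a shift by -s. The C++ sketch below models a single 8-bit lane under that assumption; it is illustrative only (it ignores the out-of-range shift-amount behaviour of the real instructions) and is not part of the patch.

    #include <cassert>
    #include <cstdint>

    // One 8-bit lane of USHL/SSHL: positive amounts shift left, negative shift right.
    uint8_t ushlLane(uint8_t x, int8_t amt) {
      return amt >= 0 ? static_cast<uint8_t>(x << amt) : static_cast<uint8_t>(x >> -amt);
    }
    int8_t sshlLane(int8_t x, int8_t amt) {
      return amt >= 0 ? static_cast<int8_t>(x << amt) : static_cast<int8_t>(x >> -amt);
    }

    int main() {
      assert(ushlLane(0xF0, -4) == (0xF0u >> 4));               // srl x,4 == ushl x,-4
      assert(sshlLane(int8_t(0x90), -4) == (int8_t(0x90) >> 4)); // sra x,4 == sshl x,-4
      return 0;
    }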
//
// Patterns for handling half-precision values
//
+// Convert between f16 value and f32 value
+def : Pat<(f32 (f16_to_f32 (i32 GPR32:$Rn))),
+ (FCVTsh (EXTRACT_SUBREG (FMOVsw $Rn), sub_16))>;
+def : Pat<(i32 (f32_to_f16 (f32 FPR32:$Rn))),
+ (FMOVws (SUBREG_TO_REG (i64 0), (f16 (FCVThs $Rn)), sub_16))>;
+
// Convert f16 value coming in as i16 value to f32
def : Pat<(f32 (f16_to_f32 (i32 (and (i32 GPR32:$Rn), 65535)))),
(FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>;
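For context, f16_to_f32 takes the half-precision value in the low 16 bits of an i32 (hence the and-with-65535 form in the pattern just above) and widens it; the FCVT instruction does this in hardware. Purely as an illustration of the bit layout, and only for normal finite values (no denormals, infinities or NaNs), the widening could be sketched in C++ as:

    #include <cstdint>
    #include <cstring>

    // Sketch only: widen an IEEE-754 half stored in the low 16 bits of an i32.
    // Normal, finite inputs only; the real conversion is done by FCVT.
    float halfToFloatNormalOnly(uint32_t raw) {
      uint16_t h = raw & 0xFFFF;            // the "and 65535" in the pattern above
      uint32_t sign = (h >> 15) & 0x1;
      uint32_t exp  = (h >> 10) & 0x1F;     // exponent, biased by 15
      uint32_t frac = h & 0x3FF;
      uint32_t bits = (sign << 31) | ((exp - 15 + 127) << 23) | (frac << 13);
      float f;
      std::memcpy(&f, &bits, sizeof f);
      return f;
    }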
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 8cfb968..3842bfd 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -19,11 +19,11 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Target/Mangler.h"
using namespace llvm;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 75ec44f..06e1ffb 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -18,11 +18,11 @@
#include "AArch64MachineFunctionInfo.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/ADT/BitVector.h"
#define GET_REGINFO_TARGET_DESC
#include "AArch64GenRegisterInfo.inc"
@@ -76,6 +76,15 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+static bool hasFrameOffset(int opcode) {
+ return opcode != AArch64::LD1x2_8B && opcode != AArch64::LD1x3_8B &&
+ opcode != AArch64::LD1x4_8B && opcode != AArch64::ST1x2_8B &&
+ opcode != AArch64::ST1x3_8B && opcode != AArch64::ST1x4_8B &&
+ opcode != AArch64::LD1x2_16B && opcode != AArch64::LD1x3_16B &&
+ opcode != AArch64::LD1x4_16B && opcode != AArch64::ST1x2_16B &&
+ opcode != AArch64::ST1x3_16B && opcode != AArch64::ST1x4_16B;
+}
+
void
AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
int SPAdj,
@@ -110,8 +119,10 @@ AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
int64_t Offset;
Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj,
IsCalleeSaveOp);
-
- Offset += MI.getOperand(FIOperandNum + 1).getImm();
+ // A vector load/store instruction doesn't have an offset operand.
+ bool HasOffsetOp = hasFrameOffset(MI.getOpcode());
+ if (HasOffsetOp)
+ Offset += MI.getOperand(FIOperandNum + 1).getImm();
// DBG_VALUE instructions have no real restrictions so they can be handled
// easily.
@@ -124,7 +135,7 @@ AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
const AArch64InstrInfo &TII =
*static_cast<const AArch64InstrInfo*>(MF.getTarget().getInstrInfo());
int MinOffset, MaxOffset, OffsetScale;
- if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s) {
+ if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s || !HasOffsetOp) {
MinOffset = 0;
MaxOffset = 0xfff;
OffsetScale = 1;
@@ -133,10 +144,12 @@ AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
TII.getAddressConstraints(MI, OffsetScale, MinOffset, MaxOffset);
}
- // The frame lowering has told us a base and offset it thinks we should use to
- // access this variable, but it's still up to us to make sure the values are
- // legal for the instruction in question.
- if (Offset % OffsetScale != 0 || Offset < MinOffset || Offset > MaxOffset) {
+  // There are two situations in which we don't use frame + offset directly
+  // in the instruction:
+  // (1) The offset can't be encoded as a scaled, in-range immediate.
+  // (2) The instruction has no offset operand to encode a nonzero offset in.
+ if ((Offset % OffsetScale != 0 || Offset < MinOffset || Offset > MaxOffset) ||
+ (!HasOffsetOp && Offset != 0)) {
unsigned BaseReg =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
emitRegUpdate(MBB, MBBI, MBBI->getDebugLoc(), TII,
@@ -150,7 +163,8 @@ AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
assert(Offset >= 0 && "Unexpected negative offset from SP");
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, true);
- MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset / OffsetScale);
+ if (HasOffsetOp)
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset / OffsetScale);
}
unsigned
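The register-info change above boils down to one rule: the LD1x*/ST1x* multi-register forms have no offset operand, so a frame access through them can only use a bare base register, and any nonzero offset must first be added into a scratch register. A self-contained, simplified C++ model of that decision (the names here are made up for illustration; the real logic is in eliminateFrameIndex) is:

    #include <cassert>
    #include <cstdint>

    // Hypothetical, simplified model of the decision added above.
    struct FrameAccess {
      bool HasOffsetOp;                         // false for LD1x2/3/4 and ST1x2/3/4
      int64_t MinOffset, MaxOffset, OffsetScale;
    };

    bool needsScratchBase(const FrameAccess &A, int64_t Offset) {
      if (!A.HasOffsetOp)
        return Offset != 0;                     // no operand to encode it in
      return Offset % A.OffsetScale != 0 || Offset < A.MinOffset ||
             Offset > A.MaxOffset;              // immediate would not encode
    }

    int main() {
      FrameAccess LD1x2 = {false, 0, 0, 1};              // vector load, no offset operand
      FrameAccess ScaledLoad = {true, 0, 0xFFF * 8, 8};  // hypothetical scaled load
      assert(needsScratchBase(LD1x2, 16));       // must materialize FrameReg + 16
      assert(!needsScratchBase(LD1x2, 0));       // bare FrameReg is fine
      assert(!needsScratchBase(ScaledLoad, 64)); // 64 is in range and a multiple of 8
      return 0;
    }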
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index 4e2022c..9de7abd 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -30,7 +30,6 @@ def dsub_0 : SubRegIndex<64>;
def dsub_1 : SubRegIndex<64, 64>;
def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
-def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>;
}
// Registers are identified with 5-bit ID numbers.
@@ -147,7 +146,7 @@ foreach Index = 0-31 in {
}
-def FPR8 : RegisterClass<"AArch64", [i8, v1i8], 8,
+def FPR8 : RegisterClass<"AArch64", [v1i8], 8,
(sequence "B%u", 0, 31)> {
}
@@ -155,7 +154,7 @@ def FPR16 : RegisterClass<"AArch64", [f16, v1i16], 16,
(sequence "H%u", 0, 31)> {
}
-def FPR32 : RegisterClass<"AArch64", [f32, v1i32, v1f32], 32,
+def FPR32 : RegisterClass<"AArch64", [f32, v1i32], 32,
(sequence "S%u", 0, 31)> {
}
@@ -206,7 +205,7 @@ def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
//===----------------------------------------------------------------------===//
// Consecutive vector registers
//===----------------------------------------------------------------------===//
-// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D30_D31
+// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D31_D0
def Tuples2D : RegisterTuples<[dsub_0, dsub_1],
[(rotl FPR64, 0), (rotl FPR64, 1)]>;
@@ -288,4 +287,4 @@ multiclass VectorList_BHSD<string PREFIX, int Count, RegisterClass DRegList,
defm VOne : VectorList_BHSD<"VOne", 1, FPR64, FPR128>;
defm VPair : VectorList_BHSD<"VPair", 2, DPair, QPair>;
defm VTriple : VectorList_BHSD<"VTriple", 3, DTriple, QTriple>;
-defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>;
\ No newline at end of file
+defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>;
diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td
index e17cdaa..ec8450b 100644
--- a/lib/Target/AArch64/AArch64Schedule.td
+++ b/lib/Target/AArch64/AArch64Schedule.td
@@ -7,4 +7,74 @@
//
//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Generic processor itineraries for legacy compatibility.
+
def GenericItineraries : ProcessorItineraries<[], [], []>;
+
+
+//===----------------------------------------------------------------------===//
+// Base SchedReadWrite types
+
+// Basic ALU
+def WriteALU : SchedWrite; // Generic: may contain shift and/or ALU operation
+def WriteALUs : SchedWrite; // Shift only with no ALU operation
+def ReadALU : SchedRead; // Operand not needed for shifting
+def ReadALUs : SchedRead; // Operand needed for shifting
+
+// Multiply with optional accumulate
+def WriteMAC : SchedWrite;
+def ReadMAC : SchedRead;
+
+// Compares
+def WriteCMP : SchedWrite;
+def ReadCMP : SchedRead;
+
+// Division
+def WriteDiv : SchedWrite;
+def ReadDiv : SchedRead;
+
+// Loads
+def WriteLd : SchedWrite;
+def WritePreLd : SchedWrite;
+def WriteVecLd : SchedWrite;
+def ReadLd : SchedRead;
+def ReadPreLd : SchedRead;
+def ReadVecLd : SchedRead;
+
+// Stores
+def WriteSt : SchedWrite;
+def WriteVecSt : SchedWrite;
+def ReadSt : SchedRead;
+def ReadVecSt : SchedRead;
+
+// Branches
+def WriteBr : SchedWrite;
+def WriteBrL : SchedWrite;
+def ReadBr : SchedRead;
+
+// Floating Point ALU
+def WriteFPALU : SchedWrite;
+def ReadFPALU : SchedRead;
+
+// Floating Point MAC, Mul, Div, Sqrt
+// Most processors will simply send all of these down a dedicated pipe, but
+// they're explicitly separated here for flexibility of modeling later. May
+// consider consolidating them into a single WriteFPXXXX type in the future.
+def WriteFPMAC : SchedWrite;
+def WriteFPMul : SchedWrite;
+def WriteFPDiv : SchedWrite;
+def WriteFPSqrt : SchedWrite;
+def ReadFPMAC : SchedRead;
+def ReadFPMul : SchedRead;
+def ReadFPDiv : SchedRead;
+def ReadFPSqrt : SchedRead;
+
+// Noop
+def WriteNoop : SchedWrite;
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget specific Machine Models.
+
+include "AArch64ScheduleA53.td"
diff --git a/lib/Target/AArch64/AArch64ScheduleA53.td b/lib/Target/AArch64/AArch64ScheduleA53.td
new file mode 100644
index 0000000..20a14e7
--- /dev/null
+++ b/lib/Target/AArch64/AArch64ScheduleA53.td
@@ -0,0 +1,144 @@
+//=- AArch64ScheduleA53.td - ARM Cortex-A53 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM Cortex-A53 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See MCSchedModel.h for details.
+
+// Cortex-A53 machine model for scheduling and other instruction cost heuristics.
+def CortexA53Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+  let MinLatency = 1; // OperandCycles are interpreted as MinLatency.
+ let LoadLatency = 2; // Optimistic load latency assuming bypass.
+                             // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+ let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
+ // Specification - Instruction Timings"
+ // v 1.0 Spreadsheet
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Modeling each pipeline as a ProcResource using the default BufferSize = -1.
+// Cortex-A53 is in-order and therefore should be using BufferSize = 0. The
+// current configuration performs better with the basic latencies provided so
+// far. Will revisit BufferSize once the latency information is more accurate.
+
+let SchedModel = CortexA53Model in {
+
+def A53UnitALU : ProcResource<2>; // Int ALU
+def A53UnitMAC : ProcResource<1>; // Int MAC
+def A53UnitDiv : ProcResource<1>; // Int Division
+def A53UnitLdSt : ProcResource<1>; // Load/Store
+def A53UnitB : ProcResource<1>; // Branch
+def A53UnitFPALU : ProcResource<1>; // FP ALU
+def A53UnitFPMDS : ProcResource<1>; // FP Mult/Div/Sqrt
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types which both map the ProcResources and
+// set the latency.
+
+// Issue - Every instruction must consume an A53WriteIssue. Optionally,
+// instructions that cannot be dual-issued will also include the
+// A53WriteIssue2nd in their SchedRW list. That second WriteRes will
+// ensure that a second issue slot is consumed.
+def A53WriteIssue : SchedWriteRes<[]>;
+def A53WriteIssue2nd : SchedWriteRes<[]> { let Latency = 0; }
+
+// ALU - These are reduced to 1 despite a true latency of 4 in order to easily
+// model forwarding logic. Once forwarding is properly modelled, then
+// they'll be corrected.
+def : WriteRes<WriteALU, [A53UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteALUs, [A53UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteCMP, [A53UnitALU]> { let Latency = 1; }
+
+// MAC
+def : WriteRes<WriteMAC, [A53UnitMAC]> { let Latency = 4; }
+
+// Div
+def : WriteRes<WriteDiv, [A53UnitDiv]> { let Latency = 4; }
+
+// Load - Note: Vector loads take 1-5 cycles to issue. For WriteVecLd below,
+// we choose the median of 3, which makes the latency 6. We may model this
+// more carefully in the future.
+def : WriteRes<WriteLd, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WritePreLd, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteVecLd, [A53UnitLdSt]> { let Latency = 6; }
+
+// Store - Note: Vector stores take 1-3 cycles to issue. For WriteVecSt below,
+// we choose the median of 2, which makes the latency 5. We may model this
+// more carefully in the future.
+def : WriteRes<WriteSt, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteVecSt, [A53UnitLdSt]> { let Latency = 5; }
+
+// Branch
+def : WriteRes<WriteBr, [A53UnitB]>;
+def : WriteRes<WriteBrL, [A53UnitB]>;
+
+// FP ALU
+def : WriteRes<WriteFPALU, [A53UnitFPALU]> { let Latency = 6; }
+
+// FP MAC, Mul, Div, Sqrt
+// Using double-precision numbers for now as a worst case. Additionally, not
+// modeling the exact hazard but instead treating the whole pipe as a hazard.
+// As an example, VMUL, VMLA, and others are actually pipelined. VDIV and VSQRT
+// have total latencies of 33 and 32 respectively, but hazards of only 29 and
+// 28 (double-precision example).
+def : WriteRes<WriteFPMAC, [A53UnitFPMDS]> { let Latency = 10; }
+def : WriteRes<WriteFPMul, [A53UnitFPMDS]> { let Latency = 6; }
+def : WriteRes<WriteFPDiv, [A53UnitFPMDS]> { let Latency = 33;
+ let ResourceCycles = [29]; }
+def : WriteRes<WriteFPSqrt, [A53UnitFPMDS]> { let Latency = 32;
+ let ResourceCycles = [28]; }
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+// No forwarding defined for ReadALU yet.
+def : ReadAdvance<ReadALU, 0>;
+
+// No forwarding defined for ReadCMP yet.
+def : ReadAdvance<ReadCMP, 0>;
+
+// No forwarding defined for ReadBr yet.
+def : ReadAdvance<ReadBr, 0>;
+
+// No forwarding defined for ReadMAC yet.
+def : ReadAdvance<ReadMAC, 0>;
+
+// No forwarding defined for ReadDiv yet.
+def : ReadAdvance<ReadDiv, 0>;
+
+// No forwarding defined for ReadLd, ReadPreLd, ReadVecLd yet.
+def : ReadAdvance<ReadLd, 0>;
+def : ReadAdvance<ReadPreLd, 0>;
+def : ReadAdvance<ReadVecLd, 0>;
+
+// No forwarding defined for ReadSt and ReadVecSt yet.
+def : ReadAdvance<ReadSt, 0>;
+def : ReadAdvance<ReadVecSt, 0>;
+
+// No forwarding defined for ReadFPALU yet.
+def : ReadAdvance<ReadFPALU, 0>;
+
+// No forwarding defined for ReadFPMAC/Mul/Div/Sqrt yet.
+def : ReadAdvance<ReadFPMAC, 0>;
+def : ReadAdvance<ReadFPMul, 0>;
+def : ReadAdvance<ReadFPDiv, 0>;
+def : ReadAdvance<ReadFPSqrt, 0>;
+
+}
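
Editorial note (not part of the patch): the values defined above surface at compile time through the generated MCSchedModel, which MachineScheduler consults when building the schedule. A minimal C++ sketch of inspecting the resolved model, using field names from llvm/MC/MCSchedule.h; the helper function itself is illustrative:

#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Print the headline numbers of a resolved scheduling model, e.g. the one
// produced from CortexA53Model above.
static void dumpSchedModel(const MCSchedModel &SM) {
  errs() << "issue width: " << SM.IssueWidth                       // 2 micro-ops/cycle
         << ", load latency: " << SM.LoadLatency                   // 2, assumes bypass
         << ", mispredict penalty: " << SM.MispredictPenalty << "\n"; // 9
}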
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 5c693c1..9140bbd 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -14,10 +14,10 @@
#include "AArch64Subtarget.h"
#include "AArch64RegisterInfo.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/GlobalValue.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
@@ -28,9 +28,11 @@ using namespace llvm;
// Pin the vtable to this file.
void AArch64Subtarget::anchor() {}
-AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS)
- : AArch64GenSubtargetInfo(TT, CPU, FS), HasFPARMv8(false), HasNEON(false),
- HasCrypto(false), TargetTriple(TT), CPUString(CPU) {
+AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS,
+ bool LittleEndian)
+ : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
+ HasFPARMv8(false), HasNEON(false), HasCrypto(false), TargetTriple(TT),
+ CPUString(CPU), IsLittleEndian(LittleEndian) {
initializeSubtargetFeatures(CPU, FS);
}
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index bbfd3bc..68c6c4b 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -29,6 +29,11 @@ class GlobalValue;
class AArch64Subtarget : public AArch64GenSubtargetInfo {
virtual void anchor();
protected:
+ enum ARMProcFamilyEnum {Others, CortexA53, CortexA57};
+
+ /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
+ ARMProcFamilyEnum ARMProcFamily;
+
bool HasFPARMv8;
bool HasNEON;
bool HasCrypto;
@@ -39,6 +44,9 @@ protected:
/// CPUString - String name of used CPU.
std::string CPUString;
+ /// IsLittleEndian - True if the target is little-endian.
+ bool IsLittleEndian;
+
private:
void initializeSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -46,7 +54,8 @@ public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
- AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS);
+ AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS,
+ bool LittleEndian);
virtual bool enableMachineScheduler() const {
return true;
@@ -65,6 +74,8 @@ public:
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
+ bool isLittle() const { return IsLittleEndian; }
+
const std::string & getCPUString() const { return CPUString; }
};
} // End llvm namespace
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index f1695e2..d9c990d 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -16,31 +16,61 @@
#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "llvm/PassManager.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassManager.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
extern "C" void LLVMInitializeAArch64Target() {
- RegisterTargetMachine<AArch64TargetMachine> X(TheAArch64Target);
+ RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
+ RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget);
}
AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
+ CodeGenOpt::Level OL,
+ bool LittleEndian)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS),
+ Subtarget(TT, CPU, FS, LittleEndian),
InstrInfo(Subtarget),
- DL("e-p:64:64-i64:64:64-i128:128:128-s0:32:32-f128:128:128-n32:64-S128"),
+ DL(LittleEndian ?
+ "e-m:e-i64:64-i128:128-n32:64-S128" :
+ "E-m:e-i64:64-i128:128-n32:64-S128"),
TLInfo(*this),
TSInfo(*this),
FrameLowering(Subtarget) {
initAsmInfo();
}
+void AArch64leTargetMachine::anchor() { }
+
+AArch64leTargetMachine::
+AArch64leTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void AArch64beTargetMachine::anchor() { }
+
+AArch64beTargetMachine::
+AArch64beTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+ // Add the target-independent BasicTTI pass first, then our AArch64 pass. This
+ // allows the AArch64 pass to delegate to the target-independent layer when
+ // appropriate.
+ PM.add(createBasicTargetTransformInfoPass(this));
+ PM.add(createAArch64TargetTransformInfoPass(this));
+}
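
Editorial note: a hedged sketch of how these analysis passes get consumed. A driver that owns the target machine registers them on its (legacy) PassManager before running IR passes, so optimizations such as the loop vectorizer see the AArch64-specific TTI answers. Function and variable names below are illustrative:

#include "llvm/IR/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Vectorize.h"

// TM points at an AArch64TargetMachine; M is the module being compiled.
void runWithTargetTTI(llvm::TargetMachine *TM, llvm::Module &M) {
  llvm::PassManager PM;
  TM->addAnalysisPasses(PM);            // BasicTTI first, then AArch64TTI
  PM.add(llvm::createLoopVectorizePass());
  PM.run(M);
}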
+
namespace {
/// AArch64 Code Generator Pass Configuration Options.
class AArch64PassConfig : public TargetPassConfig {
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index c1f47c2..4297c92 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -36,7 +36,8 @@ public:
AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
+ CodeGenOpt::Level OL,
+ bool LittleEndian);
const AArch64InstrInfo *getInstrInfo() const {
return &InstrInfo;
@@ -62,8 +63,32 @@ public:
return &InstrInfo.getRegisterInfo();
}
TargetPassConfig *createPassConfig(PassManagerBase &PM);
+
+ virtual void addAnalysisPasses(PassManagerBase &PM);
+};
+
+// AArch64leTargetMachine - AArch64 little endian target machine.
+//
+class AArch64leTargetMachine : public AArch64TargetMachine {
+ virtual void anchor();
+public:
+ AArch64leTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+// AArch64beTargetMachine - AArch64 big endian target machine.
+//
+class AArch64beTargetMachine : public AArch64TargetMachine {
+ virtual void anchor();
+public:
+ AArch64beTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
};
-}
+} // End llvm namespace
#endif
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index b4452f5..663d619 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -17,8 +17,8 @@
using namespace llvm;
void
-AArch64LinuxTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
+AArch64ElfTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(TM.Options.UseInitArray);
}
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index bf0565a..0f00a78 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -15,14 +15,14 @@
#define LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
namespace llvm {
- /// AArch64LinuxTargetObjectFile - This implementation is used for linux
- /// AArch64.
- class AArch64LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+ /// AArch64ElfTargetObjectFile - This implementation is used for ELF
+ /// AArch64 targets.
+ class AArch64ElfTargetObjectFile : public TargetLoweringObjectFileELF {
virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
};
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
new file mode 100644
index 0000000..e2a1647
--- /dev/null
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -0,0 +1,107 @@
+//===- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// AArch64 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the
+/// target-independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64tti"
+#include "AArch64.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeAArch64TTIPass(PassRegistry &);
+}
+
+namespace {
+
+class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
+ const AArch64Subtarget *ST;
+ const AArch64TargetLowering *TLI;
+
+public:
+ AArch64TTI() : ImmutablePass(ID), ST(0), TLI(0) {
+ llvm_unreachable("This pass cannot be directly constructed");
+ }
+
+ AArch64TTI(const AArch64TargetMachine *TM)
+ : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
+ TLI(TM->getTargetLowering()) {
+ initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual void initializePass() override {
+ pushTTIStack(this);
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ TargetTransformInfo::getAnalysisUsage(AU);
+ }
+
+ /// Pass identification.
+ static char ID;
+
+ /// Provide necessary pointer adjustments for the two base classes.
+ virtual void *getAdjustedAnalysisPointer(const void *ID) override {
+ if (ID == &TargetTransformInfo::ID)
+ return (TargetTransformInfo*)this;
+ return this;
+ }
+
+ /// \name Scalar TTI Implementations
+ /// @{
+
+ /// @}
+
+
+ /// \name Vector TTI Implementations
+ /// @{
+
+ unsigned getNumberOfRegisters(bool Vector) const {
+ if (Vector) {
+ if (ST->hasNEON())
+ return 32;
+ return 0;
+ }
+ return 32;
+ }
+
+ unsigned getRegisterBitWidth(bool Vector) const {
+ if (Vector) {
+ if (ST->hasNEON())
+ return 128;
+ return 0;
+ }
+ return 64;
+ }
+
+ /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
+ "AArch64 Target Transform Info", true, true, false)
+char AArch64TTI::ID = 0;
+
+ImmutablePass *
+llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
+ return new AArch64TTI(TM);
+}
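
Editorial note: client passes reach these answers through the TargetTransformInfo analysis group rather than the AArch64 class directly. A sketch of a consumer; MyVectorizeSketch is a hypothetical FunctionPass that has already called AU.addRequired via TargetTransformInfo::getAnalysisUsage(AU) in its getAnalysisUsage override:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"

bool MyVectorizeSketch::runOnFunction(llvm::Function &F) {
  const llvm::TargetTransformInfo &TTI = getAnalysis<llvm::TargetTransformInfo>();
  unsigned VecRegs = TTI.getNumberOfRegisters(/*Vector=*/true);  // 32 with NEON
  unsigned VecBits = TTI.getRegisterBitWidth(/*Vector=*/true);   // 128 with NEON
  return VecRegs != 0 && VecBits != 0;                           // placeholder logic
}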
diff --git a/lib/Target/AArch64/Android.mk b/lib/Target/AArch64/Android.mk
index 3e1d50e..144c2d3 100644
--- a/lib/Target/AArch64/Android.mk
+++ b/lib/Target/AArch64/Android.mk
@@ -27,7 +27,8 @@ arm64_codegen_SRC_FILES := \
AArch64ISelLowering.cpp \
AArch64MCInstLower.cpp \
AArch64SelectionDAGInfo.cpp \
- AArch64TargetMachine.cpp
+ AArch64TargetMachine.cpp \
+ AArch64TargetTransformInfo.cpp
# For the host
# =====================================================
@@ -48,6 +49,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device only
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -62,3 +64,4 @@ include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(LLVM_GEN_INTRINSICS_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index fbbce11..e933ec1 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -18,21 +18,21 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -109,6 +109,9 @@ public:
OperandMatchResultTy
ParseFPImmOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ OperandMatchResultTy
+ ParseFPImm0AndImm0Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
template<typename SomeNamedImmMapper> OperandMatchResultTy
ParseNamedImmOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return ParseNamedImmOperand(SomeNamedImmMapper(), Operands);
@@ -826,6 +829,10 @@ public:
return CE->getValue() == N;
}
+ bool isFPZeroIZero() const {
+ return isFPZero();
+ }
+
static AArch64Operand *CreateImmWithLSL(const MCExpr *Val,
unsigned ShiftAmount,
bool ImplicitAmount,
@@ -965,6 +972,10 @@ public:
Inst.addOperand(MCOperand::CreateImm(0));
}
+ void addFPZeroIZeroOperands(MCInst &Inst, unsigned N) const {
+ addFPZeroOperands(Inst, N);
+ }
+
void addInvCondCodeOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
unsigned Encoded = A64InvertCondCode(getCondCode());
@@ -1470,12 +1481,14 @@ AArch64AsmParser::ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind) {
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::ParseImmWithLSLOperand(
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- // FIXME?: I want to live in a world where immediates must start with
- // #. Please don't dash my hopes (well, do if you have a good reason).
- if (Parser.getTok().isNot(AsmToken::Hash)) return MatchOperand_NoMatch;
SMLoc S = Parser.getTok().getLoc();
- Parser.Lex(); // Eat '#'
+
+ if (Parser.getTok().is(AsmToken::Hash))
+ Parser.Lex(); // Eat '#'
+ else if (Parser.getTok().isNot(AsmToken::Integer))
+ // The operand must start with '#' or be an integer; otherwise there is no match.
+ return MatchOperand_NoMatch;
const MCExpr *Imm;
if (ParseImmediate(Imm) != MatchOperand_Success)
@@ -1574,12 +1587,13 @@ AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::ParseFPImmOperand(
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- // FIXME?: I want to live in a world where immediates must start with
- // #. Please don't dash my hopes (well, do if you have a good reason).
- if (Parser.getTok().isNot(AsmToken::Hash)) return MatchOperand_NoMatch;
-
SMLoc S = Parser.getTok().getLoc();
- Parser.Lex(); // Eat '#'
+
+ bool Hash = false;
+ if (Parser.getTok().is(AsmToken::Hash)) {
+ Parser.Lex(); // Eat '#'
+ Hash = true;
+ }
bool Negative = false;
if (Parser.getTok().is(AsmToken::Minus)) {
@@ -1590,6 +1604,8 @@ AArch64AsmParser::ParseFPImmOperand(
}
if (Parser.getTok().isNot(AsmToken::Real)) {
+ if (!Hash)
+ return MatchOperand_NoMatch;
Error(S, "Expected floating-point immediate");
return MatchOperand_ParseFail;
}
@@ -1605,6 +1621,44 @@ AArch64AsmParser::ParseFPImmOperand(
return MatchOperand_Success;
}
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseFPImm0AndImm0Operand(
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+
+ SMLoc S = Parser.getTok().getLoc();
+
+ bool Hash = false;
+ if (Parser.getTok().is(AsmToken::Hash)) {
+ Parser.Lex(); // Eat '#'
+ Hash = true;
+ }
+
+ APFloat RealVal(0.0);
+ if (Parser.getTok().is(AsmToken::Real)) {
+ if (Parser.getTok().getString() != "0.0") {
+ Error(S, "only #0.0 is acceptable as immediate");
+ return MatchOperand_ParseFail;
+ }
+ } else if (Parser.getTok().is(AsmToken::Integer)) {
+ if (Parser.getTok().getIntVal() != 0) {
+ Error(S, "only #0.0 is acceptable as immediate");
+ return MatchOperand_ParseFail;
+ }
+ } else {
+ if (!Hash)
+ return MatchOperand_NoMatch;
+ Error(S, "only #0.0 is acceptable as immediate");
+ return MatchOperand_ParseFail;
+ }
+
+ Parser.Lex(); // Eat the '0.0' or '0' token.
+ SMLoc E = Parser.getTok().getLoc();
+
+ Operands.push_back(AArch64Operand::CreateFPImm(0.0, S, E));
+ return MatchOperand_Success;
+}
// Automatically generated
static unsigned MatchRegisterName(StringRef Name);
@@ -2192,15 +2246,36 @@ validateInstruction(MCInst &Inst,
bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
StringRef Name, SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- size_t CondCodePos = Name.find('.');
-
- StringRef Mnemonic = Name.substr(0, CondCodePos);
+ StringRef PatchedName = StringSwitch<StringRef>(Name.lower())
+ .Case("beq", "b.eq")
+ .Case("bne", "b.ne")
+ .Case("bhs", "b.hs")
+ .Case("bcs", "b.cs")
+ .Case("blo", "b.lo")
+ .Case("bcc", "b.cc")
+ .Case("bmi", "b.mi")
+ .Case("bpl", "b.pl")
+ .Case("bvs", "b.vs")
+ .Case("bvc", "b.vc")
+ .Case("bhi", "b.hi")
+ .Case("bls", "b.ls")
+ .Case("bge", "b.ge")
+ .Case("blt", "b.lt")
+ .Case("bgt", "b.gt")
+ .Case("ble", "b.le")
+ .Case("bal", "b.al")
+ .Case("bnv", "b.nv")
+ .Default(Name);
+
+ size_t CondCodePos = PatchedName.find('.');
+
+ StringRef Mnemonic = PatchedName.substr(0, CondCodePos);
Operands.push_back(AArch64Operand::CreateToken(Mnemonic, NameLoc));
if (CondCodePos != StringRef::npos) {
// We have a condition code
SMLoc S = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos + 1);
- StringRef CondStr = Name.substr(CondCodePos + 1, StringRef::npos);
+ StringRef CondStr = PatchedName.substr(CondCodePos + 1, StringRef::npos);
A64CC::CondCodes Code;
Code = A64StringToCondCode(CondStr);
@@ -2291,7 +2366,7 @@ bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
for (;;) {
const MCExpr *Value;
if (getParser().parseExpression(Value))
- return true;
+ return false;
getParser().getStreamer().EmitValue(Value, Size);
@@ -2299,8 +2374,10 @@ bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
break;
// FIXME: Improve diagnostic.
- if (getLexer().isNot(AsmToken::Comma))
- return Error(L, "unexpected token in directive");
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(L, "unexpected token in directive");
+ return false;
+ }
Parser.Lex();
}
}
@@ -2313,8 +2390,10 @@ bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
// ::= .tlsdesccall symbol
bool AArch64AsmParser::ParseDirectiveTLSDescCall(SMLoc L) {
StringRef Name;
- if (getParser().parseIdentifier(Name))
- return Error(L, "expected symbol after directive");
+ if (getParser().parseIdentifier(Name)) {
+ Error(L, "expected symbol after directive");
+ return false;
+ }
MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext());
@@ -2323,7 +2402,7 @@ bool AArch64AsmParser::ParseDirectiveTLSDescCall(SMLoc L) {
Inst.setOpcode(AArch64::TLSDESCCALL);
Inst.addOperand(MCOperand::CreateExpr(Expr));
- getParser().getStreamer().EmitInstruction(Inst);
+ getParser().getStreamer().EmitInstruction(Inst, STI);
return false;
}
@@ -2346,7 +2425,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (validateInstruction(Inst, Operands))
return true;
- Out.EmitInstruction(Inst);
+ Out.EmitInstruction(Inst, STI);
return false;
case Match_MissingFeature:
Error(IDLoc, "instruction requires a CPU feature not currently enabled");
@@ -2436,10 +2515,10 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
"expected integer multiple of 4 in range [-256, 252]");
case Match_LoadStoreSImm7_8:
return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer multiple of 8 in range [-512, 508]");
+ "expected integer multiple of 8 in range [-512, 504]");
case Match_LoadStoreSImm7_16:
return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected integer multiple of 16 in range [-1024, 1016]");
+ "expected integer multiple of 16 in range [-1024, 1008]");
case Match_LoadStoreSImm9:
return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
"expected integer in range [-256, 255]");
@@ -2588,7 +2667,8 @@ void AArch64Operand::dump() const {
/// Force static initialization.
extern "C" void LLVMInitializeAArch64AsmParser() {
- RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64Target);
+ RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget);
+ RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget);
}
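
Editorial note: with the split into TheAArch64leTarget and TheAArch64beTarget, standalone MC tools look up the endian-specific target by triple after running the usual registration entry points. A hedged sketch; the triple string is illustrative:

#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include <string>

void findBigEndianAArch64() {
  // Pulls in the registration functions added by this patch, among others.
  llvm::InitializeAllTargetInfos();
  llvm::InitializeAllTargetMCs();
  llvm::InitializeAllAsmParsers();

  std::string Err;
  const llvm::Target *T =
      llvm::TargetRegistry::lookupTarget("aarch64_be-none-linux-gnu", Err);
  (void)T; // T is TheAArch64beTarget once registration has succeeded.
}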
#define GET_REGISTER_MATCHER
diff --git a/lib/Target/AArch64/AsmParser/Android.mk b/lib/Target/AArch64/AsmParser/Android.mk
index 899c84d..0588511 100644
--- a/lib/Target/AArch64/AsmParser/Android.mk
+++ b/lib/Target/AArch64/AsmParser/Android.mk
@@ -46,6 +46,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
#===---------------------------------------------------------------===
# libARM64AsmParser (target)
#===---------------------------------------------------------------===
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -59,3 +60,4 @@ TBLGEN_TD_DIR := $(arm64_asm_parser_TBLGEN_TD_DIR)
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/AArch64/AsmParser/CMakeLists.txt b/lib/Target/AArch64/AsmParser/CMakeLists.txt
index a018a0a..e81ec70 100644
--- a/lib/Target/AArch64/AsmParser/CMakeLists.txt
+++ b/lib/Target/AArch64/AsmParser/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMAArch64AsmParser
AArch64AsmParser.cpp
)
-
-add_dependencies(LLVMAArch64AsmParser AArch64CommonTableGen)
diff --git a/lib/Target/AArch64/AsmParser/LLVMBuild.txt b/lib/Target/AArch64/AsmParser/LLVMBuild.txt
index bd1fcaf..2d8f632 100644
--- a/lib/Target/AArch64/AsmParser/LLVMBuild.txt
+++ b/lib/Target/AArch64/AsmParser/LLVMBuild.txt
@@ -19,6 +19,5 @@
type = Library
name = AArch64AsmParser
parent = AArch64
-required_libraries = AArch64Desc AArch64Info MC MCParser Support
+required_libraries = AArch64Desc AArch64Info AArch64Utils MC MCParser Support
add_to_library_groups = AArch64
-
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index 0f2e816..dfc10af 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -26,13 +26,12 @@ add_llvm_target(AArch64CodeGen
AArch64Subtarget.cpp
AArch64TargetMachine.cpp
AArch64TargetObjectFile.cpp
+ AArch64TargetTransformInfo.cpp
)
-add_dependencies(LLVMAArch64CodeGen AArch64CommonTableGen)
-
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
add_subdirectory(TargetInfo)
-add_subdirectory(Utils) \ No newline at end of file
+add_subdirectory(Utils)
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index be4d7f2..9bd363a 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -19,15 +19,15 @@
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "Utils/AArch64BaseInfo.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -245,7 +245,6 @@ static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn,
static bool Check(DecodeStatus &Out, DecodeStatus In);
#include "AArch64GenDisassemblerTables.inc"
-#include "AArch64GenInstrInfo.inc"
static bool Check(DecodeStatus &Out, DecodeStatus In) {
switch (In) {
@@ -997,7 +996,9 @@ static MCDisassembler *createAArch64Disassembler(const Target &T,
}
extern "C" void LLVMInitializeAArch64Disassembler() {
- TargetRegistry::RegisterMCDisassembler(TheAArch64Target,
+ TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget,
+ createAArch64Disassembler);
+ TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget,
createAArch64Disassembler);
}
@@ -1517,7 +1518,7 @@ static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
unsigned Q = fieldFromInstruction(Insn, 30, 1);
unsigned S = fieldFromInstruction(Insn, 10, 3);
unsigned lane = 0;
- // Calculate the number of lanes by number of vectors and transfered bytes.
+ // Calculate the number of lanes by number of vectors and transferred bytes.
// NumLanes = 16 bytes / bytes of each lane
unsigned NumLanes = 16 / (TransferBytes / NumVecs);
switch (NumLanes) {
diff --git a/lib/Target/AArch64/Disassembler/Android.mk b/lib/Target/AArch64/Disassembler/Android.mk
index f7d888a..fcc53ad 100644
--- a/lib/Target/AArch64/Disassembler/Android.mk
+++ b/lib/Target/AArch64/Disassembler/Android.mk
@@ -11,6 +11,7 @@ arm64_disassembler_SRC_FILES := \
# For the device
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -29,6 +30,7 @@ LOCAL_MODULE_TAGS := optional
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
# For the host
# =====================================================
diff --git a/lib/Target/AArch64/Disassembler/CMakeLists.txt b/lib/Target/AArch64/Disassembler/CMakeLists.txt
index d4bd163..21baf25 100644
--- a/lib/Target/AArch64/Disassembler/CMakeLists.txt
+++ b/lib/Target/AArch64/Disassembler/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMAArch64Disassembler
AArch64Disassembler.cpp
)
-
-add_dependencies(LLVMAArch64Disassembler AArch64CommonTableGen)
diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
index a93e343..05c4ed1 100644
--- a/lib/Target/AArch64/Disassembler/LLVMBuild.txt
+++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
@@ -19,6 +19,5 @@
type = Library
name = AArch64Disassembler
parent = AArch64
-required_libraries = AArch64CodeGen AArch64Desc AArch64Info AArch64Utils MC Support
+required_libraries = AArch64Info AArch64Utils MC Support
add_to_library_groups = AArch64
-
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 0438de3..fd3f009 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -15,8 +15,8 @@
#include "AArch64InstPrinter.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "Utils/AArch64BaseInfo.h"
-#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
diff --git a/lib/Target/AArch64/InstPrinter/Android.mk b/lib/Target/AArch64/InstPrinter/Android.mk
index ffe94d3..ac9b0df 100644
--- a/lib/Target/AArch64/InstPrinter/Android.mk
+++ b/lib/Target/AArch64/InstPrinter/Android.mk
@@ -33,6 +33,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device only
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -52,4 +53,5 @@ LOCAL_MODULE_TAGS := optional
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/AArch64/InstPrinter/CMakeLists.txt b/lib/Target/AArch64/InstPrinter/CMakeLists.txt
index d4b980a..3db56e4 100644
--- a/lib/Target/AArch64/InstPrinter/CMakeLists.txt
+++ b/lib/Target/AArch64/InstPrinter/CMakeLists.txt
@@ -1,8 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMAArch64AsmPrinter
AArch64InstPrinter.cpp
)
-
-add_dependencies(LLVMAArch64AsmPrinter AArch64CommonTableGen)
-
diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt
index 6e4ce8b..4c8f101 100644
--- a/lib/Target/AArch64/LLVMBuild.txt
+++ b/lib/Target/AArch64/LLVMBuild.txt
@@ -31,6 +31,5 @@ has_jit = 1
type = Library
name = AArch64CodeGen
parent = AArch64
-required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AsmPrinter CodeGen Core MC SelectionDAG Support Target
+required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils AsmPrinter CodeGen Core MC SelectionDAG Support Target
add_to_library_groups = AArch64
-
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 8a9077c..f1452ab 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -15,10 +15,10 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -43,7 +43,7 @@ public:
virtual void processFixupValue(const MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFixup &Fixup, const MCFragment *DF,
- MCValue &Target, uint64_t &Value,
+ const MCValue &Target, uint64_t &Value,
bool &IsResolved);
};
} // end anonymous namespace
@@ -52,8 +52,8 @@ void AArch64AsmBackend::processFixupValue(const MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFixup &Fixup,
const MCFragment *DF,
- MCValue &Target, uint64_t &Value,
- bool &IsResolved) {
+ const MCValue &Target,
+ uint64_t &Value, bool &IsResolved) {
// The ADRP instruction adds some multiple of 0x1000 to the current PC &
// ~0xfff. This means that the required offset to reach a symbol can vary by
// up to one step depending on where the ADRP is in memory. For example:
@@ -79,11 +79,12 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value);
namespace {
class ELFAArch64AsmBackend : public AArch64AsmBackend {
-public:
uint8_t OSABI;
+ bool IsLittle; // Big or little endian
+public:
ELFAArch64AsmBackend(const Target &T, const StringRef TT,
- uint8_t _OSABI)
- : AArch64AsmBackend(T, TT), OSABI(_OSABI) { }
+ uint8_t _OSABI, bool isLittle)
+ : AArch64AsmBackend(T, TT), OSABI(_OSABI), IsLittle(isLittle) { }
bool fixupNeedsRelaxation(const MCFixup &Fixup,
uint64_t Value,
@@ -176,7 +177,7 @@ public:
}
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value) const {
+ uint64_t Value, bool IsPCRel) const {
unsigned NumBytes = getFixupKindInfo(Fixup.getKind()).TargetSize / 8;
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
@@ -200,7 +201,7 @@ public:
}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
- return createAArch64ELFObjectWriter(OS, OSABI);
+ return createAArch64ELFObjectWriter(OS, OSABI, IsLittle);
}
};
@@ -578,8 +579,15 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
}
MCAsmBackend *
-llvm::createAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+llvm::createAArch64leAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU) {
+ Triple TheTriple(TT);
+ return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS(), /*isLittle*/ true);
+}
+
+MCAsmBackend *
+llvm::createAArch64beAsmBackend(const Target &T, const MCRegisterInfo &MRI,
StringRef TT, StringRef CPU) {
Triple TheTriple(TT);
- return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS());
+ return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS(), /*isLittle*/ false);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index 4bcc65d..a5fe914 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -23,19 +23,19 @@ using namespace llvm;
namespace {
class AArch64ELFObjectWriter : public MCELFObjectTargetWriter {
public:
- AArch64ELFObjectWriter(uint8_t OSABI);
+ AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian);
virtual ~AArch64ELFObjectWriter();
protected:
- virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel, bool IsRelocWithSymbol,
- int64_t Addend) const;
+ unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel) const override;
+
private:
};
}
-AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI)
+AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian)
: MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
/*HasRelocationAddend*/ true)
{}
@@ -45,9 +45,7 @@ AArch64ELFObjectWriter::~AArch64ELFObjectWriter()
unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
const MCFixup &Fixup,
- bool IsPCRel,
- bool IsRelocWithSymbol,
- int64_t Addend) const {
+ bool IsPCRel) const {
unsigned Type;
if (IsPCRel) {
switch ((unsigned)Fixup.getKind()) {
@@ -286,7 +284,8 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
}
MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_ostream &OS,
- uint8_t OSABI) {
- MCELFObjectTargetWriter *MOTW = new AArch64ELFObjectWriter(OSABI);
- return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
+ uint8_t OSABI,
+ bool IsLittleEndian) {
+ MCELFObjectTargetWriter *MOTW = new AArch64ELFObjectWriter(OSABI, IsLittleEndian);
+ return createELFObjectWriter(MOTW, OS, IsLittleEndian);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index a64c463..473b7dd 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -57,7 +57,7 @@ class AArch64ELFStreamer : public MCELFStreamer {
public:
AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
MCCodeEmitter *Emitter)
- : MCELFStreamer(Context, 0, TAB, OS, Emitter), MappingSymbolCounter(0),
+ : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0),
LastEMS(EMS_None) {}
~AArch64ELFStreamer() {}
@@ -76,9 +76,9 @@ public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- virtual void EmitInstruction(const MCInst& Inst) {
+ virtual void EmitInstruction(const MCInst& Inst, const MCSubtargetInfo &STI) {
EmitA64MappingSymbol();
- MCELFStreamer::EmitInstruction(Inst);
+ MCELFStreamer::EmitInstruction(Inst, STI);
}
/// This is one of the functions used to emit data into an ELF section, so the
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index add874c..b090a55 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -12,30 +12,34 @@
//===----------------------------------------------------------------------===//
#include "AArch64MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
using namespace llvm;
-AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo() {
+AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo(StringRef TT) {
+ Triple TheTriple(TT);
+ if (TheTriple.getArch() == Triple::aarch64_be)
+ IsLittleEndian = false;
+
PointerSize = 8;
// ".comm align is in bytes but .align is pow-2."
AlignmentIsInBytes = false;
CommentString = "//";
- PrivateGlobalPrefix = ".L";
Code32Directive = ".code\t32";
Data16bitsDirective = "\t.hword\t";
Data32bitsDirective = "\t.word\t";
Data64bitsDirective = "\t.xword\t";
- UseDataRegionDirectives = true;
-
HasLEB128 = true;
SupportsDebugInformation = true;
// Exceptions handling
ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ UseIntegratedAssembler = true;
}
// Pin the vtable to this file.
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index d1dd285..43c0e47 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -19,7 +19,7 @@
namespace llvm {
struct AArch64ELFMCAsmInfo : public MCAsmInfoELF {
- explicit AArch64ELFMCAsmInfo();
+ explicit AArch64ELFMCAsmInfo(StringRef TT);
private:
virtual void anchor();
};
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index b41c566..b9a61ef 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -39,72 +39,92 @@ public:
~AArch64MCCodeEmitter() {}
unsigned getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
template<int MemSize>
unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
- return getOffsetUImm12OpValue(MI, OpIdx, Fixups, MemSize);
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return getOffsetUImm12OpValue(MI, OpIdx, Fixups, STI, MemSize);
}
unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI,
int MemSize) const;
unsigned getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getShiftRightImm8(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getShiftRightImm16(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getShiftRightImm32(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getShiftRightImm64(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getShiftLeftImm8(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getShiftLeftImm16(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getShiftLeftImm32(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getShiftLeftImm64(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
// Labels are handled mostly the same way: a symbol is needed, and
// just gets some fixup attached.
template<AArch64::Fixups fixupDesired>
unsigned getLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getLoadLitLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getAddressWithFixup(const MCOperand &MO,
unsigned FixupKind,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
// getBinaryCodeForInstr - TableGen'erated function for getting the
// binary encoding for an instruction.
uint64_t getBinaryCodeForInstr(const MCInst &MI,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getMachineOpValue - Return binary encoding of operand. If the machine
/// operand requires relocation, record the relocation and return zero.
unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
void EmitByte(unsigned char C, raw_ostream &OS) const {
@@ -121,14 +141,18 @@ public:
void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
template<int hasRs, int hasRt2> unsigned
- fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue) const;
+ fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
- unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue) const;
+ unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
- unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue) const;
+ unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
};
@@ -137,7 +161,8 @@ public:
unsigned AArch64MCCodeEmitter::getAddressWithFixup(const MCOperand &MO,
unsigned FixupKind,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
if (!MO.isExpr()) {
// This can occur for manually decoded or constructed MCInsts, but neither
// the assembly-parser nor instruction selection will currently produce an
@@ -156,6 +181,7 @@ unsigned AArch64MCCodeEmitter::getAddressWithFixup(const MCOperand &MO,
unsigned AArch64MCCodeEmitter::
getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI,
int MemSize) const {
const MCOperand &ImmOp = MI.getOperand(OpIdx);
if (ImmOp.isImm())
@@ -236,12 +262,13 @@ getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
break;
}
- return getAddressWithFixup(ImmOp, FixupKind, Fixups);
+ return getAddressWithFixup(ImmOp, FixupKind, Fixups, STI);
}
unsigned
AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpIdx);
if (MO.isImm())
return static_cast<unsigned>(MO.getImm());
@@ -269,12 +296,13 @@ AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
FixupKind = AArch64::fixup_a64_tlsdesc_add_lo12_nc; break;
}
- return getAddressWithFixup(MO, FixupKind, Fixups);
+ return getAddressWithFixup(MO, FixupKind, Fixups, STI);
}
unsigned
AArch64MCCodeEmitter::getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpIdx);
if (MO.isImm())
@@ -304,12 +332,13 @@ AArch64MCCodeEmitter::getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx,
llvm_unreachable("Unknown symbol reference kind for ADRP instruction");
}
- return getAddressWithFixup(MO, FixupKind, Fixups);
+ return getAddressWithFixup(MO, FixupKind, Fixups, STI);
}
unsigned
AArch64MCCodeEmitter::getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpIdx);
assert(MO.isImm() && "Only immediate expected for shift");
@@ -319,7 +348,8 @@ AArch64MCCodeEmitter::getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx,
unsigned
AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpIdx);
assert(MO.isImm() && "Only immediate expected for shift");
@@ -328,53 +358,62 @@ AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
}
unsigned AArch64MCCodeEmitter::getShiftRightImm8(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 8 - MI.getOperand(Op).getImm();
}
unsigned AArch64MCCodeEmitter::getShiftRightImm16(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 16 - MI.getOperand(Op).getImm();
}
unsigned AArch64MCCodeEmitter::getShiftRightImm32(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 32 - MI.getOperand(Op).getImm();
}
unsigned AArch64MCCodeEmitter::getShiftRightImm64(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 64 - MI.getOperand(Op).getImm();
}
unsigned AArch64MCCodeEmitter::getShiftLeftImm8(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return MI.getOperand(Op).getImm() - 8;
}
unsigned AArch64MCCodeEmitter::getShiftLeftImm16(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return MI.getOperand(Op).getImm() - 16;
}
unsigned AArch64MCCodeEmitter::getShiftLeftImm32(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return MI.getOperand(Op).getImm() - 32;
}
unsigned AArch64MCCodeEmitter::getShiftLeftImm64(
- const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return MI.getOperand(Op).getImm() - 64;
}
template<AArch64::Fixups fixupDesired> unsigned
AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI,
unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpIdx);
if (MO.isExpr())
- return getAddressWithFixup(MO, fixupDesired, Fixups);
+ return getAddressWithFixup(MO, fixupDesired, Fixups, STI);
assert(MO.isImm());
return MO.getImm();
@@ -383,7 +422,8 @@ AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI,
unsigned
AArch64MCCodeEmitter::getLoadLitLabelOpValue(const MCInst &MI,
unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpIdx);
if (MO.isImm())
@@ -401,14 +441,15 @@ AArch64MCCodeEmitter::getLoadLitLabelOpValue(const MCInst &MI,
FixupKind = AArch64::fixup_a64_ld_prel;
}
- return getAddressWithFixup(MO, FixupKind, Fixups);
+ return getAddressWithFixup(MO, FixupKind, Fixups, STI);
}
unsigned
AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
if (MO.isReg()) {
return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
} else if (MO.isImm()) {
@@ -421,7 +462,8 @@ AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI,
unsigned
AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &UImm16MO = MI.getOperand(OpIdx);
const MCOperand &ShiftMO = MI.getOperand(OpIdx + 1);
@@ -482,12 +524,13 @@ AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
requestedFixup = AArch64::fixup_a64_movw_tprel_g0_nc; break;
}
- return Result | getAddressWithFixup(UImm16MO, requestedFixup, Fixups);
+ return Result | getAddressWithFixup(UImm16MO, requestedFixup, Fixups, STI);
}
template<int hasRs, int hasRt2> unsigned
AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI,
- unsigned EncodedValue) const {
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
if (!hasRs) EncodedValue |= 0x001F0000;
if (!hasRt2) EncodedValue |= 0x00007C00;
@@ -495,7 +538,8 @@ AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI,
}
unsigned
-AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue) const {
+AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
// If one of the signed fixup kinds is applied to a MOVZ instruction, the
// eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's
// job to ensure that any bits possibly affected by this are 0. This means we
@@ -529,7 +573,8 @@ AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue) const {
unsigned
AArch64MCCodeEmitter::fixMulHigh(const MCInst &MI,
- unsigned EncodedValue) const {
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
// The Ra field of SMULH and UMULH is unused: it should be assembled as 31
// (i.e. all bits 1) but is ignored by the processor.
EncodedValue |= 0x1f << 10;
@@ -545,7 +590,8 @@ MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
void AArch64MCCodeEmitter::
EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
if (MI.getOpcode() == AArch64::TLSDESCCALL) {
// This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
// following (BLR) instruction. It doesn't emit any code itself so it
@@ -557,7 +603,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
return;
}
- uint32_t Binary = getBinaryCodeForInstr(MI, Fixups);
+ uint32_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
EmitInstruction(Binary, OS);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index c1abfe7..c7ccaee 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -14,8 +14,8 @@
#define DEBUG_TYPE "aarch64mcexpr"
#include "AArch64MCExpr.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELF.h"
#include "llvm/Object/ELF.h"
@@ -79,7 +79,7 @@ void AArch64MCExpr::PrintImpl(raw_ostream &OS) const {
bool
AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout) const {
- return getSubExpr()->EvaluateAsRelocatable(Res, *Layout);
+ return getSubExpr()->EvaluateAsRelocatable(Res, Layout);
}
static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 58fc95c..3d19e42 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -22,8 +22,8 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_REGINFO_MC_DESC
#include "AArch64GenRegisterInfo.inc"
@@ -61,7 +61,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
StringRef TT) {
Triple TheTriple(TT);
- MCAsmInfo *MAI = new AArch64ELFMCAsmInfo();
+ MCAsmInfo *MAI = new AArch64ELFMCAsmInfo(TT);
unsigned Reg = MRI.getDwarfRegNum(AArch64::XSP, true);
MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0);
MAI->addInitialFrameState(Inst);
@@ -97,6 +97,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
MCContext &Ctx, MCAsmBackend &MAB,
raw_ostream &OS,
MCCodeEmitter *Emitter,
+ const MCSubtargetInfo &STI,
bool RelaxAll,
bool NoExecStack) {
Triple TheTriple(TT);
@@ -160,42 +161,61 @@ static MCInstrAnalysis *createAArch64MCInstrAnalysis(const MCInstrInfo *Info) {
extern "C" void LLVMInitializeAArch64TargetMC() {
// Register the MC asm info.
- RegisterMCAsmInfoFn A(TheAArch64Target, createAArch64MCAsmInfo);
+ RegisterMCAsmInfoFn A(TheAArch64leTarget, createAArch64MCAsmInfo);
+ RegisterMCAsmInfoFn B(TheAArch64beTarget, createAArch64MCAsmInfo);
// Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(TheAArch64Target,
+ TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget,
+ createAArch64MCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget,
createAArch64MCCodeGenInfo);
// Register the MC instruction info.
- TargetRegistry::RegisterMCInstrInfo(TheAArch64Target,
+ TargetRegistry::RegisterMCInstrInfo(TheAArch64leTarget,
+ createAArch64MCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget,
createAArch64MCInstrInfo);
// Register the MC register info.
- TargetRegistry::RegisterMCRegInfo(TheAArch64Target,
+ TargetRegistry::RegisterMCRegInfo(TheAArch64leTarget,
+ createAArch64MCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget,
createAArch64MCRegisterInfo);
// Register the MC subtarget info.
using AArch64_MC::createAArch64MCSubtargetInfo;
- TargetRegistry::RegisterMCSubtargetInfo(TheAArch64Target,
+ TargetRegistry::RegisterMCSubtargetInfo(TheAArch64leTarget,
+ createAArch64MCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget,
createAArch64MCSubtargetInfo);
// Register the MC instruction analyzer.
- TargetRegistry::RegisterMCInstrAnalysis(TheAArch64Target,
+ TargetRegistry::RegisterMCInstrAnalysis(TheAArch64leTarget,
+ createAArch64MCInstrAnalysis);
+ TargetRegistry::RegisterMCInstrAnalysis(TheAArch64beTarget,
createAArch64MCInstrAnalysis);
// Register the MC Code Emitter
- TargetRegistry::RegisterMCCodeEmitter(TheAArch64Target,
+ TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget,
+ createAArch64MCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget,
createAArch64MCCodeEmitter);
// Register the asm backend.
- TargetRegistry::RegisterMCAsmBackend(TheAArch64Target,
- createAArch64AsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget,
+ createAArch64leAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget,
+ createAArch64beAsmBackend);
// Register the object streamer.
- TargetRegistry::RegisterMCObjectStreamer(TheAArch64Target,
+ TargetRegistry::RegisterMCObjectStreamer(TheAArch64leTarget,
+ createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget,
createMCStreamer);
// Register the MCInstPrinter.
- TargetRegistry::RegisterMCInstPrinter(TheAArch64Target,
+ TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget,
+ createAArch64MCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget,
createAArch64MCInstPrinter);
}
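The registration hunk above repeats each TargetRegistry call twice, once per endianness. A hypothetical way to fold that duplication, shown only as a sketch (the helper below is not part of the patch):

    // Register the same MC-layer factory for both AArch64 endian variants.
    template <typename FactoryT>
    static void registerForBothEndians(void (*RegisterFn)(Target &, FactoryT),
                                       FactoryT Factory) {
      RegisterFn(TheAArch64leTarget, Factory);
      RegisterFn(TheAArch64beTarget, Factory);
    }
    // e.g. registerForBothEndians(TargetRegistry::RegisterMCInstrInfo,
    //                             createAArch64MCInstrInfo);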
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 670e657..bd8beaf 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -28,7 +28,8 @@ class StringRef;
class Target;
class raw_ostream;
-extern Target TheAArch64Target;
+extern Target TheAArch64leTarget;
+extern Target TheAArch64beTarget;
namespace AArch64_MC {
MCSubtargetInfo *createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -41,11 +42,16 @@ MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
MCContext &Ctx);
MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS,
- uint8_t OSABI);
+ uint8_t OSABI,
+ bool IsLittleEndian);
-MCAsmBackend *createAArch64AsmBackend(const Target &T,
- const MCRegisterInfo &MRI,
- StringRef TT, StringRef CPU);
+MCAsmBackend *createAArch64leAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
+
+MCAsmBackend *createAArch64beAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
} // End llvm namespace
diff --git a/lib/Target/AArch64/MCTargetDesc/Android.mk b/lib/Target/AArch64/MCTargetDesc/Android.mk
index 0dcc84f..edcf1f2 100644
--- a/lib/Target/AArch64/MCTargetDesc/Android.mk
+++ b/lib/Target/AArch64/MCTargetDesc/Android.mk
@@ -37,6 +37,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device only
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -53,3 +54,4 @@ include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(LLVM_GEN_INTRINSICS_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
index 44c66a2..54c4465 100644
--- a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
@@ -7,7 +7,3 @@ add_llvm_library(LLVMAArch64Desc
AArch64MCExpr.cpp
AArch64MCTargetDesc.cpp
)
-add_dependencies(LLVMAArch64Desc AArch64CommonTableGen)
-
-# Hack: we need to include 'main' target directory to grab private headers
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index 377b533..9281e4e 100644
--- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -16,9 +16,12 @@
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
-Target llvm::TheAArch64Target;
+Target llvm::TheAArch64leTarget;
+Target llvm::TheAArch64beTarget;
extern "C" void LLVMInitializeAArch64TargetInfo() {
RegisterTarget<Triple::aarch64, /*HasJIT=*/true>
- X(TheAArch64Target, "aarch64", "AArch64 (ARM 64-bit target)");
+ X(TheAArch64leTarget, "aarch64", "AArch64 (ARM 64-bit little endian target)");
+ RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true>
+ Y(TheAArch64beTarget, "aarch64_be", "AArch64 (ARM 64-bit big endian target)");
}
diff --git a/lib/Target/AArch64/TargetInfo/Android.mk b/lib/Target/AArch64/TargetInfo/Android.mk
index e163329..cc650f6 100644
--- a/lib/Target/AArch64/TargetInfo/Android.mk
+++ b/lib/Target/AArch64/TargetInfo/Android.mk
@@ -32,6 +32,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -51,3 +52,4 @@ LOCAL_MODULE_TAGS := optional
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/AArch64/TargetInfo/CMakeLists.txt b/lib/Target/AArch64/TargetInfo/CMakeLists.txt
index e236eed..ee734c6 100644
--- a/lib/Target/AArch64/TargetInfo/CMakeLists.txt
+++ b/lib/Target/AArch64/TargetInfo/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMAArch64Info
AArch64TargetInfo.cpp
)
-
-add_dependencies(LLVMAArch64Info AArch64CommonTableGen)
diff --git a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt
index 5b003f0..6429172 100644
--- a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt
@@ -19,6 +19,5 @@
type = Library
name = AArch64Info
parent = AArch64
-required_libraries = MC Support Target
+required_libraries = Support
add_to_library_groups = AArch64
-
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index ce970b0..39b042b 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -17,8 +17,8 @@
#ifndef LLVM_AARCH64_BASEINFO_H
#define LLVM_AARCH64_BASEINFO_H
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
namespace llvm {
diff --git a/lib/Target/AArch64/Utils/Android.mk b/lib/Target/AArch64/Utils/Android.mk
index a94f5c6..b8bf795 100644
--- a/lib/Target/AArch64/Utils/Android.mk
+++ b/lib/Target/AArch64/Utils/Android.mk
@@ -5,6 +5,7 @@ arm64_utils_SRC_FILES := \
# For the device
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
LOCAL_SRC_FILES := $(arm64_utils_SRC_FILES)
@@ -17,6 +18,7 @@ LOCAL_MODULE_TAGS := optional
include $(LLVM_DEVICE_BUILD_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
# For the host
# =====================================================
diff --git a/lib/Target/AArch64/Utils/CMakeLists.txt b/lib/Target/AArch64/Utils/CMakeLists.txt
index 2348e44..8ee03a7 100644
--- a/lib/Target/AArch64/Utils/CMakeLists.txt
+++ b/lib/Target/AArch64/Utils/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMAArch64Utils
AArch64BaseInfo.cpp
)
-
-add_dependencies(LLVMAArch64Utils AArch64CommonTableGen)
diff --git a/lib/Target/AArch64/Utils/LLVMBuild.txt b/lib/Target/AArch64/Utils/LLVMBuild.txt
index 1be5375..4acecc9 100644
--- a/lib/Target/AArch64/Utils/LLVMBuild.txt
+++ b/lib/Target/AArch64/Utils/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = AArch64Utils
parent = AArch64
-required_libraries = Core Support
+required_libraries = Support
add_to_library_groups = AArch64
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index ff585b4..28ea879 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -27,21 +27,14 @@
#define DEBUG_TYPE "a15-sd-optimizer"
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
-#include "ARMSubtarget.h"
-#include "ARMISelLowering.h"
-#include "ARMTargetMachine.h"
-
-#include "llvm/ADT/SmallPtrSet.h"
+#include "ARMBaseRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
-
#include <set>
using namespace llvm;
@@ -51,9 +44,9 @@ namespace {
static char ID;
A15SDOptimizer() : MachineFunctionPass(ID) {}
- virtual bool runOnMachineFunction(MachineFunction &Fn);
+ bool runOnMachineFunction(MachineFunction &Fn) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "ARM A15 S->D optimizer";
}
@@ -165,7 +158,7 @@ unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
if (!MI) return ARM::ssub_0;
MachineOperand *MO = MI->findRegisterDefOperand(SReg);
- assert(MO->isReg() && "Non register operand found!");
+ assert(MO->isReg() && "Non-register operand found!");
if (!MO) return ARM::ssub_0;
if (MI->isCopy() && usesRegClass(MI->getOperand(1),
@@ -227,9 +220,9 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
IsDead = false;
break;
}
- for (MachineRegisterInfo::use_iterator II = MRI->use_begin(Reg),
- EE = MRI->use_end();
- II != EE; ++II) {
+ for (MachineRegisterInfo::use_instr_iterator
+ II = MRI->use_instr_begin(Reg), EE = MRI->use_instr_end();
+ II != EE; ++II) {
// We don't care about self references.
if (&*II == Def)
continue;
@@ -418,7 +411,8 @@ SmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) {
if (!MO.isReg() || !MO.isUse())
continue;
if (!usesRegClass(MO, &ARM::DPRRegClass) &&
- !usesRegClass(MO, &ARM::QPRRegClass))
+ !usesRegClass(MO, &ARM::QPRRegClass) &&
+ !usesRegClass(MO, &ARM::DPairRegClass)) // Treat DPair as QPR
continue;
Defs.push_back(MO.getReg());
@@ -538,7 +532,10 @@ A15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) {
InsertPt++;
unsigned Out;
- if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass)) {
+ // DPair has the same length as QPR and also has two DPRs as subregs.
+ // Treat DPair as QPR.
+ if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass) ||
+ MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPairRegClass)) {
unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg,
ARM::dsub_0, &ARM::DPRRegClass);
unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg,
@@ -571,7 +568,9 @@ A15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) {
default: llvm_unreachable("Unknown preferred lane!");
}
- bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass);
+ // Treat DPair as QPR
+ bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass) ||
+ usesRegClass(MI->getOperand(0), &ARM::DPairRegClass);
Out = createImplicitDef(MBB, InsertPt, DL);
Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg);
@@ -648,7 +647,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
unsigned DPRDefReg = MI->getOperand(0).getReg();
for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg),
E = MRI->use_end(); I != E; ++I)
- Uses.push_back(&I.getOperand());
+ Uses.push_back(&*I);
// We can optimize this.
unsigned NewReg = optimizeSDPattern(MI);
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 80e5f37..4412b45 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -15,19 +15,19 @@
#ifndef TARGET_ARM_H
#define TARGET_ARM_H
-#include "MCTargetDesc/ARMBaseInfo.h"
-#include "MCTargetDesc/ARMMCTargetDesc.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CodeGen.h"
namespace llvm {
class ARMAsmPrinter;
class ARMBaseTargetMachine;
class FunctionPass;
+class ImmutablePass;
class JITCodeEmitter;
class MachineInstr;
class MCInst;
+class TargetLowering;
+class TargetMachine;
FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
CodeGenOpt::Level OptLevel);
@@ -43,11 +43,14 @@ FunctionPass *createARMGlobalMergePass(const TargetLowering* tli);
FunctionPass *createARMConstantIslandPass();
FunctionPass *createMLxExpansionPass();
FunctionPass *createThumb2ITBlockPass();
+FunctionPass *createARMOptimizeBarriersPass();
FunctionPass *createThumb2SizeReductionPass();
/// \brief Creates an ARM-specific Target Transformation Info pass.
ImmutablePass *createARMTargetTransformInfoPass(const ARMBaseTargetMachine *TM);
+FunctionPass *createARMAtomicExpandPass(const TargetMachine *TM);
+
void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
ARMAsmPrinter &AP);
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index b73e981..25385a6 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -73,6 +73,11 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable support for CRC instructions">;
+// Cyclone has preferred instructions for zeroing VFP registers, which can
+// execute in 0 cycles.
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+ "Has zero-cycle zeroing instructions">;
+
// Some processors have FP multiply-accumulate instructions that don't
// play nicely with other VFP / NEON instructions, and it's generally better
// to just not use them.
@@ -179,7 +184,14 @@ def ProcA5 : SubtargetFeature<"a5", "ARMProcFamily", "CortexA5",
"Cortex-A5 ARM processors",
[FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
FeatureVMLxForwarding, FeatureT2XtPk,
- FeatureTrustZone]>;
+ FeatureTrustZone, FeatureMP]>;
+def ProcA7 : SubtargetFeature<"a7", "ARMProcFamily", "CortexA7",
+ "Cortex-A7 ARM processors",
+ [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
+ FeatureVMLxForwarding, FeatureT2XtPk,
+ FeatureVFP4, FeatureMP,
+ FeatureHWDiv, FeatureHWDivARM,
+ FeatureTrustZone, FeatureVirtualization]>;
def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
"Cortex-A8 ARM processors",
[FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
@@ -198,6 +210,15 @@ def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
FeatureHWDivARM, FeatureAvoidPartialCPSR,
FeatureAvoidMOVsShOp,
FeatureHasSlowFPVMLx, FeatureTrustZone]>;
+def ProcA12 : SubtargetFeature<"a12", "ARMProcFamily", "CortexA12",
+ "Cortex-A12 ARM processors",
+ [FeatureVMLxForwarding,
+ FeatureT2XtPk, FeatureVFP4,
+ FeatureHWDiv, FeatureHWDivARM,
+ FeatureAvoidPartialCPSR,
+ FeatureVirtualization,
+ FeatureTrustZone]>;
+
// FIXME: It has not been determined if A15 has these features.
def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
@@ -227,11 +248,29 @@ def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
FeatureAvoidPartialCPSR,
FeatureT2XtPk]>;
+// FIXME: krait currently has the same features as A9,
+// plus the VFP4 and hardware division features.
+def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
+ "Qualcomm ARM processors",
+ [FeatureVMLxForwarding,
+ FeatureT2XtPk, FeatureFP16,
+ FeatureAvoidPartialCPSR,
+ FeatureTrustZone,
+ FeatureVFP4,
+ FeatureHWDiv,
+ FeatureHWDivARM]>;
+
+
+def FeatureAPCS : SubtargetFeature<"apcs", "TargetABI", "ARM_ABI_APCS",
+ "Use the APCS ABI">;
+
+def FeatureAAPCS : SubtargetFeature<"aapcs", "TargetABI", "ARM_ABI_AAPCS",
+ "Use the AAPCS ABI">;
+
// RenderScript-specific support for 64-bit long types on all targets
def FeatureLong64 : SubtargetFeature<"long64", "UseLong64",
- "true",
- "long type is forced to be 64-bit">;
-
+ "true",
+ "long type is forced to be 64-bit">;
class ProcNoItin<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
@@ -302,6 +341,10 @@ def : ProcessorModel<"cortex-a5", CortexA8Model,
[ProcA5, HasV7Ops, FeatureNEON, FeatureDB,
FeatureVFP4, FeatureDSPThumb2,
FeatureHasRAS, FeatureAClass]>;
+def : ProcessorModel<"cortex-a7", CortexA8Model,
+ [ProcA7, HasV7Ops, FeatureNEON, FeatureDB,
+ FeatureDSPThumb2, FeatureHasRAS,
+ FeatureAClass]>;
def : ProcessorModel<"cortex-a8", CortexA8Model,
[ProcA8, HasV7Ops, FeatureNEON, FeatureDB,
FeatureDSPThumb2, FeatureHasRAS,
@@ -314,11 +357,26 @@ def : ProcessorModel<"cortex-a9-mp", CortexA9Model,
[ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
FeatureDSPThumb2, FeatureMP,
FeatureHasRAS, FeatureAClass]>;
+
+// FIXME: A12 currently has the same schedule model as A9
+def : ProcessorModel<"cortex-a12", CortexA9Model,
+ [ProcA12, HasV7Ops, FeatureNEON, FeatureDB,
+ FeatureDSPThumb2, FeatureMP,
+ FeatureHasRAS, FeatureAClass]>;
+
// FIXME: A15 currently has the same ProcessorModel as A9.
def : ProcessorModel<"cortex-a15", CortexA9Model,
[ProcA15, HasV7Ops, FeatureNEON, FeatureDB,
FeatureDSPThumb2, FeatureHasRAS,
FeatureAClass]>;
+
+// FIXME: krait currently has the same schedule model as A9
+def : ProcessorModel<"krait", CortexA9Model,
+ [ProcKrait, HasV7Ops,
+ FeatureNEON, FeatureDB,
+ FeatureDSPThumb2, FeatureHasRAS,
+ FeatureAClass]>;
+
// FIXME: R5 currently has the same ProcessorModel as A8.
def : ProcessorModel<"cortex-r5", CortexA8Model,
[ProcR5, HasV7Ops, FeatureDB,
@@ -353,6 +411,13 @@ def : ProcNoItin<"cortex-a57", [ProcA57, HasV8Ops, FeatureAClass,
FeatureDB, FeatureFPARMv8,
FeatureNEON, FeatureDSPThumb2]>;
+// Cyclone is very similar to swift
+def : ProcessorModel<"cyclone", SwiftModel,
+ [ProcSwift, HasV8Ops, HasV7Ops,
+ FeatureCrypto, FeatureFPARMv8,
+ FeatureDB, FeatureDSPThumb2,
+ FeatureHasRAS, FeatureZCZeroing]>;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
@@ -369,17 +434,6 @@ include "ARMInstrInfo.td"
def ARMInstrInfo : InstrInfo;
-
-//===----------------------------------------------------------------------===//
-// Assembly printer
-//===----------------------------------------------------------------------===//
-// ARM Uses the MC printer for asm output, so make sure the TableGen
-// AsmWriter bits get associated with the correct class.
-def ARMAsmWriter : AsmWriter {
- string AsmWriterClassName = "InstPrinter";
- bit isMCAsmWriter = 1;
-}
-
//===----------------------------------------------------------------------===//
// Declare the target which we are implementing
//===----------------------------------------------------------------------===//
@@ -387,6 +441,4 @@ def ARMAsmWriter : AsmWriter {
def ARM : Target {
// Pull in Instruction Info:
let InstructionSet = ARMInstrInfo;
-
- let AssemblyWriters = [ARMAsmWriter];
}
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index e79f88d..0fa865f 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -15,7 +15,6 @@
#define DEBUG_TYPE "asm-printer"
#include "ARMAsmPrinter.h"
#include "ARM.h"
-#include "ARMBuildAttrs.h"
#include "ARMConstantPoolValue.h"
#include "ARMFPUName.h"
#include "ARMMachineFunctionInfo.h"
@@ -26,13 +25,13 @@
#include "MCTargetDesc/ARMMCExpr.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
-#include "llvm/DebugInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -45,81 +44,17 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ARMBuildAttributes.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetMachine.h"
#include <cctype>
using namespace llvm;
-/// EmitDwarfRegOp - Emit dwarf register operation.
-void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc,
- bool Indirect) const {
- const TargetRegisterInfo *RI = TM.getRegisterInfo();
- if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1) {
- AsmPrinter::EmitDwarfRegOp(MLoc, Indirect);
- return;
- }
- assert(MLoc.isReg() && !Indirect &&
- "This doesn't support offset/indirection - implement it if needed");
- unsigned Reg = MLoc.getReg();
- if (Reg >= ARM::S0 && Reg <= ARM::S31) {
- assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering");
- // S registers are described as bit-pieces of a register
- // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0)
- // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32)
-
- unsigned SReg = Reg - ARM::S0;
- bool odd = SReg & 0x1;
- unsigned Rx = 256 + (SReg >> 1);
-
- OutStreamer.AddComment("DW_OP_regx for S register");
- EmitInt8(dwarf::DW_OP_regx);
-
- OutStreamer.AddComment(Twine(SReg));
- EmitULEB128(Rx);
-
- if (odd) {
- OutStreamer.AddComment("DW_OP_bit_piece 32 32");
- EmitInt8(dwarf::DW_OP_bit_piece);
- EmitULEB128(32);
- EmitULEB128(32);
- } else {
- OutStreamer.AddComment("DW_OP_bit_piece 32 0");
- EmitInt8(dwarf::DW_OP_bit_piece);
- EmitULEB128(32);
- EmitULEB128(0);
- }
- } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) {
- assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering");
- // Q registers Q0-Q15 are described by composing two D registers together.
- // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1)
- // DW_OP_piece(8)
-
- unsigned QReg = Reg - ARM::Q0;
- unsigned D1 = 256 + 2 * QReg;
- unsigned D2 = D1 + 1;
-
- OutStreamer.AddComment("DW_OP_regx for Q register: D1");
- EmitInt8(dwarf::DW_OP_regx);
- EmitULEB128(D1);
- OutStreamer.AddComment("DW_OP_piece 8");
- EmitInt8(dwarf::DW_OP_piece);
- EmitULEB128(8);
-
- OutStreamer.AddComment("DW_OP_regx for Q register: D2");
- EmitInt8(dwarf::DW_OP_regx);
- EmitULEB128(D2);
- OutStreamer.AddComment("DW_OP_piece 8");
- EmitInt8(dwarf::DW_OP_piece);
- EmitULEB128(8);
- }
-}
-
void ARMAsmPrinter::EmitFunctionBodyEnd() {
// Make sure to terminate any constant pools that were at the end
// of the function.
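The EmitDwarfRegOp override deleted above carried the ARM-specific DWARF mapping in its comments: S[2x] and S[2x+1] are 32-bit pieces of DWARF register 256 + x, and each Q register is two consecutive 8-byte D-register pieces. A small worked sketch of the S-register arithmetic, kept here since the comments vanish with the code:

    #include <utility>

    // For S-register index S: the DW_OP_regx operand is 256 + (S >> 1) and the
    // DW_OP_bit_piece offset is 0 for even S, 32 for odd S (piece size 32 bits).
    static std::pair<unsigned, unsigned> dwarfPieceForSReg(unsigned S) {
      return {256 + (S >> 1), (S & 1) ? 32u : 0u};
    }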
@@ -146,9 +81,9 @@ void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
assert(GV && "C++ constructor pointer was not a GlobalValue!");
const MCExpr *E = MCSymbolRefExpr::Create(getSymbol(GV),
- (Subtarget->isTargetDarwin()
- ? MCSymbolRefExpr::VK_None
- : MCSymbolRefExpr::VK_ARM_TARGET1),
+ (Subtarget->isTargetELF()
+ ? MCSymbolRefExpr::VK_ARM_TARGET1
+ : MCSymbolRefExpr::VK_None),
OutContext);
OutStreamer.EmitValue(E, Size);
@@ -213,18 +148,9 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
O << "(PLT)";
break;
}
- case MachineOperand::MO_ExternalSymbol: {
- O << *GetExternalSymbolSymbol(MO.getSymbolName());
- if (TF == ARMII::MO_PLT)
- O << "(PLT)";
- break;
- }
case MachineOperand::MO_ConstantPoolIndex:
O << *GetCPISymbol(MO.getIndex());
break;
- case MachineOperand::MO_JumpTableIndex:
- O << *GetJTISymbol(MO.getIndex());
- break;
}
}
@@ -232,16 +158,18 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
MCSymbol *ARMAsmPrinter::
GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const {
+ const DataLayout *DL = TM.getDataLayout();
SmallString<60> Name;
- raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "JTI"
+ raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI"
<< getFunctionNumber() << '_' << uid << '_' << uid2;
return OutContext.GetOrCreateSymbol(Name.str());
}
MCSymbol *ARMAsmPrinter::GetARMSJLJEHLabel() const {
+ const DataLayout *DL = TM.getDataLayout();
SmallString<60> Name;
- raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "SJLJEH"
+ raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "SJLJEH"
<< getFunctionNumber();
return OutContext.GetOrCreateSymbol(Name.str());
}
@@ -446,8 +374,22 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
+static bool isThumb(const MCSubtargetInfo& STI) {
+ return (STI.getFeatureBits() & ARM::ModeThumb) != 0;
+}
+
+void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+ const MCSubtargetInfo *EndInfo) const {
+ // If the end mode is unknown (EndInfo == NULL) or differs from
+ // the start mode, then restore the start mode.
+ const bool WasThumb = isThumb(StartInfo);
+ if (EndInfo == NULL || WasThumb != isThumb(*EndInfo)) {
+ OutStreamer.EmitAssemblerFlag(WasThumb ? MCAF_Code16 : MCAF_Code32);
+ }
+}
+
void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
- if (Subtarget->isTargetDarwin()) {
+ if (Subtarget->isTargetMachO()) {
Reloc::Model RelocM = TM.getRelocationModel();
if (RelocM == Reloc::PIC_ || RelocM == Reloc::DynamicNoPIC) {
// Declare all the text sections up front (before the DWARF sections
@@ -468,7 +410,7 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
// Now any user defined text sections from function attributes.
for (Module::iterator F = M.begin(), e = M.end(); F != e; ++F)
if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage())
- TextSections.insert(TLOFMacho.SectionForGlobal(F, Mang, TM));
+ TextSections.insert(TLOFMacho.SectionForGlobal(F, *Mang, TM));
// Now the coalescable sections.
TextSections.insert(TLOFMacho.getTextCoalSection());
TextSections.insert(TLOFMacho.getConstTextCoalSection());
@@ -480,23 +422,30 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
if (RelocM == Reloc::DynamicNoPIC) {
const MCSection *sect =
OutContext.getMachOSection("__TEXT", "__symbol_stub4",
- MCSectionMachO::S_SYMBOL_STUBS,
+ MachO::S_SYMBOL_STUBS,
12, SectionKind::getText());
OutStreamer.SwitchSection(sect);
} else {
const MCSection *sect =
OutContext.getMachOSection("__TEXT", "__picsymbolstub4",
- MCSectionMachO::S_SYMBOL_STUBS,
+ MachO::S_SYMBOL_STUBS,
16, SectionKind::getText());
OutStreamer.SwitchSection(sect);
}
const MCSection *StaticInitSect =
OutContext.getMachOSection("__TEXT", "__StaticInit",
- MCSectionMachO::S_REGULAR |
- MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ MachO::S_REGULAR |
+ MachO::S_ATTR_PURE_INSTRUCTIONS,
SectionKind::getText());
OutStreamer.SwitchSection(StaticInitSect);
}
+
+ // Compiling with debug info should not affect the code
+ // generation. Ensure the cstring section comes before the
+ // optional __DWARF section. Otherwise, PC-relative loads would
+ // have to use different instruction sequences at "-g" in order to
+ // reach global data in the same object file.
+ OutStreamer.SwitchSection(getObjFileLowering().getCStringSection());
}
// Use unified assembler syntax.
@@ -509,7 +458,7 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
- if (Subtarget->isTargetDarwin()) {
+ if (Subtarget->isTargetMachO()) {
// All darwin targets use mach-o.
const TargetLoweringObjectFileMachO &TLOFMacho =
static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
@@ -611,28 +560,33 @@ static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU,
}
void ARMAsmPrinter::emitAttributes() {
- MCTargetStreamer &TS = OutStreamer.getTargetStreamer();
+ MCTargetStreamer &TS = *OutStreamer.getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
ATS.switchVendor("aeabi");
std::string CPUString = Subtarget->getCPUString();
- if (CPUString != "generic")
+ // FIXME: remove krait check when GNU tools support krait cpu
+ if (CPUString != "generic" && CPUString != "krait")
ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString);
ATS.emitAttribute(ARMBuildAttrs::CPU_arch,
getArchForCPU(CPUString, Subtarget));
- if (Subtarget->isAClass()) {
- ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
- ARMBuildAttrs::ApplicationProfile);
- } else if (Subtarget->isRClass()) {
- ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
- ARMBuildAttrs::RealTimeProfile);
- } else if (Subtarget->isMClass()){
- ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
- ARMBuildAttrs::MicroControllerProfile);
+ // Tag_CPU_arch_profile must have the default value of 0 when "Architecture
+ // profile is not applicable (e.g. pre v7, or cross-profile code)".
+ if (Subtarget->hasV7Ops()) {
+ if (Subtarget->isAClass()) {
+ ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::ApplicationProfile);
+ } else if (Subtarget->isRClass()) {
+ ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::RealTimeProfile);
+ } else if (Subtarget->isMClass()) {
+ ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::MicroControllerProfile);
+ }
}
ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use, Subtarget->hasARMOps() ?
@@ -687,10 +641,10 @@ void ARMAsmPrinter::emitAttributes() {
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
ARMBuildAttrs::AllowIEE754);
- // FIXME: add more flags to ARMBuildAttrs.h
+ // FIXME: add more flags to ARMBuildAttributes.h
// 8-bytes alignment stuff.
- ATS.emitAttribute(ARMBuildAttrs::ABI_align8_needed, 1);
- ATS.emitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1);
+ ATS.emitAttribute(ARMBuildAttrs::ABI_align_needed, 1);
+ ATS.emitAttribute(ARMBuildAttrs::ABI_align_preserved, 1);
// ABI_HardFP_use attribute to indicate single precision FP.
if (Subtarget->isFPOnlySP())
@@ -709,12 +663,14 @@ void ARMAsmPrinter::emitAttributes() {
if (Subtarget->hasMPExtension())
ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP);
- if (Subtarget->hasDivide()) {
- // Check if hardware divide is only available in thumb2 or ARM as well.
- ATS.emitAttribute(ARMBuildAttrs::DIV_use,
- Subtarget->hasDivideInARMMode() ? ARMBuildAttrs::AllowDIVExt :
- ARMBuildAttrs::AllowDIVIfExists);
- }
+ // Hardware divide in ARM mode is part of base arch, starting from ARMv8.
+ // If only Thumb hwdiv is present, it must also be in base arch (ARMv7-R/M).
+ // It is not possible to produce DisallowDIV: if hwdiv is present in the base
+ // arch, supplying -hwdiv downgrades the effective arch, via ClearImpliedBits.
+ // AllowDIVExt is only emitted if hwdiv isn't available in the base arch;
+ // otherwise, the default value (AllowDIVIfExists) applies.
+ if (Subtarget->hasDivideInARMMode() && !Subtarget->hasV8Ops())
+ ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt);
if (Subtarget->hasTrustZone() && Subtarget->hasVirtualization())
ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
@@ -729,28 +685,6 @@ void ARMAsmPrinter::emitAttributes() {
ATS.finishAttributeSection();
}
-void ARMAsmPrinter::emitARMAttributeSection() {
- // <format-version>
- // [ <section-length> "vendor-name"
- // [ <file-tag> <size> <attribute>*
- // | <section-tag> <size> <section-number>* 0 <attribute>*
- // | <symbol-tag> <size> <symbol-number>* 0 <attribute>*
- // ]+
- // ]*
-
- if (OutStreamer.hasRawTextSupport())
- return;
-
- const ARMElfTargetObjectFile &TLOFELF =
- static_cast<const ARMElfTargetObjectFile &>
- (getObjFileLowering());
-
- OutStreamer.SwitchSection(TLOFELF.getAttributesSection());
-
- // Format version
- OutStreamer.EmitIntValue(0x41, 1);
-}
-
//===----------------------------------------------------------------------===//
static MCSymbol *getPICLabel(const char *Prefix, unsigned FunctionNumber,
@@ -765,23 +699,25 @@ static MCSymbolRefExpr::VariantKind
getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {
switch (Modifier) {
case ARMCP::no_modifier: return MCSymbolRefExpr::VK_None;
- case ARMCP::TLSGD: return MCSymbolRefExpr::VK_ARM_TLSGD;
- case ARMCP::TPOFF: return MCSymbolRefExpr::VK_ARM_TPOFF;
- case ARMCP::GOTTPOFF: return MCSymbolRefExpr::VK_ARM_GOTTPOFF;
- case ARMCP::GOT: return MCSymbolRefExpr::VK_ARM_GOT;
- case ARMCP::GOTOFF: return MCSymbolRefExpr::VK_ARM_GOTOFF;
+ case ARMCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD;
+ case ARMCP::TPOFF: return MCSymbolRefExpr::VK_TPOFF;
+ case ARMCP::GOTTPOFF: return MCSymbolRefExpr::VK_GOTTPOFF;
+ case ARMCP::GOT: return MCSymbolRefExpr::VK_GOT;
+ case ARMCP::GOTOFF: return MCSymbolRefExpr::VK_GOTOFF;
}
llvm_unreachable("Invalid ARMCPModifier!");
}
-MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) {
- bool isIndirect = Subtarget->isTargetDarwin() &&
+MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
+ unsigned char TargetFlags) {
+ bool isIndirect = Subtarget->isTargetMachO() &&
+ (TargetFlags & ARMII::MO_NONLAZY) &&
Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
if (!isIndirect)
return getSymbol(GV);
// FIXME: Remove this when Darwin transition to @GOT like syntax.
- MCSymbol *MCSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MCSymbol *MCSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
MachineModuleInfoMachO &MMIMachO =
MMI->getObjFileInfo<MachineModuleInfoMachO>();
MachineModuleInfoImpl::StubValueTy &StubSym =
@@ -795,6 +731,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) {
void ARMAsmPrinter::
EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+ const DataLayout *DL = TM.getDataLayout();
int Size = TM.getDataLayout()->getTypeAllocSize(MCPV->getType());
ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
@@ -803,7 +740,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
if (ACPV->isLSDA()) {
SmallString<128> Str;
raw_svector_ostream OS(Str);
- OS << MAI->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber();
+ OS << DL->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber();
MCSym = OutContext.GetOrCreateSymbol(OS.str());
} else if (ACPV->isBlockAddress()) {
const BlockAddress *BA =
@@ -811,7 +748,11 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
MCSym = GetBlockAddressSymbol(BA);
} else if (ACPV->isGlobalValue()) {
const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV();
- MCSym = GetARMGVSymbol(GV);
+
+ // On Darwin, const-pool entries may get the "FOO$non_lazy_ptr" mangling, so
+ // flag the global as MO_NONLAZY.
+ unsigned char TF = Subtarget->isTargetMachO() ? ARMII::MO_NONLAZY : 0;
+ MCSym = GetARMGVSymbol(GV, TF);
} else if (ACPV->isMachineBasicBlock()) {
const MachineBasicBlock *MBB = cast<ARMConstantPoolMBB>(ACPV)->getMBB();
MCSym = MBB->getSymbol();
@@ -827,7 +768,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
OutContext);
if (ACPV->getPCAdjustment()) {
- MCSymbol *PCLabel = getPICLabel(MAI->getPrivateGlobalPrefix(),
+ MCSymbol *PCLabel = getPICLabel(DL->getPrivateGlobalPrefix(),
getFunctionNumber(),
ACPV->getLabelId(),
OutContext);
@@ -932,7 +873,7 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
OutContext);
// If this isn't a TBB or TBH, the entries are direct branch instructions.
if (OffsetWidth == 4) {
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::t2B)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::t2B)
.addExpr(MBBSymbolExpr)
.addImm(ARMCC::AL)
.addReg(0));
@@ -966,7 +907,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
assert(MI->getFlag(MachineInstr::FrameSetup) &&
"Only instruction which are involved into frame setup code are allowed");
- MCTargetStreamer &TS = OutStreamer.getTargetStreamer();
+ MCTargetStreamer &TS = *OutStreamer.getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
const MachineFunction &MF = *MI->getParent()->getParent();
const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
@@ -1084,11 +1025,11 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
// instruction.
ATS.emitPad(Offset);
} else {
- MI->dump();
- llvm_unreachable("Unsupported opcode for unwinding information");
+ // Move of SP to a register. Positive values correspond to an "add"
+ // instruction.
+ ATS.emitMovSP(DstReg, -Offset);
}
} else if (DstReg == ARM::SP) {
- // FIXME: .movsp goes here
MI->dump();
llvm_unreachable("Unsupported opcode for unwinding information");
}
@@ -1099,13 +1040,13 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
}
}
-extern cl::opt<bool> EnableARMEHABI;
-
// Simple pseudo-instructions have their lowering (with expansion to real
// instructions) auto-generated.
#include "ARMGenMCPseudoLowering.inc"
void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ const DataLayout *DL = TM.getDataLayout();
+
// If we just ended a constant pool, mark it as such.
if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
@@ -1113,7 +1054,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
// Emit unwinding stuff for frame-related instructions
- if (EnableARMEHABI && MI->getFlag(MachineInstr::FrameSetup))
+ if (Subtarget->isTargetEHABICompatible() &&
+ MI->getFlag(MachineInstr::FrameSetup))
EmitUnwindingInstruction(MI);
// Do any auto-generated pseudo lowerings.
@@ -1133,7 +1075,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case ARM::t2LEApcrel: {
// FIXME: Need to also handle globals and externals
MCSymbol *CPISymbol = GetCPISymbol(MI->getOperand(1).getIndex());
- OutStreamer.EmitInstruction(MCInstBuilder(MI->getOpcode() ==
+ EmitToStreamer(OutStreamer, MCInstBuilder(MI->getOpcode() ==
ARM::t2LEApcrel ? ARM::t2ADR
: (MI->getOpcode() == ARM::tLEApcrel ? ARM::tADR
: ARM::ADR))
@@ -1150,7 +1092,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbol *JTIPICSymbol =
GetARMJTIPICJumpTableLabel2(MI->getOperand(1).getIndex(),
MI->getOperand(2).getImm());
- OutStreamer.EmitInstruction(MCInstBuilder(MI->getOpcode() ==
+ EmitToStreamer(OutStreamer, MCInstBuilder(MI->getOpcode() ==
ARM::t2LEApcrelJT ? ARM::t2ADR
: (MI->getOpcode() == ARM::tLEApcrelJT ? ARM::tADR
: ARM::ADR))
@@ -1164,7 +1106,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Darwin call instructions are just normal call instructions with different
// clobber semantics (they clobber R9).
case ARM::BX_CALL: {
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVr)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::MOVr)
.addReg(ARM::LR)
.addReg(ARM::PC)
// Add predicate operands.
@@ -1173,19 +1115,19 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Add 's' bit operand (always reg0 for this)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::BX)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::BX)
.addReg(MI->getOperand(0).getReg()));
return;
}
case ARM::tBX_CALL: {
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tMOVr)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVr)
.addReg(ARM::LR)
.addReg(ARM::PC)
// Add predicate operands.
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tBX)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tBX)
.addReg(MI->getOperand(0).getReg())
// Add predicate operands.
.addImm(ARMCC::AL)
@@ -1193,7 +1135,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case ARM::BMOVPCRX_CALL: {
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVr)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::MOVr)
.addReg(ARM::LR)
.addReg(ARM::PC)
// Add predicate operands.
@@ -1202,7 +1144,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Add 's' bit operand (always reg0 for this)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVr)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::MOVr)
.addReg(ARM::PC)
.addReg(MI->getOperand(0).getReg())
// Add predicate operands.
@@ -1213,7 +1155,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case ARM::BMOVPCB_CALL: {
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVr)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::MOVr)
.addReg(ARM::LR)
.addReg(ARM::PC)
// Add predicate operands.
@@ -1225,7 +1167,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const GlobalValue *GV = MI->getOperand(0).getGlobal();
MCSymbol *GVSym = getSymbol(GV);
const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::Bcc)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::Bcc)
.addExpr(GVSymExpr)
// Add predicate operands.
.addImm(ARMCC::AL)
@@ -1239,33 +1181,28 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
unsigned TF = MI->getOperand(1).getTargetFlags();
- bool isPIC = TF == ARMII::MO_LO16_NONLAZY_PIC;
const GlobalValue *GV = MI->getOperand(1).getGlobal();
- MCSymbol *GVSym = GetARMGVSymbol(GV);
+ MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
- if (isPIC) {
- MCSymbol *LabelSym = getPICLabel(MAI->getPrivateGlobalPrefix(),
- getFunctionNumber(),
- MI->getOperand(2).getImm(), OutContext);
- const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext);
- unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4;
- const MCExpr *PCRelExpr =
- ARMMCExpr::CreateLower16(MCBinaryExpr::CreateSub(GVSymExpr,
- MCBinaryExpr::CreateAdd(LabelSymExpr,
+
+ MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(),
+ getFunctionNumber(),
+ MI->getOperand(2).getImm(), OutContext);
+ const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext);
+ unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4;
+ const MCExpr *PCRelExpr =
+ ARMMCExpr::CreateLower16(MCBinaryExpr::CreateSub(GVSymExpr,
+ MCBinaryExpr::CreateAdd(LabelSymExpr,
MCConstantExpr::Create(PCAdj, OutContext),
- OutContext), OutContext), OutContext);
+ OutContext), OutContext), OutContext);
TmpInst.addOperand(MCOperand::CreateExpr(PCRelExpr));
- } else {
- const MCExpr *RefExpr= ARMMCExpr::CreateLower16(GVSymExpr, OutContext);
- TmpInst.addOperand(MCOperand::CreateExpr(RefExpr));
- }
// Add predicate operands.
TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
TmpInst.addOperand(MCOperand::CreateReg(0));
// Add 's' bit operand (always reg0 for this)
TmpInst.addOperand(MCOperand::CreateReg(0));
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
return;
}
case ARM::MOVTi16_ga_pcrel:
@@ -1277,32 +1214,27 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
unsigned TF = MI->getOperand(2).getTargetFlags();
- bool isPIC = TF == ARMII::MO_HI16_NONLAZY_PIC;
const GlobalValue *GV = MI->getOperand(2).getGlobal();
- MCSymbol *GVSym = GetARMGVSymbol(GV);
+ MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
- if (isPIC) {
- MCSymbol *LabelSym = getPICLabel(MAI->getPrivateGlobalPrefix(),
- getFunctionNumber(),
- MI->getOperand(3).getImm(), OutContext);
- const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext);
- unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4;
- const MCExpr *PCRelExpr =
+
+ MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(),
+ getFunctionNumber(),
+ MI->getOperand(3).getImm(), OutContext);
+ const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext);
+ unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4;
+ const MCExpr *PCRelExpr =
ARMMCExpr::CreateUpper16(MCBinaryExpr::CreateSub(GVSymExpr,
MCBinaryExpr::CreateAdd(LabelSymExpr,
MCConstantExpr::Create(PCAdj, OutContext),
OutContext), OutContext), OutContext);
TmpInst.addOperand(MCOperand::CreateExpr(PCRelExpr));
- } else {
- const MCExpr *RefExpr= ARMMCExpr::CreateUpper16(GVSymExpr, OutContext);
- TmpInst.addOperand(MCOperand::CreateExpr(RefExpr));
- }
// Add predicate operands.
TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
TmpInst.addOperand(MCOperand::CreateReg(0));
// Add 's' bit operand (always reg0 for this)
TmpInst.addOperand(MCOperand::CreateReg(0));
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
return;
}
case ARM::tPICADD: {
@@ -1312,12 +1244,12 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// This adds the address of LPC0 to r0.
// Emit the label.
- OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(),
+ OutStreamer.EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(),
getFunctionNumber(), MI->getOperand(2).getImm(),
OutContext));
// Form and emit the add.
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tADDhirr)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tADDhirr)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(0).getReg())
.addReg(ARM::PC)
@@ -1333,12 +1265,12 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// This adds the address of LPC0 to r0.
// Emit the label.
- OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(),
+ OutStreamer.EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(),
getFunctionNumber(), MI->getOperand(2).getImm(),
OutContext));
// Form and emit the add.
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::ADDrr)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::ADDrr)
.addReg(MI->getOperand(0).getReg())
.addReg(ARM::PC)
.addReg(MI->getOperand(1).getReg())
@@ -1364,7 +1296,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// a PC-relative address at the ldr instruction.
// Emit the label.
- OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(),
+ OutStreamer.EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(),
getFunctionNumber(), MI->getOperand(2).getImm(),
OutContext));
@@ -1382,7 +1314,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case ARM::PICLDRSB: Opcode = ARM::LDRSB; break;
case ARM::PICLDRSH: Opcode = ARM::LDRSH; break;
}
- OutStreamer.EmitInstruction(MCInstBuilder(Opcode)
+ EmitToStreamer(OutStreamer, MCInstBuilder(Opcode)
.addReg(MI->getOperand(0).getReg())
.addReg(ARM::PC)
.addReg(MI->getOperand(1).getReg())
@@ -1419,7 +1351,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
case ARM::t2BR_JT: {
// Lower and emit the instruction itself, then the jump table following it.
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tMOVr)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVr)
.addReg(ARM::PC)
.addReg(MI->getOperand(0).getReg())
// Add predicate operands.
@@ -1432,7 +1364,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
case ARM::t2TBB_JT: {
// Lower and emit the instruction itself, then the jump table following it.
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::t2TBB)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::t2TBB)
.addReg(ARM::PC)
.addReg(MI->getOperand(0).getReg())
// Add predicate operands.
@@ -1447,7 +1379,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
case ARM::t2TBH_JT: {
// Lower and emit the instruction itself, then the jump table following it.
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::t2TBH)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::t2TBH)
.addReg(ARM::PC)
.addReg(MI->getOperand(0).getReg())
// Add predicate operands.
@@ -1474,7 +1406,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Add 's' bit operand (always reg0 for this)
if (Opc == ARM::MOVr)
TmpInst.addOperand(MCOperand::CreateReg(0));
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
// Make sure the Thumb jump table is 4-byte aligned.
if (Opc == ARM::tMOVr)
@@ -1504,7 +1436,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Add predicate operands.
TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
TmpInst.addOperand(MCOperand::CreateReg(0));
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
// Output the data for the jump table itself
EmitJumpTable(MI);
@@ -1513,7 +1445,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case ARM::BR_JTadd: {
// Lower and emit the instruction itself, then the jump table following it.
// add pc, target, idx
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::ADDrr)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::ADDrr)
.addReg(ARM::PC)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
@@ -1530,7 +1462,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case ARM::TRAP: {
// Non-Darwin binutils don't yet support the "trap" mnemonic.
// FIXME: Remove this special case when they do.
- if (!Subtarget->isTargetDarwin()) {
+ if (!Subtarget->isTargetMachO()) {
//.long 0xe7ffdefe @ trap
uint32_t Val = 0xe7ffdefeUL;
OutStreamer.AddComment("trap");
@@ -1549,7 +1481,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case ARM::tTRAP: {
// Non-Darwin binutils don't yet support the "trap" mnemonic.
// FIXME: Remove this special case when they do.
- if (!Subtarget->isTargetDarwin()) {
+ if (!Subtarget->isTargetMachO()) {
//.short 57086 @ trap
uint16_t Val = 0xdefe;
OutStreamer.AddComment("trap");
@@ -1573,14 +1505,14 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
unsigned ValReg = MI->getOperand(1).getReg();
MCSymbol *Label = GetARMSJLJEHLabel();
OutStreamer.AddComment("eh_setjmp begin");
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tMOVr)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVr)
.addReg(ValReg)
.addReg(ARM::PC)
// Predicate.
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tADDi3)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tADDi3)
.addReg(ValReg)
// 's' bit operand
.addReg(ARM::CPSR)
@@ -1590,7 +1522,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tSTRi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tSTRi)
.addReg(ValReg)
.addReg(SrcReg)
// The offset immediate is #4. The operand value is scaled by 4 for the
@@ -1600,7 +1532,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tMOVi8)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVi8)
.addReg(ARM::R0)
.addReg(ARM::CPSR)
.addImm(0)
@@ -1609,13 +1541,13 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addReg(0));
const MCExpr *SymbolExpr = MCSymbolRefExpr::Create(Label, OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tB)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tB)
.addExpr(SymbolExpr)
.addImm(ARMCC::AL)
.addReg(0));
OutStreamer.AddComment("eh_setjmp end");
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tMOVi8)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVi8)
.addReg(ARM::R0)
.addReg(ARM::CPSR)
.addImm(1)
@@ -1639,7 +1571,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
unsigned ValReg = MI->getOperand(1).getReg();
OutStreamer.AddComment("eh_setjmp begin");
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::ADDri)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::ADDri)
.addReg(ValReg)
.addReg(ARM::PC)
.addImm(8)
@@ -1649,7 +1581,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// 's' bit operand (always reg0 for this).
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::STRi12)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::STRi12)
.addReg(ValReg)
.addReg(SrcReg)
.addImm(4)
@@ -1657,7 +1589,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::MOVi)
.addReg(ARM::R0)
.addImm(0)
// Predicate.
@@ -1666,7 +1598,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// 's' bit operand (always reg0 for this).
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::ADDri)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::ADDri)
.addReg(ARM::PC)
.addReg(ARM::PC)
.addImm(0)
@@ -1677,7 +1609,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addReg(0));
OutStreamer.AddComment("eh_setjmp end");
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::MOVi)
.addReg(ARM::R0)
.addImm(1)
// Predicate.
@@ -1694,7 +1626,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// bx $scratch
unsigned SrcReg = MI->getOperand(0).getReg();
unsigned ScratchReg = MI->getOperand(1).getReg();
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::LDRi12)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::LDRi12)
.addReg(ARM::SP)
.addReg(SrcReg)
.addImm(8)
@@ -1702,7 +1634,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::LDRi12)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::LDRi12)
.addReg(ScratchReg)
.addReg(SrcReg)
.addImm(4)
@@ -1710,7 +1642,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::LDRi12)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::LDRi12)
.addReg(ARM::R7)
.addReg(SrcReg)
.addImm(0)
@@ -1718,7 +1650,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::BX)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::BX)
.addReg(ScratchReg)
// Predicate.
.addImm(ARMCC::AL)
@@ -1733,7 +1665,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// bx $scratch
unsigned SrcReg = MI->getOperand(0).getReg();
unsigned ScratchReg = MI->getOperand(1).getReg();
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tLDRi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tLDRi)
.addReg(ScratchReg)
.addReg(SrcReg)
// The offset immediate is #8. The operand value is scaled by 4 for the
@@ -1743,14 +1675,14 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tMOVr)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tMOVr)
.addReg(ARM::SP)
.addReg(ScratchReg)
// Predicate.
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tLDRi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tLDRi)
.addReg(ScratchReg)
.addReg(SrcReg)
.addImm(1)
@@ -1758,7 +1690,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tLDRi)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tLDRi)
.addReg(ARM::R7)
.addReg(SrcReg)
.addImm(0)
@@ -1766,7 +1698,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer.EmitInstruction(MCInstBuilder(ARM::tBX)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM::tBX)
.addReg(ScratchReg)
// Predicate.
.addImm(ARMCC::AL)
@@ -1778,7 +1710,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
LowerARMMachineInstrToMCInst(MI, TmpInst, *this);
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
}
//===----------------------------------------------------------------------===//
@@ -1787,6 +1719,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Force static initialization.
extern "C" void LLVMInitializeARMAsmPrinter() {
- RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget);
- RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget);
+ RegisterAsmPrinter<ARMAsmPrinter> X(TheARMLETarget);
+ RegisterAsmPrinter<ARMAsmPrinter> Y(TheARMBETarget);
+ RegisterAsmPrinter<ARMAsmPrinter> A(TheThumbLETarget);
+ RegisterAsmPrinter<ARMAsmPrinter> B(TheThumbBETarget);
}
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index de72e06..46c2626 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -10,14 +10,16 @@
#ifndef ARMASMPRINTER_H
#define ARMASMPRINTER_H
-#include "ARM.h"
-#include "ARMTargetMachine.h"
+#include "ARMSubtarget.h"
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetMachine.h"
namespace llvm {
+class ARMFunctionInfo;
class MCOperand;
+class MachineConstantPool;
+class MachineOperand;
namespace ARM {
enum DW_ISA {
@@ -49,33 +51,36 @@ public:
Subtarget = &TM.getSubtarget<ARMSubtarget>();
}
- virtual const char *getPassName() const LLVM_OVERRIDE {
+ const char *getPassName() const override {
return "ARM Assembly / Object Emitter";
}
void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O,
const char *Modifier = 0);
- virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) LLVM_OVERRIDE;
- virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) LLVM_OVERRIDE;
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+
+ void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+ const MCSubtargetInfo *EndInfo) const override;
void EmitJumpTable(const MachineInstr *MI);
void EmitJump2Table(const MachineInstr *MI);
- virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE;
- virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE;
+ void EmitInstruction(const MachineInstr *MI) override;
+ bool runOnMachineFunction(MachineFunction &F) override;
- virtual void EmitConstantPool() LLVM_OVERRIDE {
+ void EmitConstantPool() override {
// we emit constant pools customly!
}
- virtual void EmitFunctionBodyEnd() LLVM_OVERRIDE;
- virtual void EmitFunctionEntryLabel() LLVM_OVERRIDE;
- virtual void EmitStartOfAsmFile(Module &M) LLVM_OVERRIDE;
- virtual void EmitEndOfAsmFile(Module &M) LLVM_OVERRIDE;
- virtual void EmitXXStructor(const Constant *CV) LLVM_OVERRIDE;
+ void EmitFunctionBodyEnd() override;
+ void EmitFunctionEntryLabel() override;
+ void EmitStartOfAsmFile(Module &M) override;
+ void EmitEndOfAsmFile(Module &M) override;
+ void EmitXXStructor(const Constant *CV) override;
// lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
@@ -84,9 +89,6 @@ private:
// Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile()
void emitAttributes();
- // Helper for ELF .o only
- void emitARMAttributeSection();
-
// Generic helper used to emit e.g. ARMv5 mul pseudos
void EmitPatchedInstruction(const MachineInstr *MI, unsigned TargetOpc);
@@ -97,13 +99,9 @@ private:
const MachineInstr *MI);
public:
- /// EmitDwarfRegOp - Emit dwarf register operation.
- virtual void EmitDwarfRegOp(const MachineLocation &MLoc, bool Indirect) const
- LLVM_OVERRIDE;
-
- virtual unsigned getISAEncoding() LLVM_OVERRIDE {
+ unsigned getISAEncoding() override {
// ARM/Darwin adds ISA to the DWARF info for each function.
- if (!Subtarget->isTargetDarwin())
+ if (!Subtarget->isTargetMachO())
return 0;
return Subtarget->isThumb() ?
ARM::DW_ISA_ARM_thumb : ARM::DW_ISA_ARM_arm;
@@ -115,13 +113,12 @@ private:
MCSymbol *GetARMSJLJEHLabel() const;
- MCSymbol *GetARMGVSymbol(const GlobalValue *GV);
+ MCSymbol *GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags);
public:
/// EmitMachineConstantPoolValue - Print a machine constantpool value to
/// the .s file.
- virtual void
- EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) LLVM_OVERRIDE;
+ void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMAtomicExpandPass.cpp b/lib/Target/ARM/ARMAtomicExpandPass.cpp
new file mode 100644
index 0000000..18e0783
--- /dev/null
+++ b/lib/Target/ARM/ARMAtomicExpandPass.cpp
@@ -0,0 +1,406 @@
+//===-- ARMAtomicExpandPass.cpp - Expand atomic instructions --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass (at IR level) to replace atomic instructions with
+// appropriate (intrinsic-based) ldrex/strex loops.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-atomic-expand"
+#include "ARM.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+namespace {
+ class ARMAtomicExpandPass : public FunctionPass {
+ const TargetLowering *TLI;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit ARMAtomicExpandPass(const TargetMachine *TM = 0)
+ : FunctionPass(ID), TLI(TM->getTargetLowering()) {}
+
+ bool runOnFunction(Function &F) override;
+ bool expandAtomicInsts(Function &F);
+
+ bool expandAtomicLoad(LoadInst *LI);
+ bool expandAtomicStore(StoreInst *LI);
+ bool expandAtomicRMW(AtomicRMWInst *AI);
+ bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
+
+ AtomicOrdering insertLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord);
+ void insertTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord);
+
+ /// Perform a load-linked operation on Addr, returning a "Value *" with the
+ /// corresponding pointee type. This may entail some non-trivial operations
+ /// to truncate or reconstruct illegal types since intrinsics must be legal
+ Value *loadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord);
+
+ /// Perform a store-conditional operation to Addr. Return the status of the
+  /// store: 0 if it succeeded, non-zero otherwise.
+ Value *storeConditional(IRBuilder<> &Builder, Value *Val, Value *Addr,
+ AtomicOrdering Ord);
+
+ /// Return true if the given (atomic) instruction should be expanded by this
+ /// pass.
+ bool shouldExpandAtomic(Instruction *Inst);
+ };
+}
+
+char ARMAtomicExpandPass::ID = 0;
+
+FunctionPass *llvm::createARMAtomicExpandPass(const TargetMachine *TM) {
+ return new ARMAtomicExpandPass(TM);
+}
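+
+// Typical use (a sketch, not part of this patch): the pass is created from the
+// ARM pass pipeline with a non-null TargetMachine, e.g. something like
+//   addPass(createARMAtomicExpandPass(TM));
+// in ARMPassConfig, so the TLI member above is always initialised from a valid
+// target machine.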
+
+bool ARMAtomicExpandPass::runOnFunction(Function &F) {
+ SmallVector<Instruction *, 1> AtomicInsts;
+
+ // Changing control-flow while iterating through it is a bad idea, so gather a
+ // list of all atomic instructions before we start.
+ for (BasicBlock &BB : F)
+ for (Instruction &Inst : BB) {
+ if (isa<AtomicRMWInst>(&Inst) || isa<AtomicCmpXchgInst>(&Inst) ||
+ (isa<LoadInst>(&Inst) && cast<LoadInst>(&Inst)->isAtomic()) ||
+ (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic()))
+ AtomicInsts.push_back(&Inst);
+ }
+
+ bool MadeChange = false;
+ for (Instruction *Inst : AtomicInsts) {
+ if (!shouldExpandAtomic(Inst))
+ continue;
+
+ if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
+ MadeChange |= expandAtomicRMW(AI);
+ else if (AtomicCmpXchgInst *CI = dyn_cast<AtomicCmpXchgInst>(Inst))
+ MadeChange |= expandAtomicCmpXchg(CI);
+ else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ MadeChange |= expandAtomicLoad(LI);
+ else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ MadeChange |= expandAtomicStore(SI);
+ else
+ llvm_unreachable("Unknown atomic instruction");
+ }
+
+ return MadeChange;
+}
+
+bool ARMAtomicExpandPass::expandAtomicLoad(LoadInst *LI) {
+ // Load instructions don't actually need a leading fence, even in the
+ // SequentiallyConsistent case.
+ AtomicOrdering MemOpOrder =
+ TLI->getInsertFencesForAtomic() ? Monotonic : LI->getOrdering();
+
+ // The only 64-bit load guaranteed to be single-copy atomic by the ARM ARM is
+ // an ldrexd (A3.5.3).
+ IRBuilder<> Builder(LI);
+ Value *Val = loadLinked(Builder, LI->getPointerOperand(), MemOpOrder);
+
+ insertTrailingFence(Builder, LI->getOrdering());
+
+ LI->replaceAllUsesWith(Val);
+ LI->eraseFromParent();
+
+ return true;
+}
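+
+// For example (a sketch, with separate fences enabled): "load atomic i64* %p
+// seq_cst" turns into a call to the ldrexd intrinsic via loadLinked(), the
+// lo/hi halves recombined into a single i64, followed by a trailing
+// "fence seq_cst".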
+
+bool ARMAtomicExpandPass::expandAtomicStore(StoreInst *SI) {
+ // The only atomic 64-bit store on ARM is an strexd that succeeds, which means
+ // we need a loop and the entire instruction is essentially an "atomicrmw
+ // xchg" that ignores the value loaded.
+ IRBuilder<> Builder(SI);
+ AtomicRMWInst *AI =
+ Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
+ SI->getValueOperand(), SI->getOrdering());
+ SI->eraseFromParent();
+
+ // Now we have an appropriate swap instruction, lower it as usual.
+ return expandAtomicRMW(AI);
+}
+
+bool ARMAtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) {
+ AtomicOrdering Order = AI->getOrdering();
+ Value *Addr = AI->getPointerOperand();
+ BasicBlock *BB = AI->getParent();
+ Function *F = BB->getParent();
+ LLVMContext &Ctx = F->getContext();
+
+ // Given: atomicrmw some_op iN* %addr, iN %incr ordering
+ //
+ // The standard expansion we produce is:
+ // [...]
+ // fence?
+ // atomicrmw.start:
+ // %loaded = @load.linked(%addr)
+ // %new = some_op iN %loaded, %incr
+ // %stored = @store_conditional(%new, %addr)
+ // %try_again = icmp i32 ne %stored, 0
+ // br i1 %try_again, label %loop, label %atomicrmw.end
+ // atomicrmw.end:
+ // fence?
+ // [...]
+ BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end");
+ BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
+
+ // This grabs the DebugLoc from AI.
+ IRBuilder<> Builder(AI);
+
+ // The split call above "helpfully" added a branch at the end of BB (to the
+ // wrong place), but we might want a fence too. It's easiest to just remove
+ // the branch entirely.
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+ AtomicOrdering MemOpOrder = insertLeadingFence(Builder, Order);
+ Builder.CreateBr(LoopBB);
+
+ // Start the main loop block now that we've taken care of the preliminaries.
+ Builder.SetInsertPoint(LoopBB);
+ Value *Loaded = loadLinked(Builder, Addr, MemOpOrder);
+
+ Value *NewVal;
+ switch (AI->getOperation()) {
+ case AtomicRMWInst::Xchg:
+ NewVal = AI->getValOperand();
+ break;
+ case AtomicRMWInst::Add:
+ NewVal = Builder.CreateAdd(Loaded, AI->getValOperand(), "new");
+ break;
+ case AtomicRMWInst::Sub:
+ NewVal = Builder.CreateSub(Loaded, AI->getValOperand(), "new");
+ break;
+ case AtomicRMWInst::And:
+ NewVal = Builder.CreateAnd(Loaded, AI->getValOperand(), "new");
+ break;
+ case AtomicRMWInst::Nand:
+ NewVal = Builder.CreateAnd(Loaded, Builder.CreateNot(AI->getValOperand()),
+ "new");
+ break;
+ case AtomicRMWInst::Or:
+ NewVal = Builder.CreateOr(Loaded, AI->getValOperand(), "new");
+ break;
+ case AtomicRMWInst::Xor:
+ NewVal = Builder.CreateXor(Loaded, AI->getValOperand(), "new");
+ break;
+ case AtomicRMWInst::Max:
+ NewVal = Builder.CreateICmpSGT(Loaded, AI->getValOperand());
+ NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
+ break;
+ case AtomicRMWInst::Min:
+ NewVal = Builder.CreateICmpSLE(Loaded, AI->getValOperand());
+ NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
+ break;
+ case AtomicRMWInst::UMax:
+ NewVal = Builder.CreateICmpUGT(Loaded, AI->getValOperand());
+ NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
+ break;
+ case AtomicRMWInst::UMin:
+ NewVal = Builder.CreateICmpULE(Loaded, AI->getValOperand());
+ NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
+ break;
+ default:
+ llvm_unreachable("Unknown atomic op");
+ }
+
+ Value *StoreSuccess = storeConditional(Builder, NewVal, Addr, MemOpOrder);
+ Value *TryAgain = Builder.CreateICmpNE(
+ StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
+ Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
+
+ Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+ insertTrailingFence(Builder, Order);
+
+ AI->replaceAllUsesWith(Loaded);
+ AI->eraseFromParent();
+
+ return true;
+}
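+
+// Concrete sketch of the i32 "add" case (names illustrative, separate fences
+// enabled), for "atomicrmw add i32* %p, i32 %v seq_cst":
+//   fence release
+//   br label %atomicrmw.start
+// atomicrmw.start:
+//   %loaded = call i32 @llvm.arm.ldrex.p0i32(i32* %p)
+//   %new = add i32 %loaded, %v
+//   %stored = call i32 @llvm.arm.strex.p0i32(i32 %new, i32* %p)
+//   %tryagain = icmp ne i32 %stored, 0
+//   br i1 %tryagain, label %atomicrmw.start, label %atomicrmw.end
+// atomicrmw.end:
+//   fence seq_cst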
+
+bool ARMAtomicExpandPass::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
+ AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
+ AtomicOrdering FailureOrder = CI->getFailureOrdering();
+ Value *Addr = CI->getPointerOperand();
+ BasicBlock *BB = CI->getParent();
+ Function *F = BB->getParent();
+ LLVMContext &Ctx = F->getContext();
+
+ // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
+ //
+ // The full expansion we produce is:
+ // [...]
+ // fence?
+ // cmpxchg.start:
+ // %loaded = @load.linked(%addr)
+ // %should_store = icmp eq %loaded, %desired
+ // br i1 %should_store, label %cmpxchg.trystore,
+ // label %cmpxchg.end/%cmpxchg.barrier
+ // cmpxchg.trystore:
+ // %stored = @store_conditional(%new, %addr)
+ // %try_again = icmp i32 ne %stored, 0
+ // br i1 %try_again, label %loop, label %cmpxchg.end
+ // cmpxchg.barrier:
+ // fence?
+ // br label %cmpxchg.end
+ // cmpxchg.end:
+ // [...]
+ BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end");
+  auto BarrierBB = BasicBlock::Create(Ctx, "cmpxchg.barrier", F, ExitBB);
+  auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, BarrierBB);
+ auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB);
+
+ // This grabs the DebugLoc from CI
+ IRBuilder<> Builder(CI);
+
+ // The split call above "helpfully" added a branch at the end of BB (to the
+ // wrong place), but we might want a fence too. It's easiest to just remove
+ // the branch entirely.
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+ AtomicOrdering MemOpOrder = insertLeadingFence(Builder, SuccessOrder);
+ Builder.CreateBr(LoopBB);
+
+ // Start the main loop block now that we've taken care of the preliminaries.
+ Builder.SetInsertPoint(LoopBB);
+ Value *Loaded = loadLinked(Builder, Addr, MemOpOrder);
+ Value *ShouldStore =
+ Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store");
+
+  // If the cmpxchg doesn't actually need any ordering when it fails, we can
+ // jump straight past that fence instruction (if it exists).
+ BasicBlock *FailureBB = FailureOrder == Monotonic ? ExitBB : BarrierBB;
+ Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB);
+
+ Builder.SetInsertPoint(TryStoreBB);
+ Value *StoreSuccess =
+ storeConditional(Builder, CI->getNewValOperand(), Addr, MemOpOrder);
+ Value *TryAgain = Builder.CreateICmpNE(
+ StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
+ Builder.CreateCondBr(TryAgain, LoopBB, BarrierBB);
+
+ // Finally, make sure later instructions don't get reordered with a fence if
+ // necessary.
+ Builder.SetInsertPoint(BarrierBB);
+ insertTrailingFence(Builder, SuccessOrder);
+ Builder.CreateBr(ExitBB);
+
+ CI->replaceAllUsesWith(Loaded);
+ CI->eraseFromParent();
+
+ return true;
+}
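+
+// As a sketch: a seq_cst/monotonic i32 cmpxchg becomes an ldrex/strex loop
+// guarded by "icmp eq %loaded, %desired"; on failure it branches straight to
+// %cmpxchg.end, skipping the trailing fence in %cmpxchg.barrier.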
+
+Value *ARMAtomicExpandPass::loadLinked(IRBuilder<> &Builder, Value *Addr,
+ AtomicOrdering Ord) {
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+ bool IsAcquire =
+ Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+ // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
+ // intrinsic must return {i32, i32} and we have to recombine them into a
+ // single i64 here.
+ if (ValTy->getPrimitiveSizeInBits() == 64) {
+ Intrinsic::ID Int =
+ IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
+ Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);
+
+ Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+ Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
+
+ Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+ Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+ Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+ Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+ return Builder.CreateOr(
+ Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
+ }
+
+ Type *Tys[] = { Addr->getType() };
+ Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
+ Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+
+ return Builder.CreateTruncOrBitCast(
+ Builder.CreateCall(Ldrex, Addr),
+ cast<PointerType>(Addr->getType())->getElementType());
+}
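+
+// Sketch of the i64 path above:
+//   %lohi = call { i32, i32 } @llvm.arm.ldrexd(i8* %p)
+// followed by extractvalue/zext/shl/or to rebuild the single i64 result.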
+
+Value *ARMAtomicExpandPass::storeConditional(IRBuilder<> &Builder, Value *Val,
+ Value *Addr, AtomicOrdering Ord) {
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ bool IsRelease =
+ Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+ // Since the intrinsics must have legal type, the i64 intrinsics take two
+ // parameters: "i32, i32". We must marshal Val into the appropriate form
+ // before the call.
+ if (Val->getType()->getPrimitiveSizeInBits() == 64) {
+ Intrinsic::ID Int =
+ IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
+ Function *Strex = Intrinsic::getDeclaration(M, Int);
+ Type *Int32Ty = Type::getInt32Ty(M->getContext());
+
+ Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
+ Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
+ Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+ return Builder.CreateCall3(Strex, Lo, Hi, Addr);
+ }
+
+ Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
+ Type *Tys[] = { Addr->getType() };
+ Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
+
+ return Builder.CreateCall2(
+ Strex, Builder.CreateZExtOrBitCast(
+ Val, Strex->getFunctionType()->getParamType(0)),
+ Addr);
+}
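+
+// Sketch of the i64 path above: the value is split with trunc/lshr into two
+// i32 halves and stored via
+//   call i32 @llvm.arm.strexd(i32 %lo, i32 %hi, i8* %p)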
+
+AtomicOrdering ARMAtomicExpandPass::insertLeadingFence(IRBuilder<> &Builder,
+ AtomicOrdering Ord) {
+ if (!TLI->getInsertFencesForAtomic())
+ return Ord;
+
+ if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
+ Builder.CreateFence(Release);
+
+ // The exclusive operations don't need any barrier if we're adding separate
+ // fences.
+ return Monotonic;
+}
+
+void ARMAtomicExpandPass::insertTrailingFence(IRBuilder<> &Builder,
+ AtomicOrdering Ord) {
+ if (!TLI->getInsertFencesForAtomic())
+ return;
+
+ if (Ord == Acquire || Ord == AcquireRelease)
+ Builder.CreateFence(Acquire);
+ else if (Ord == SequentiallyConsistent)
+ Builder.CreateFence(SequentiallyConsistent);
+}
+
+bool ARMAtomicExpandPass::shouldExpandAtomic(Instruction *Inst) {
+  // Loads and stores less than 64 bits are already atomic; ones above that
+ // are doomed anyway, so defer to the default libcall and blame the OS when
+ // things go wrong:
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 64;
+ else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ return LI->getType()->getPrimitiveSizeInBits() == 64;
+
+ // For the real atomic operations, we have ldrex/strex up to 64 bits.
+ return Inst->getType()->getPrimitiveSizeInBits() <= 64;
+}
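+
+// In effect (a sketch of the policy): only i64 atomic loads/stores are
+// expanded here (smaller ones are already single-copy atomic), while
+// atomicrmw/cmpxchg of 64 bits or less get the ldrex/strex treatment;
+// anything wider falls back to the generic lowering.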
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index f835a4e..47f5bf9 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -283,7 +283,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
// Walk backwards from the end of the basic block until the branch is
// analyzed or we give up.
- while (isPredicated(I) || I->isTerminator()) {
+ while (isPredicated(I) || I->isTerminator() || I->isDebugValue()) {
// Flag to be raised on unanalyzeable instructions. This is useful in cases
// where we want to clean up on the end of the basic block before we bail
@@ -336,7 +336,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
// If we can modify the function, delete everything below this
// unconditional branch.
if (AllowModify) {
- MachineBasicBlock::iterator DI = llvm::next(I);
+ MachineBasicBlock::iterator DI = std::next(I);
while (DI != MBB.end()) {
MachineInstr *InstToDelete = DI;
++DI;
@@ -535,6 +535,20 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const {
return true;
}
+template<> bool IsCPSRDead<MachineInstr>(MachineInstr* MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || MO.isUndef() || MO.isUse())
+ continue;
+ if (MO.getReg() != ARM::CPSR)
+ continue;
+ if (!MO.isDead())
+ return false;
+ }
+ // all definitions of CPSR are dead
+ return true;
+}
+
/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing.
LLVM_ATTRIBUTE_NOINLINE
static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
@@ -559,15 +573,10 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
// If this machine instr is an inline asm, measure it.
if (MI->getOpcode() == ARM::INLINEASM)
return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI);
- if (MI->isLabel())
- return 0;
unsigned Opc = MI->getOpcode();
switch (Opc) {
- case TargetOpcode::IMPLICIT_DEF:
- case TargetOpcode::KILL:
- case TargetOpcode::PROLOG_LABEL:
- case TargetOpcode::EH_LABEL:
- case TargetOpcode::DBG_VALUE:
+ default:
+ // pseudo-instruction sizes are zero.
return 0;
case TargetOpcode::BUNDLE:
return getInstBundleLength(MI);
@@ -630,9 +639,6 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
++NumEntries;
return NumEntries * EntrySize + InstSize;
}
- default:
- // Otherwise, pseudo-instruction sizes are zero.
- return 0;
}
}
@@ -1243,6 +1249,7 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
unsigned PCLabelId = AFI->createPICLabelUId();
ARMConstantPoolValue *NewCPV = 0;
+
// FIXME: The below assumes PIC relocation model and that the function
// is Thumb mode (t1 or t2). PCAdjustment would be 8 for ARM mode PIC, and
// zero for non-PIC in ARM or Thumb. The callers are all of thumb LDR
@@ -1325,10 +1332,11 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
Opcode == ARM::t2LDRpci_pic ||
Opcode == ARM::tLDRpci ||
Opcode == ARM::tLDRpci_pic ||
- Opcode == ARM::MOV_ga_dyn ||
+ Opcode == ARM::LDRLIT_ga_pcrel ||
+ Opcode == ARM::LDRLIT_ga_pcrel_ldr ||
+ Opcode == ARM::tLDRLIT_ga_pcrel ||
Opcode == ARM::MOV_ga_pcrel ||
Opcode == ARM::MOV_ga_pcrel_ldr ||
- Opcode == ARM::t2MOV_ga_dyn ||
Opcode == ARM::t2MOV_ga_pcrel) {
if (MI1->getOpcode() != Opcode)
return false;
@@ -1340,10 +1348,11 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
if (MO0.getOffset() != MO1.getOffset())
return false;
- if (Opcode == ARM::MOV_ga_dyn ||
+ if (Opcode == ARM::LDRLIT_ga_pcrel ||
+ Opcode == ARM::LDRLIT_ga_pcrel_ldr ||
+ Opcode == ARM::tLDRLIT_ga_pcrel ||
Opcode == ARM::MOV_ga_pcrel ||
Opcode == ARM::MOV_ga_pcrel_ldr ||
- Opcode == ARM::t2MOV_ga_dyn ||
Opcode == ARM::t2MOV_ga_pcrel)
// Ignore the PC labels.
return MO0.getGlobal() == MO1.getGlobal();
@@ -1534,7 +1543,7 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
return false;
// Terminators and labels can't be scheduled around.
- if (MI->isTerminator() || MI->isLabel())
+ if (MI->isTerminator() || MI->isPosition())
return true;
// Treat the start of the IT block as a scheduling boundary, but schedule
@@ -1857,12 +1866,21 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
}
}
-bool llvm::tryFoldSPUpdateIntoPushPop(MachineFunction &MF,
- MachineInstr *MI,
+static bool isAnySubRegLive(unsigned Reg, const TargetRegisterInfo *TRI,
+ MachineInstr *MI) {
+ for (MCSubRegIterator Subreg(Reg, TRI, /* IncludeSelf */ true);
+ Subreg.isValid(); ++Subreg)
+ if (MI->getParent()->computeRegisterLiveness(TRI, *Subreg, MI) !=
+ MachineBasicBlock::LQR_Dead)
+ return true;
+ return false;
+}
+bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
+ MachineFunction &MF, MachineInstr *MI,
unsigned NumBytes) {
// This optimisation potentially adds lots of load and store
// micro-operations, it's only really a great benefit to code-size.
- if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize))
+ if (!Subtarget.isMinSize())
return false;
// If only one register is pushed/popped, LLVM can use an LDR/STR
@@ -1911,7 +1929,6 @@ bool llvm::tryFoldSPUpdateIntoPushPop(MachineFunction &MF,
for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
RegList.push_back(MI->getOperand(i));
- MachineBasicBlock *MBB = MI->getParent();
const TargetRegisterInfo *TRI = MF.getRegInfo().getTargetRegisterInfo();
const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
@@ -1932,9 +1949,11 @@ bool llvm::tryFoldSPUpdateIntoPushPop(MachineFunction &MF,
// registers live within the function we might clobber a return value
// register; the other way a register can be live here is if it's
// callee-saved.
+ // TODO: Currently, computeRegisterLiveness() does not report "live" if a
+ // sub reg is live. When computeRegisterLiveness() works for sub reg, it
+ // can replace isAnySubRegLive().
if (isCalleeSavedRegister(CurReg, CSRegs) ||
- MBB->computeRegisterLiveness(TRI, CurReg, MI) !=
- MachineBasicBlock::LQR_Dead) {
+ isAnySubRegLive(CurReg, TRI, MI)) {
// VFP pops don't allow holes in the register list, so any skip is fatal
// for our transformation. GPR pops do, so we should just keep looking.
if (IsVFPPushPop)
@@ -2159,7 +2178,7 @@ static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg,
// Walk down one instruction which is potentially an 'and'.
const MachineInstr &Copy = *MI;
MachineBasicBlock::iterator AND(
- llvm::next(MachineBasicBlock::iterator(MI)));
+ std::next(MachineBasicBlock::iterator(MI)));
if (AND == MI->getParent()->end()) return false;
MI = AND;
return isSuitableForMask(MI, Copy.getOperand(0).getReg(),
@@ -2236,8 +2255,9 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
if (CmpMask != ~0) {
if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) {
MI = 0;
- for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(SrcReg),
- UE = MRI->use_end(); UI != UE; ++UI) {
+ for (MachineRegisterInfo::use_instr_iterator
+ UI = MRI->use_instr_begin(SrcReg), UE = MRI->use_instr_end();
+ UI != UE; ++UI) {
if (UI->getParent() != CmpInstr->getParent()) continue;
MachineInstr *PotentialAND = &*UI;
if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) ||
@@ -2947,7 +2967,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
break;
}
return UOps;
- } else if (Subtarget.isCortexA8()) {
+ } else if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
if (NumRegs < 4)
return 2;
// 4 registers would be issued: 2, 2.
@@ -2984,7 +3004,7 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
return ItinData->getOperandCycle(DefClass, DefIdx);
int DefCycle;
- if (Subtarget.isCortexA8()) {
+ if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
// (regno / 2) + (regno % 2) + 1
DefCycle = RegNo / 2 + 1;
if (RegNo % 2)
@@ -3025,7 +3045,7 @@ ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
return ItinData->getOperandCycle(DefClass, DefIdx);
int DefCycle;
- if (Subtarget.isCortexA8()) {
+ if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
// 4 registers would be issued: 1, 2, 1.
// 5 registers would be issued: 1, 2, 2.
DefCycle = RegNo / 2;
@@ -3059,7 +3079,7 @@ ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
return ItinData->getOperandCycle(UseClass, UseIdx);
int UseCycle;
- if (Subtarget.isCortexA8()) {
+ if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
// (regno / 2) + (regno % 2) + 1
UseCycle = RegNo / 2 + 1;
if (RegNo % 2)
@@ -3099,7 +3119,7 @@ ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
return ItinData->getOperandCycle(UseClass, UseIdx);
int UseCycle;
- if (Subtarget.isCortexA8()) {
+ if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
UseCycle = RegNo / 2;
if (UseCycle < 2)
UseCycle = 2;
@@ -3236,8 +3256,7 @@ static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI,
Dist = 0;
MachineBasicBlock::const_iterator I = MI; ++I;
- MachineBasicBlock::const_instr_iterator II =
- llvm::prior(I.getInstrIterator());
+ MachineBasicBlock::const_instr_iterator II = std::prev(I.getInstrIterator());
assert(II->isInsideBundle() && "Empty bundle?");
int Idx = -1;
@@ -3290,7 +3309,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
const MachineInstr *DefMI,
const MCInstrDesc *DefMCID, unsigned DefAlign) {
int Adjust = 0;
- if (Subtarget.isCortexA8() || Subtarget.isLikeA9()) {
+ if (Subtarget.isCortexA8() || Subtarget.isLikeA9() || Subtarget.isCortexA7()) {
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
// variants are one cycle cheaper.
switch (DefMCID->getOpcode()) {
@@ -3591,7 +3610,8 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
UseMCID, UseIdx, UseAlign);
if (Latency > 1 &&
- (Subtarget.isCortexA8() || Subtarget.isLikeA9())) {
+ (Subtarget.isCortexA8() || Subtarget.isLikeA9() ||
+ Subtarget.isCortexA7())) {
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
// variants are one cycle cheaper.
switch (DefMCID.getOpcode()) {
@@ -3684,6 +3704,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::VLD3d16Pseudo:
case ARM::VLD3d32Pseudo:
case ARM::VLD1d64TPseudo:
+ case ARM::VLD1d64TPseudoWB_fixed:
case ARM::VLD3d8Pseudo_UPD:
case ARM::VLD3d16Pseudo_UPD:
case ARM::VLD3d32Pseudo_UPD:
@@ -3700,6 +3721,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::VLD4d16Pseudo:
case ARM::VLD4d32Pseudo:
case ARM::VLD1d64QPseudo:
+ case ARM::VLD1d64QPseudoWB_fixed:
case ARM::VLD4d8Pseudo_UPD:
case ARM::VLD4d16Pseudo_UPD:
case ARM::VLD4d32Pseudo_UPD:
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 93e5964..3ddddcb 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -14,7 +14,7 @@
#ifndef ARMBASEINSTRUCTIONINFO_H
#define ARMBASEINSTRUCTIONINFO_H
-#include "ARM.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -42,37 +42,37 @@ public:
// if there is not such an opcode.
virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
- virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const;
+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const override;
virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
const ARMSubtarget &getSubtarget() const { return Subtarget; }
ScheduleHazardRecognizer *
CreateTargetHazardRecognizer(const TargetMachine *TM,
- const ScheduleDAG *DAG) const;
+ const ScheduleDAG *DAG) const override;
ScheduleHazardRecognizer *
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
- const ScheduleDAG *DAG) const;
+ const ScheduleDAG *DAG) const override;
// Branch analysis.
- virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify = false) const;
- virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
- virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- const SmallVectorImpl<MachineOperand> &Cond,
- DebugLoc DL) const;
-
- virtual
- bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+ bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify = false) const override;
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ DebugLoc DL) const override;
+
+ bool
+ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
// Predication support.
- bool isPredicated(const MachineInstr *MI) const;
+ bool isPredicated(const MachineInstr *MI) const override;
ARMCC::CondCodes getPredicate(const MachineInstr *MI) const {
int PIdx = MI->findFirstPredOperandIdx();
@@ -80,76 +80,73 @@ public:
: ARMCC::AL;
}
- virtual
bool PredicateInstruction(MachineInstr *MI,
- const SmallVectorImpl<MachineOperand> &Pred) const;
+ const SmallVectorImpl<MachineOperand> &Pred) const override;
- virtual
bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
- const SmallVectorImpl<MachineOperand> &Pred2) const;
+ const SmallVectorImpl<MachineOperand> &Pred2) const override;
- virtual bool DefinesPredicate(MachineInstr *MI,
- std::vector<MachineOperand> &Pred) const;
+ bool DefinesPredicate(MachineInstr *MI,
+ std::vector<MachineOperand> &Pred) const override;
- virtual bool isPredicable(MachineInstr *MI) const;
+ bool isPredicable(MachineInstr *MI) const override;
/// GetInstSize - Returns the size of the specified MachineInstr.
///
virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
- virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
- virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const;
- virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const;
- virtual unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const;
+ unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const override;
- virtual void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const;
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
- virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
- virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
- virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+ bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
- virtual void reMaterialize(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned DestReg, unsigned SubIdx,
- const MachineInstr *Orig,
- const TargetRegisterInfo &TRI) const;
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr *Orig,
+ const TargetRegisterInfo &TRI) const override;
- MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const;
+ MachineInstr *duplicate(MachineInstr *Orig,
+ MachineFunction &MF) const override;
- MachineInstr *commuteInstruction(MachineInstr*, bool=false) const;
+ MachineInstr *commuteInstruction(MachineInstr*,
+ bool=false) const override;
const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
unsigned SubIdx, unsigned State,
const TargetRegisterInfo *TRI) const;
- virtual bool produceSameValue(const MachineInstr *MI0,
- const MachineInstr *MI1,
- const MachineRegisterInfo *MRI) const;
+ bool produceSameValue(const MachineInstr *MI0, const MachineInstr *MI1,
+ const MachineRegisterInfo *MRI) const override;
/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
/// determine if two loads are loading from the same base address. It should
/// only return true if the base pointers are the same and the only
/// differences between the two addresses is the offset. It also returns the
/// offsets by reference.
- virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
- int64_t &Offset1, int64_t &Offset2)const;
+ bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+ int64_t &Offset2) const override;
/// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
@@ -159,83 +156,79 @@ public:
/// from the common base address. It returns true if it decides it's desirable
/// to schedule the two loads together. "NumLoads" is the number of loads that
/// have already been scheduled after Load1.
- virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
- int64_t Offset1, int64_t Offset2,
- unsigned NumLoads) const;
-
- virtual bool isSchedulingBoundary(const MachineInstr *MI,
- const MachineBasicBlock *MBB,
- const MachineFunction &MF) const;
-
- virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB,
- unsigned NumCycles, unsigned ExtraPredCycles,
- const BranchProbability &Probability) const;
-
- virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
- unsigned NumT, unsigned ExtraT,
- MachineBasicBlock &FMBB,
- unsigned NumF, unsigned ExtraF,
- const BranchProbability &Probability) const;
-
- virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
- unsigned NumCycles,
- const BranchProbability
- &Probability) const {
+ bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const override;
+
+ bool isSchedulingBoundary(const MachineInstr *MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB,
+ unsigned NumCycles, unsigned ExtraPredCycles,
+ const BranchProbability &Probability) const override;
+
+ bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT,
+ unsigned ExtraT, MachineBasicBlock &FMBB,
+ unsigned NumF, unsigned ExtraF,
+ const BranchProbability &Probability) const override;
+
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ const BranchProbability &Probability) const override {
return NumCycles == 1;
}
- virtual bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
- MachineBasicBlock &FMBB) const;
+ bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+ MachineBasicBlock &FMBB) const override;
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
- virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
- int &CmpValue) const;
+ bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const override;
/// optimizeCompareInstr - Convert the instruction to set the zero flag so
/// that we can remove a "comparison with zero"; Remove a redundant CMP
/// instruction if the flags can be updated in the same way by an earlier
/// instruction such as SUB.
- virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int CmpMask, int CmpValue,
- const MachineRegisterInfo *MRI) const;
+ bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const override;
- virtual bool analyzeSelect(const MachineInstr *MI,
- SmallVectorImpl<MachineOperand> &Cond,
- unsigned &TrueOp, unsigned &FalseOp,
- bool &Optimizable) const;
+ bool analyzeSelect(const MachineInstr *MI,
+ SmallVectorImpl<MachineOperand> &Cond,
+ unsigned &TrueOp, unsigned &FalseOp,
+ bool &Optimizable) const override;
- virtual MachineInstr *optimizeSelect(MachineInstr *MI, bool) const;
+ MachineInstr *optimizeSelect(MachineInstr *MI, bool) const override;
/// FoldImmediate - 'Reg' is known to be defined by a move immediate
/// instruction, try to fold the immediate into the use instruction.
- virtual bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
- unsigned Reg, MachineRegisterInfo *MRI) const;
+ bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+ unsigned Reg, MachineRegisterInfo *MRI) const override;
- virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData,
- const MachineInstr *MI) const;
+ unsigned getNumMicroOps(const InstrItineraryData *ItinData,
+ const MachineInstr *MI) const override;
- virtual
int getOperandLatency(const InstrItineraryData *ItinData,
const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI, unsigned UseIdx) const;
- virtual
+ const MachineInstr *UseMI,
+ unsigned UseIdx) const override;
int getOperandLatency(const InstrItineraryData *ItinData,
SDNode *DefNode, unsigned DefIdx,
- SDNode *UseNode, unsigned UseIdx) const;
+ SDNode *UseNode, unsigned UseIdx) const override;
/// VFP/NEON execution domains.
std::pair<uint16_t, uint16_t>
- getExecutionDomain(const MachineInstr *MI) const;
- void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+ getExecutionDomain(const MachineInstr *MI) const override;
+ void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override;
unsigned getPartialRegUpdateClearance(const MachineInstr*, unsigned,
- const TargetRegisterInfo*) const;
+ const TargetRegisterInfo*) const override;
void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
/// Get the number of addresses by LDM or VLDM or zero for unknown.
unsigned getNumLDMAddresses(const MachineInstr *MI) const;
@@ -264,24 +257,27 @@ private:
const MCInstrDesc &UseMCID,
unsigned UseIdx, unsigned UseAlign) const;
- unsigned getPredicationCost(const MachineInstr *MI) const;
+ unsigned getPredicationCost(const MachineInstr *MI) const override;
unsigned getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr *MI,
- unsigned *PredCost = 0) const;
+ unsigned *PredCost = 0) const override;
int getInstrLatency(const InstrItineraryData *ItinData,
- SDNode *Node) const;
+ SDNode *Node) const override;
bool hasHighOperandLatency(const InstrItineraryData *ItinData,
const MachineRegisterInfo *MRI,
const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI, unsigned UseIdx) const;
+ const MachineInstr *UseMI,
+ unsigned UseIdx) const override;
bool hasLowDefLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx) const;
+ const MachineInstr *DefMI,
+ unsigned DefIdx) const override;
/// verifyInstruction - Perform target specific instruction verification.
- bool verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const;
+ bool verifyInstruction(const MachineInstr *MI,
+ StringRef &ErrInfo) const override;
private:
/// Modeling special VFP / NEON fp MLA / MLS hazards.
@@ -417,7 +413,8 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
/// NumBytes. This can save a few bytes per function in code-size, but
/// obviously generates more memory traffic. As such, it only takes
/// effect in functions being optimised for size.
-bool tryFoldSPUpdateIntoPushPop(MachineFunction &MF, MachineInstr *MI,
+bool tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
+ MachineFunction &MF, MachineInstr *MI,
unsigned NumBytes);
/// rewriteARMFrameIndex / rewriteT2FrameIndex -
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 8717dc0..8130a2d 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -45,7 +45,7 @@ using namespace llvm;
ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti)
: ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti),
- FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11),
+ FramePtr((STI.isTargetMachO() || STI.isThumb()) ? ARM::R7 : ARM::R11),
BasePtr(ARM::R6) {
}
@@ -408,6 +408,11 @@ emitLoadConstPool(MachineBasicBlock &MBB,
.setMIFlags(MIFlags);
}
+bool ARMBaseRegisterInfo::mayOverrideLocalAssignment() const {
+ // The native linux build hits a downstream codegen bug when this is enabled.
+ return STI.isTargetDarwin();
+}
+
bool ARMBaseRegisterInfo::
requiresRegisterScavenging(const MachineFunction &MF) const {
return true;
@@ -590,10 +595,8 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
AddDefaultCC(MIB);
}
-void
-ARMBaseRegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
- unsigned BaseReg, int64_t Offset) const {
- MachineInstr &MI = *I;
+void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const {
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const ARMBaseInstrInfo &TII =
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index e28fff6..66b3c82 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -14,7 +14,7 @@
#ifndef ARMBASEREGISTERINFO_H
#define ARMBASEREGISTERINFO_H
-#include "ARM.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#define GET_REGINFO_HEADER
@@ -42,7 +42,7 @@ static inline bool isARMArea1Register(unsigned Reg, bool isIOS) {
case R4: case R5: case R6: case R7:
case LR: case SP: case PC:
return true;
- case R8: case R9: case R10: case R11:
+ case R8: case R9: case R10: case R11: case R12:
// For iOS we want r7 and lr to be next to each other.
return !isIOS;
default:
@@ -53,7 +53,7 @@ static inline bool isARMArea1Register(unsigned Reg, bool isIOS) {
static inline bool isARMArea2Register(unsigned Reg, bool isIOS) {
using namespace ARM;
switch (Reg) {
- case R8: case R9: case R10: case R11:
+ case R8: case R9: case R10: case R11: case R12:
// iOS has this second area.
return isIOS;
default:
@@ -100,8 +100,9 @@ protected:
public:
/// Code Generation virtual methods...
- const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
- const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+ const uint16_t *
+ getCalleeSavedRegs(const MachineFunction *MF = 0) const override;
+ const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
const uint32_t *getNoPreservedMask() const;
/// getThisReturnPreservedMask - Returns a call preserved mask specific to the
@@ -113,48 +114,51 @@ public:
/// Should return NULL in the case that the calling convention does not have
/// this property
const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
-
- BitVector getReservedRegs(const MachineFunction &MF) const;
- const TargetRegisterClass*
- getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
- const TargetRegisterClass*
- getCrossCopyRegClass(const TargetRegisterClass *RC) const;
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
- const TargetRegisterClass*
- getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+ const TargetRegisterClass *
+ getLargestLegalSuperClass(const TargetRegisterClass *RC) const override;
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
- MachineFunction &MF) const;
+ MachineFunction &MF) const override;
void getRegAllocationHints(unsigned VirtReg,
ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints,
const MachineFunction &MF,
- const VirtRegMap *VRM) const;
+ const VirtRegMap *VRM) const override;
void UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
- MachineFunction &MF) const;
+ MachineFunction &MF) const override;
- virtual bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const;
+ bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const override;
bool hasBasePointer(const MachineFunction &MF) const;
bool canRealignStack(const MachineFunction &MF) const;
- bool needsStackRealignment(const MachineFunction &MF) const;
- int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const;
- bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const;
+ bool needsStackRealignment(const MachineFunction &MF) const override;
+ int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
+ int Idx) const override;
+ bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
void materializeFrameBaseRegister(MachineBasicBlock *MBB,
unsigned BaseReg, int FrameIdx,
- int64_t Offset) const;
- void resolveFrameIndex(MachineBasicBlock::iterator I,
- unsigned BaseReg, int64_t Offset) const;
- bool isFrameOffsetLegal(const MachineInstr *MI, int64_t Offset) const;
+ int64_t Offset) const override;
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const override;
+ bool isFrameOffsetLegal(const MachineInstr *MI,
+ int64_t Offset) const override;
bool cannotEliminateFrame(const MachineFunction &MF) const;
// Debug information queries.
- unsigned getFrameRegister(const MachineFunction &MF) const;
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
unsigned getBaseRegister() const { return BasePtr; }
bool isLowRegister(unsigned Reg) const;
@@ -164,25 +168,25 @@ public:
/// specified immediate.
virtual void emitLoadConstPool(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- DebugLoc dl,
- unsigned DestReg, unsigned SubIdx,
- int Val,
- ARMCC::CondCodes Pred = ARMCC::AL,
+ DebugLoc dl, unsigned DestReg, unsigned SubIdx,
+ int Val, ARMCC::CondCodes Pred = ARMCC::AL,
unsigned PredReg = 0,
unsigned MIFlags = MachineInstr::NoFlags)const;
/// Code Generation virtual methods...
- virtual bool requiresRegisterScavenging(const MachineFunction &MF) const;
+ bool mayOverrideLocalAssignment() const override;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override;
- virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
- virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
- virtual bool requiresVirtualBaseRegisters(const MachineFunction &MF) const;
+ bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
- virtual void eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, unsigned FIOperandNum,
- RegScavenger *RS = NULL) const;
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = NULL) const override;
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMBuildAttrs.h b/lib/Target/ARM/ARMBuildAttrs.h
deleted file mode 100644
index b16d4ef..0000000
--- a/lib/Target/ARM/ARMBuildAttrs.h
+++ /dev/null
@@ -1,170 +0,0 @@
-//===-- ARMBuildAttrs.h - ARM Build Attributes ------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains enumerations and support routines for ARM build attributes
-// as defined in ARM ABI addenda document (ABI release 2.08).
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __TARGET_ARMBUILDATTRS_H__
-#define __TARGET_ARMBUILDATTRS_H__
-
-namespace llvm {
-namespace ARMBuildAttrs {
-
- enum SpecialAttr {
- // This is for the .cpu asm attr. It translates into one or more
- // AttrType (below) entries in the .ARM.attributes section in the ELF.
- SEL_CPU
- };
-
- enum AttrType {
- // Rest correspond to ELF/.ARM.attributes
- File = 1,
- Section = 2,
- Symbol = 3,
- CPU_raw_name = 4,
- CPU_name = 5,
- CPU_arch = 6,
- CPU_arch_profile = 7,
- ARM_ISA_use = 8,
- THUMB_ISA_use = 9,
- VFP_arch = 10,
- WMMX_arch = 11,
- Advanced_SIMD_arch = 12,
- PCS_config = 13,
- ABI_PCS_R9_use = 14,
- ABI_PCS_RW_data = 15,
- ABI_PCS_RO_data = 16,
- ABI_PCS_GOT_use = 17,
- ABI_PCS_wchar_t = 18,
- ABI_FP_rounding = 19,
- ABI_FP_denormal = 20,
- ABI_FP_exceptions = 21,
- ABI_FP_user_exceptions = 22,
- ABI_FP_number_model = 23,
- ABI_align8_needed = 24,
- ABI_align8_preserved = 25,
- ABI_enum_size = 26,
- ABI_HardFP_use = 27,
- ABI_VFP_args = 28,
- ABI_WMMX_args = 29,
- ABI_optimization_goals = 30,
- ABI_FP_optimization_goals = 31,
- compatibility = 32,
- CPU_unaligned_access = 34,
- FP_HP_extension = 36,
- ABI_FP_16bit_format = 38,
- MPextension_use = 42, // was 70, 2.08 ABI
- DIV_use = 44,
- nodefaults = 64,
- also_compatible_with = 65,
- T2EE_use = 66,
- conformance = 67,
- Virtualization_use = 68,
- MPextension_use_old = 70
- };
-
- // Magic numbers for .ARM.attributes
- enum AttrMagic {
- Format_Version = 0x41
- };
-
- // Legal Values for CPU_arch, (=6), uleb128
- enum CPUArch {
- Pre_v4 = 0,
- v4 = 1, // e.g. SA110
- v4T = 2, // e.g. ARM7TDMI
- v5T = 3, // e.g. ARM9TDMI
- v5TE = 4, // e.g. ARM946E_S
- v5TEJ = 5, // e.g. ARM926EJ_S
- v6 = 6, // e.g. ARM1136J_S
- v6KZ = 7, // e.g. ARM1176JZ_S
- v6T2 = 8, // e.g. ARM1156T2F_S
- v6K = 9, // e.g. ARM1136J_S
- v7 = 10, // e.g. Cortex A8, Cortex M3
- v6_M = 11, // e.g. Cortex M1
- v6S_M = 12, // v6_M with the System extensions
- v7E_M = 13, // v7_M with DSP extensions
- v8 = 14 // v8, AArch32
- };
-
- enum CPUArchProfile { // (=7), uleb128
- Not_Applicable = 0, // pre v7, or cross-profile code
- ApplicationProfile = (0x41), // 'A' (e.g. for Cortex A8)
- RealTimeProfile = (0x52), // 'R' (e.g. for Cortex R4)
- MicroControllerProfile = (0x4D), // 'M' (e.g. for Cortex M3)
- SystemProfile = (0x53) // 'S' Application or real-time profile
- };
-
- // The following have a lot of common use cases
- enum {
- Not_Allowed = 0,
- Allowed = 1,
-
- // Tag_ARM_ISA_use (=8), uleb128
-
- // Tag_THUMB_ISA_use, (=9), uleb128
- AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions)
-
- // Tag_FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10)
- AllowFPv2 = 2, // v2 FP ISA permitted (implies use of the v1 FP ISA)
- AllowFPv3A = 3, // v3 FP ISA permitted (implies use of the v2 FP ISA)
- AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31
- AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA)
- AllowFPv4B = 6, // v4 FP ISA was permitted, but only D0-D15, S0-S31
- AllowFPARMv8A = 7, // Use of the ARM v8-A FP ISA was permitted
- AllowFPARMv8B = 8, // Use of the ARM v8-A FP ISA was permitted, but only D0-D15, S0-S31
-
- // Tag_WMMX_arch, (=11), uleb128
- AllowWMMXv1 = 1, // The user permitted this entity to use WMMX v1
- AllowWMMXv2 = 2, // The user permitted this entity to use WMMX v2
-
- // Tag_Advanced_SIMD_arch, (=12), uleb128
- AllowNeon = 1, // SIMDv1 was permitted
- AllowNeon2 = 2, // SIMDv2 was permitted (Half-precision FP, MAC operations)
- AllowNeonARMv8 = 3, // ARM v8-A SIMD was permitted
-
- // Tag_ABI_FP_denormal, (=20), uleb128
- PreserveFPSign = 2, // sign when flushed-to-zero is preserved
-
- // Tag_ABI_FP_number_model, (=23), uleb128
- AllowRTABI = 2, // numbers, infinities, and one quiet NaN (see [RTABI])
- AllowIEE754 = 3, // this code to use all the IEEE 754-defined FP encodings
-
- // Tag_ABI_HardFP_use, (=27), uleb128
- HardFPImplied = 0, // FP use should be implied by Tag_FP_arch
- HardFPSinglePrecision = 1, // Single-precision only
-
- // Tag_ABI_VFP_args, (=28), uleb128
- BaseAAPCS = 0,
- HardFPAAPCS = 1,
-
- // Tag_FP_HP_extension, (=36), uleb128
- AllowHPFP = 1, // Allow use of Half Precision FP
-
- // Tag_MPextension_use, (=42), uleb128
- AllowMP = 1, // Allow use of MP extensions
-
- // Tag_DIV_use, (=44), uleb128
- AllowDIVIfExists = 0, // Allow hardware divide if available in arch, or no info exists.
- DisallowDIV = 1, // Hardware divide explicitly disallowed
- AllowDIVExt = 2, // Allow hardware divide as optional architecture extension above
- // the base arch specified by Tag_CPU_arch and Tag_CPU_arch_profile.
-
- // Tag_Virtualization_use, (=68), uleb128
- AllowTZ = 1,
- AllowVirtualization = 2,
- AllowTZVirtualization = 3
- };
-
-} // namespace ARMBuildAttrs
-} // namespace llvm
-
-#endif // __TARGET_ARMBUILDATTRS_H__
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 9bea4b2..7cffd82 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -64,6 +64,13 @@ def FastCC_ARM_APCS : CallingConv<[
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
+
+ // CPRCs may be allocated to co-processor registers or the stack - they
+ // may never be allocated to core registers.
+ CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
+ CCIfType<[f64], CCAssignToStackWithShadow<8, 4, [Q0, Q1, Q2, Q3]>>,
+ CCIfType<[v2f64], CCAssignToStackWithShadow<16, 4, [Q0, Q1, Q2, Q3]>>,
+
CCDelegateTo<CC_ARM_APCS>
]>;
@@ -114,10 +121,11 @@ def CC_ARM_AAPCS_Common : CallingConv<[
CCIfType<[i32], CCIf<"ArgFlags.getOrigAlign() != 8",
CCAssignToReg<[R0, R1, R2, R3]>>>,
- CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, R3>>>,
- CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
- CCIfType<[f64], CCAssignToStack<8, 8>>,
- CCIfType<[v2f64], CCAssignToStack<16, 8>>
+ CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, [R0, R1, R2, R3]>>>,
+ CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>,
+ CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
+ CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>,
+ CCIfType<[v2f64], CCAssignToStackWithShadow<16, 8, [Q0, Q1, Q2, Q3]>>
]>;
def RetCC_ARM_AAPCS_Common : CallingConv<[
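
The new FastCC and AAPCS rules above encode one AAPCS constraint in TableGen: floating-point and vector candidates (CPRCs) either land in VFP/NEON registers or on the stack, never in core registers, and once one of them overflows to the stack the Q0-Q3 argument registers are shadowed so later arguments cannot be back-filled into them. A rough model of what a shadowed stack assignment does (plain C++, not the TableGen-generated calling-convention code):

#include <cstdint>
#include <set>
#include <string>
#include <vector>

// Simplified model of calling-convention state: which argument registers are
// still free, and how much stack has been used for overflow arguments.
struct CCStateModel {
  std::set<std::string> UsedRegs;
  uint64_t StackOffset = 0;

  // Hand out the first free register from Regs, if any.
  bool assignToReg(const std::vector<std::string> &Regs, std::string &Out) {
    for (const std::string &R : Regs)
      if (UsedRegs.insert(R).second) { Out = R; return true; }
    return false;
  }

  // Assign to the stack and "shadow" Regs: mark them used so that later
  // arguments cannot be back-filled into registers behind this one.
  uint64_t assignToStackWithShadow(uint64_t Size, uint64_t Align,
                                   const std::vector<std::string> &Shadow) {
    for (const std::string &R : Shadow)
      UsedRegs.insert(R);
    StackOffset = (StackOffset + Align - 1) & ~(Align - 1);
    uint64_t Off = StackOffset;
    StackOffset += Size;
    return Off;
  }
};

int main() {
  CCStateModel CC;
  std::vector<std::string> SRegs = {"S0", "S1", "S2", "S3"};
  std::vector<std::string> QRegs = {"Q0", "Q1", "Q2", "Q3"};
  std::string R;
  CC.assignToReg(SRegs, R);                // first f32 -> S0
  CC.assignToStackWithShadow(4, 4, QRegs); // overflow f32 -> stack
  return CC.assignToReg(QRegs, R) ? 1 : 0; // Q regs are now unavailable
}

The same idea keeps i32 arguments ordered relative to R0-R3 in CC_ARM_AAPCS_Common via the [R0, R1, R2, R3] shadow list.
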
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index 96eb764..7359a11 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -57,7 +57,7 @@ namespace {
bool IsPIC;
bool IsThumb;
- void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineModuleInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -76,9 +76,9 @@ namespace {
/// machine instructions.
uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
- bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "ARM Machine Code Emitter";
}
@@ -88,11 +88,9 @@ namespace {
void emitWordLE(unsigned Binary);
void emitDWordLE(uint64_t Binary);
- void emitConstantToMemory(unsigned CPI, const Constant *CV);
void emitConstPoolInstruction(const MachineInstr &MI);
void emitMOVi32immInstruction(const MachineInstr &MI);
void emitMOVi2piecesInstruction(const MachineInstr &MI);
- void emitLEApcrelInstruction(const MachineInstr &MI);
void emitLEApcrelJTInstruction(const MachineInstr &MI);
void emitPseudoMoveInstruction(const MachineInstr &MI);
void addPCLabel(unsigned LabelID);
@@ -141,8 +139,6 @@ namespace {
void emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI);
- void emitMiscInstruction(const MachineInstr &MI);
-
void emitNEONLaneInstruction(const MachineInstr &MI);
void emitNEONDupInstruction(const MachineInstr &MI);
void emitNEON1RegModImmInstruction(const MachineInstr &MI);
@@ -174,13 +170,7 @@ namespace {
unsigned NEONThumb2V8PostEncoder(const MachineInstr &MI,unsigned Val)
const { return 0; }
unsigned VFPThumb2PostEncoder(const MachineInstr&MI, unsigned Val)
- const {
- if (IsThumb) {
- Val &= 0x0FFFFFFF;
- Val |= 0xE0000000;
- }
- return Val;
- }
+ const { return 0; }
unsigned getAdrLabelOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
unsigned getThumbAdrLabelOpValue(const MachineInstr &MI, unsigned Op)
@@ -217,8 +207,6 @@ namespace {
const { return 0; }
unsigned getThumbAddrModeRegRegOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
- unsigned getT2AddrModeImm12OpValue(const MachineInstr &MI, unsigned Op)
- const { return 0; }
unsigned getT2AddrModeImm8OpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
unsigned getT2Imm8s4OpValue(const MachineInstr &MI, unsigned Op)
@@ -229,8 +217,6 @@ namespace {
const { return 0; }
unsigned getT2AddrModeImm8OffsetOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
- unsigned getT2AddrModeImm12OffsetOpValue(const MachineInstr &MI,unsigned Op)
- const { return 0; }
unsigned getT2AddrModeSORegOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
unsigned getT2SORegOpValue(const MachineInstr &MI, unsigned Op)
@@ -248,10 +234,6 @@ namespace {
const { return 0; }
unsigned getBitfieldInvertedMaskOpValue(const MachineInstr &MI,
unsigned Op) const { return 0; }
- unsigned getSsatBitPosValue(const MachineInstr &MI,
- unsigned Op) const { return 0; }
- uint32_t getLdStmModeOpValue(const MachineInstr &MI, unsigned OpIdx)
- const {return 0; }
uint32_t getLdStSORegOpValue(const MachineInstr &MI, unsigned OpIdx)
const { return 0; }
@@ -276,24 +258,10 @@ namespace {
return Binary;
}
- unsigned getHiLo16ImmOpValue(const MachineInstr &MI, unsigned Op)
- const {
- const MCInstrDesc &MCID = MI.getDesc();
- const MachineOperand &MO = MI.getOperand(Op);
-
- unsigned Reloc = (MCID.Opcode == ARM::MOVi16 ?
- ARM::reloc_arm_movw : ARM::reloc_arm_movt);
-
- if (!MO.isImm()) {
- emitGlobalAddress(MO.getGlobal(), Reloc, true, false);
- return 0;
- }
- unsigned Imm16 = static_cast<unsigned>(MO.getImm());
- return Imm16;
+ unsigned getHiLo16ImmOpValue(const MachineInstr &MI, unsigned Op) const {
+ return 0;
}
- uint32_t getAddrMode2OpValue(const MachineInstr &MI, unsigned OpIdx)
- const { return 0;}
uint32_t getAddrMode2OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
const { return 0;}
uint32_t getPostIdxRegOpValue(const MachineInstr &MI, unsigned OpIdx)
@@ -304,17 +272,14 @@ namespace {
const { return 0; }
uint32_t getAddrModeThumbSPOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
- uint32_t getAddrModeSOpValue(const MachineInstr &MI, unsigned Op)
- const { return 0; }
uint32_t getAddrModeISOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
uint32_t getAddrModePCOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
uint32_t getAddrMode5OpValue(const MachineInstr &MI, unsigned Op) const {
- // {12-9} = reg
- // {8} = (U)nsigned (add == '1', sub == '0')
- // {7-0} = imm8
- uint32_t Binary = 0;
+ // {17-13} = reg
+ // {12} = (U)nsigned (add == '1', sub == '0')
+ // {11-0} = imm12
const MachineOperand &MO = MI.getOperand(Op);
const MachineOperand &MO1 = MI.getOperand(Op + 1);
if (!MO.isReg()) {
@@ -336,8 +301,10 @@ namespace {
isAdd = false;
}
- // If immediate offset is omitted, default to +0.
- Binary |= 1 << 8;
+ uint32_t Binary = Imm12 & 0xfff;
+ if (isAdd)
+ Binary |= (1 << 12);
+ Binary |= (Reg << 13);
return Binary;
}
unsigned getNEONVcvtImm32OpValue(const MachineInstr &MI, unsigned Op)
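
The getAddrMode5OpValue hunk above repacks the operand value: the register encoding now sits in bits 17-13, the add/subtract (U) flag in bit 12, and a 12-bit immediate in bits 11-0, replacing the old reg/U/imm8 layout. A standalone sketch of that packing, using plain integers in place of MachineOperands:

#include <cassert>
#include <cstdint>

// Pack an addrmode5-style operand value:
//   {17-13} = register encoding, {12} = U (1 = add, 0 = subtract),
//   {11-0}  = 12-bit immediate offset.
static uint32_t packAddrMode5(uint32_t RegEnc, bool IsAdd, uint32_t Imm12) {
  assert(RegEnc < 32 && Imm12 < 4096 && "operand out of range");
  uint32_t Binary = Imm12 & 0xfff;
  if (IsAdd)
    Binary |= 1u << 12;
  Binary |= RegEnc << 13;
  return Binary;
}

int main() {
  // Register 13 (SP), add, offset 8 -> 0b01101'1'000000001000 = 0x1B008.
  return packAddrMode5(13, true, 8) == 0x1B008 ? 0 : 1;
}
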
@@ -476,9 +443,6 @@ unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI,
return II->getRegisterInfo().getEncodingValue(MO.getReg());
else if (MO.isImm())
return static_cast<unsigned>(MO.getImm());
- else if (MO.isFPImm())
- return static_cast<unsigned>(MO.getFPImm()->getValueAPF()
- .bitcastToAPInt().getHiBits(32).getLimitedValue());
else if (MO.isGlobal())
emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true, false);
else if (MO.isSymbol())
@@ -631,9 +595,7 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) {
case ARMII::VFPLdStMulFrm:
emitVFPLoadStoreMultipleInstruction(MI);
break;
- case ARMII::VFPMiscFrm:
- emitMiscInstruction(MI);
- break;
+
// NEON instructions.
case ARMII::NGetLnFrm:
case ARMII::NSetLnFrm:
@@ -655,56 +617,6 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) {
MCE.processDebugLoc(MI.getDebugLoc(), false);
}
-void ARMCodeEmitter::emitConstantToMemory(unsigned CPI, const Constant *C) {
- DEBUG({
- errs() << " ** Constant pool #" << CPI << " @ "
- << (void*)MCE.getCurrentPCValue() << " ";
- if (const Function *F = dyn_cast<Function>(C))
- errs() << F->getName();
- else
- errs() << *C;
- errs() << '\n';
- });
-
- switch (C->getValueID()) {
- default: {
- llvm_unreachable("Unable to handle this constantpool entry!");
- break;
- }
- case Value::GlobalVariableVal: {
- emitGlobalAddress(static_cast<const GlobalValue*>(C),
- ARM::reloc_arm_absolute, isa<Function>(C), false);
- emitWordLE(0);
- break;
- }
- case Value::ConstantIntVal: {
- const ConstantInt *CI = static_cast<const ConstantInt*>(C);
- uint32_t Val = *(uint32_t*)CI->getValue().getRawData();
- emitWordLE(Val);
- break;
- }
- case Value::ConstantFPVal: {
- const ConstantFP *CFP = static_cast<const ConstantFP*>(C);
- if (CFP->getType()->isFloatTy())
- emitWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
- else if (CFP->getType()->isDoubleTy())
- emitDWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
- else {
- llvm_unreachable("Unable to handle this constantpool entry!");
- }
- break;
- }
- case Value::ConstantArrayVal: {
- const ConstantArray *CA = static_cast<const ConstantArray*>(C);
- for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i)
- emitConstantToMemory(CPI, CA->getOperand(i));
- break;
- }
- }
-
- return;
-}
-
void ARMCodeEmitter::emitConstPoolInstruction(const MachineInstr &MI) {
unsigned CPI = MI.getOperand(0).getImm(); // CP instruction index.
unsigned CPIndex = MI.getOperand(1).getIndex(); // Actual cp entry index.
@@ -736,7 +648,35 @@ void ARMCodeEmitter::emitConstPoolInstruction(const MachineInstr &MI) {
}
emitWordLE(0);
} else {
- emitConstantToMemory(CPI, MCPE.Val.ConstVal);
+ const Constant *CV = MCPE.Val.ConstVal;
+
+ DEBUG({
+ errs() << " ** Constant pool #" << CPI << " @ "
+ << (void*)MCE.getCurrentPCValue() << " ";
+ if (const Function *F = dyn_cast<Function>(CV))
+ errs() << F->getName();
+ else
+ errs() << *CV;
+ errs() << '\n';
+ });
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
+ emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV), false);
+ emitWordLE(0);
+ } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+ uint32_t Val = uint32_t(*CI->getValue().getRawData());
+ emitWordLE(Val);
+ } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+ if (CFP->getType()->isFloatTy())
+ emitWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+ else if (CFP->getType()->isDoubleTy())
+ emitDWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+ else {
+ llvm_unreachable("Unable to handle this constantpool entry!");
+ }
+ } else {
+ llvm_unreachable("Unable to handle this constantpool entry!");
+ }
}
}
@@ -818,32 +758,6 @@ void ARMCodeEmitter::emitMOVi2piecesInstruction(const MachineInstr &MI) {
emitWordLE(Binary);
}
-void ARMCodeEmitter::emitLEApcrelInstruction(const MachineInstr &MI) {
- // It's basically add r, pc, (LCPI - $+8)
- const MCInstrDesc &MCID = MI.getDesc();
-
- unsigned Binary = 0;
-
- // Set the conditional execution predicate
- Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
- // Encode S bit if MI modifies CPSR.
- Binary |= getAddrModeSBit(MI, MCID);
-
- // Encode Rd.
- Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
-
- // Encode Rn which is PC.
- Binary |= II->getRegisterInfo().getEncodingValue(ARM::PC) << ARMII::RegRnShift;
-
- // Encode the displacement which is a so_imm.
- // Set bit I(25) to identify this is the immediate form of <shifter_op>
- Binary |= 1 << ARMII::I_BitShift;
- emitConstPoolAddress(MI.getOperand(1).getIndex(), ARM::reloc_arm_so_imm_cp_entry);
-
- emitWordLE(Binary);
-}
-
void ARMCodeEmitter::emitLEApcrelJTInstruction(const MachineInstr &MI) {
// It's basically add r, pc, (LJTI - $+8)
@@ -921,14 +835,6 @@ void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) {
switch (Opcode) {
default:
llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction");
- case ARM::B:
- emitBranchInstruction(MI);
- break;
- case ARM::BR_JTr:
- case ARM::BR_JTm:
- case ARM::BR_JTadd:
- emitMiscBranchInstruction(MI);
- break;
case ARM::BX_CALL:
case ARM::BMOVPCRX_CALL: {
// First emit mov lr, pc
@@ -948,7 +854,8 @@ void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) {
}
break;
}
- case TargetOpcode::PROLOG_LABEL:
+ case TargetOpcode::CFI_INSTRUCTION:
+ break;
case TargetOpcode::EH_LABEL:
MCE.emitLabel(MI.getOperand(0).getMCSymbol());
break;
@@ -959,9 +866,6 @@ void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) {
case ARM::CONSTPOOL_ENTRY:
emitConstPoolInstruction(MI);
break;
- case ARM::LDMIA_RET:
- emitLoadStoreMultipleInstruction(MI);
- break;
case ARM::PICADD: {
// Remember of the address of the PC label for relocation later.
addPCLabel(MI.getOperand(2).getImm());
@@ -997,10 +901,7 @@ void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) {
else
emitMOVi2piecesInstruction(MI);
break;
- case ARM::LEApcrel:
- // Materialize constantpool index address.
- emitLEApcrelInstruction(MI);
- break;
+
case ARM::LEApcrelJT:
// Materialize jumptable address.
emitLEApcrelJTInstruction(MI);
@@ -1101,11 +1002,6 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI,
// Part of binary is determined by TableGn.
unsigned Binary = getBinaryCodeForInstr(MI);
- if (MCID.Opcode == ARM::MOVi16 || MCID.Opcode == ARM::MOVTi16) {
- emitWordLE(Binary);
- return;
- }
-
// Set the conditional execution predicate
Binary |= II->getPredicate(&MI) << ARMII::CondShift;
@@ -1208,17 +1104,11 @@ void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI,
// If this is an LDRi12, STRi12 or LDRcp, nothing more needs be done.
if (MI.getOpcode() == ARM::LDRi12 || MI.getOpcode() == ARM::LDRcp ||
- MI.getOpcode() == ARM::STRi12 || MI.getOpcode() == ARM::LDRBi12 ||
- MI.getOpcode() == ARM::STRBi12) {
+ MI.getOpcode() == ARM::STRi12) {
emitWordLE(Binary);
return;
}
- if (MI.getOpcode() == ARM::BR_JTm)
- Binary = 0x710F000;
- else if (MI.getOpcode() == ARM::BR_JTr)
- Binary = 0x1A0F000;
-
// Set the conditional execution predicate
Binary |= II->getPredicate(&MI) << ARMII::CondShift;
@@ -1374,11 +1264,6 @@ void ARMCodeEmitter::emitLoadStoreMultipleInstruction(const MachineInstr &MI) {
// Part of binary is determined by TableGn.
unsigned Binary = getBinaryCodeForInstr(MI);
- if (MCID.getOpcode() == ARM::LDMIA_RET) {
- IsUpdating = true;
- Binary |= 0x8B00000;
- }
-
// Set the conditional execution predicate
Binary |= II->getPredicate(&MI) << ARMII::CondShift;
@@ -1586,10 +1471,6 @@ void ARMCodeEmitter::emitBranchInstruction(const MachineInstr &MI) {
// Part of binary is determined by TableGn.
unsigned Binary = getBinaryCodeForInstr(MI);
- if (MCID.Opcode == ARM::B) {
- Binary = 0xEA000000;
- }
-
// Set the conditional execution predicate
Binary |= II->getPredicate(&MI) << ARMII::CondShift;
@@ -1664,10 +1545,9 @@ unsigned ARMCodeEmitter::encodeVFPRd(const MachineInstr &MI,
unsigned Binary = 0;
bool isSPVFP = ARM::SPRRegClass.contains(RegD);
RegD = II->getRegisterInfo().getEncodingValue(RegD);
- if (!isSPVFP) {
- Binary |= (RegD & 0x0F) << ARMII::RegRdShift;
- Binary |= ((RegD & 0x10) >> 4) << ARMII::D_BitShift;
- } else {
+ if (!isSPVFP)
+ Binary |= RegD << ARMII::RegRdShift;
+ else {
Binary |= ((RegD & 0x1E) >> 1) << ARMII::RegRdShift;
Binary |= (RegD & 0x01) << ARMII::D_BitShift;
}
@@ -1680,10 +1560,9 @@ unsigned ARMCodeEmitter::encodeVFPRn(const MachineInstr &MI,
unsigned Binary = 0;
bool isSPVFP = ARM::SPRRegClass.contains(RegN);
RegN = II->getRegisterInfo().getEncodingValue(RegN);
- if (!isSPVFP) {
- Binary |= (RegN & 0x0F) << ARMII::RegRnShift;
- Binary |= ((RegN & 0x10) >> 4) << ARMII::N_BitShift;
- } else {
+ if (!isSPVFP)
+ Binary |= RegN << ARMII::RegRnShift;
+ else {
Binary |= ((RegN & 0x1E) >> 1) << ARMII::RegRnShift;
Binary |= (RegN & 0x01) << ARMII::N_BitShift;
}
@@ -1696,10 +1575,9 @@ unsigned ARMCodeEmitter::encodeVFPRm(const MachineInstr &MI,
unsigned Binary = 0;
bool isSPVFP = ARM::SPRRegClass.contains(RegM);
RegM = II->getRegisterInfo().getEncodingValue(RegM);
- if (!isSPVFP) {
- Binary |= (RegM & 0x0F);
- Binary |= ((RegM & 0x10) >> 4) << ARMII::M_BitShift;
- } else {
+ if (!isSPVFP)
+ Binary |= RegM;
+ else {
Binary |= ((RegM & 0x1E) >> 1);
Binary |= (RegM & 0x01) << ARMII::M_BitShift;
}
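
The three encodeVFPRd/Rn/Rm hunks above deal with how a five-bit VFP register encoding is split between the four-bit register field and the extra D/N/M bit: a single-precision register contributes bits 4-1 to the field and bit 0 to the extra bit, while the removed double-precision path put bits 3-0 in the field and bit 4 in the extra bit (the new code shifts the whole encoding and asserts the extra bits stay clear). A minimal standalone version of the classic split, with made-up shift positions standing in for the ARMII constants:

#include <cstdint>

// Illustrative shift positions only; the real values live in ARMII.
constexpr unsigned RegRdShift = 12;
constexpr unsigned DBitShift = 22;

// Split a VFP register encoding across the Rd field and the D bit.
//   SPR (S0-S31): field = reg[4:1], D bit = reg[0]
//   DPR (D0-D31): field = reg[3:0], D bit = reg[4]
static uint32_t encodeVFPRd(unsigned RegEnc, bool IsSinglePrecision) {
  uint32_t Binary = 0;
  if (IsSinglePrecision) {
    Binary |= ((RegEnc & 0x1E) >> 1) << RegRdShift;
    Binary |= (RegEnc & 0x01) << DBitShift;
  } else {
    Binary |= (RegEnc & 0x0F) << RegRdShift;
    Binary |= ((RegEnc & 0x10) >> 4) << DBitShift;
  }
  return Binary;
}

int main() {
  // S3 and D17 both encode as field 1 with the extra bit set.
  return (encodeVFPRd(3, true) == encodeVFPRd(17, false)) ? 0 : 1;
}
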
@@ -1716,6 +1594,9 @@ void ARMCodeEmitter::emitVFPArithInstruction(const MachineInstr &MI) {
Binary |= II->getPredicate(&MI) << ARMII::CondShift;
unsigned OpIdx = 0;
+ assert((Binary & ARMII::D_BitShift) == 0 &&
+ (Binary & ARMII::N_BitShift) == 0 &&
+ (Binary & ARMII::M_BitShift) == 0 && "VFP encoding bug!");
// Encode Dd / Sd.
Binary |= encodeVFPRd(MI, OpIdx++);
@@ -1805,12 +1686,6 @@ void ARMCodeEmitter::emitVFPLoadStoreInstruction(const MachineInstr &MI) {
// Set the conditional execution predicate
Binary |= II->getPredicate(&MI) << ARMII::CondShift;
- if (MI.getOpcode() == ARM::VLDRS || MI.getOpcode() == ARM::VLDRD ||
- MI.getOpcode() == ARM::VSTRS || MI.getOpcode() == ARM::VSTRD){
- emitWordLE(Binary);
- return;
- }
-
unsigned OpIdx = 0;
// Encode Dd / Sd.
@@ -1886,26 +1761,6 @@ ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) {
emitWordLE(Binary);
}
-void ARMCodeEmitter::emitMiscInstruction(const MachineInstr &MI) {
- unsigned Opcode = MI.getDesc().Opcode;
- // Part of binary is determined by TableGn.
- unsigned Binary = getBinaryCodeForInstr(MI);
-
- if (Opcode == ARM::FCONSTS) {
- unsigned Imm = getMachineOpValue(MI, 1);
- Binary &= ~(0x780000 >> 19);
- Binary |= (Imm & 0x780000) >> 19;
- Binary &= ~(0x3800000 >> 7);
- Binary |= (Imm & 0x3800000) >> 7;
- Binary = VFPThumb2PostEncoder(MI, Binary);
- }
-
- // Set the conditional execution predicate
- Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
- emitWordLE(Binary);
-}
-
unsigned ARMCodeEmitter::encodeNEONRd(const MachineInstr &MI,
unsigned OpIdx) const {
unsigned RegD = MI.getOperand(OpIdx).getReg();
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index cff5ce2..ba05171 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -266,9 +266,9 @@ namespace {
static char ID;
ARMConstantIslands() : MachineFunctionPass(ID) {}
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "ARM constant island placement and branch shortening pass";
}
@@ -569,10 +569,10 @@ static bool BBHasFallthrough(MachineBasicBlock *MBB) {
// Get the next machine basic block in the function.
MachineFunction::iterator MBBI = MBB;
// Can't fall off end of function.
- if (llvm::next(MBBI) == MBB->getParent()->end())
+ if (std::next(MBBI) == MBB->getParent()->end())
return false;
- MachineBasicBlock *NextBB = llvm::next(MBBI);
+ MachineBasicBlock *NextBB = std::next(MBBI);
for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
E = MBB->succ_end(); I != E; ++I)
if (*I == NextBB)
@@ -917,7 +917,7 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
CompareMBBNumbers);
MachineBasicBlock* WaterBB = *IP;
if (WaterBB == OrigBB)
- WaterList.insert(llvm::next(IP), NewBB);
+ WaterList.insert(std::next(IP), NewBB);
else
WaterList.insert(IP, OrigBB);
NewWaterList.insert(OrigBB);
@@ -1188,7 +1188,7 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
return false;
unsigned BestGrowth = ~0u;
- for (water_iterator IP = prior(WaterList.end()), B = WaterList.begin();;
+ for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
--IP) {
MachineBasicBlock* WaterBB = *IP;
// Check if water is in range and is either at a lower address than the
@@ -1249,7 +1249,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
if (isOffsetInRange(UserOffset, CPEOffset, U)) {
DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
<< format(", expected CPE offset %#x\n", CPEOffset));
- NewMBB = llvm::next(MachineFunction::iterator(UserMBB));
+ NewMBB = std::next(MachineFunction::iterator(UserMBB));
// Add an unconditional branch from UserMBB to fallthrough block. Record
// it for branch lengthening; this new branch will not get out of range,
// but if the preceding conditional branch is out of range, the targets
@@ -1320,8 +1320,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
MachineInstr *LastIT = 0;
for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
Offset < BaseInsertOffset;
- Offset += TII->GetInstSizeInBytes(MI),
- MI = llvm::next(MI)) {
+ Offset += TII->GetInstSizeInBytes(MI), MI = std::next(MI)) {
assert(MI != UserMBB->end() && "Fell off end of block");
if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
CPUser &U = CPUsers[CPUIndex];
@@ -1393,7 +1392,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
NewWaterList.insert(NewIsland);
// The new CPE goes before the following block (NewMBB).
- NewMBB = llvm::next(MachineFunction::iterator(WaterBB));
+ NewMBB = std::next(MachineFunction::iterator(WaterBB));
} else {
// No water found.
@@ -1405,7 +1404,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
// next iteration for constant pools, but in this context, we don't want
// it. Check for this so it will be removed from the WaterList.
// Also remove any entry from NewWaterList.
- MachineBasicBlock *WaterBB = prior(MachineFunction::iterator(NewMBB));
+ MachineBasicBlock *WaterBB = std::prev(MachineFunction::iterator(NewMBB));
IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
if (IP != WaterList.end())
NewWaterList.erase(WaterBB);
@@ -1443,7 +1442,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
// Increase the size of the island block to account for the new entry.
BBInfo[NewIsland->getNumber()].Size += Size;
- adjustBBOffsetsAfter(llvm::prior(MachineFunction::iterator(NewIsland)));
+ adjustBBOffsetsAfter(std::prev(MachineFunction::iterator(NewIsland)));
// Finally, change the CPI in the instruction operand to be ID.
for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
@@ -1592,7 +1591,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
++NumCBrFixed;
if (BMI != MI) {
- if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) &&
+ if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
BMI->getOpcode() == Br.UncondBr) {
// Last MI in the BB is an unconditional branch. Can we simply invert the
// condition and swap destinations:
@@ -1622,7 +1621,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
MBB->back().eraseFromParent();
// BBInfo[SplitBB].Offset is wrong temporarily, fixed below
}
- MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
+ MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber()
<< " also invert condition and change dest. to BB#"
@@ -2017,7 +2016,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
SmallVector<MachineOperand, 4> Cond;
SmallVector<MachineOperand, 4> CondPrior;
MachineFunction::iterator BBi = BB;
- MachineFunction::iterator OldPrior = prior(BBi);
+ MachineFunction::iterator OldPrior = std::prev(BBi);
// If the block terminator isn't analyzable, don't try to move the block
bool B = TII->AnalyzeBranch(*BB, TBB, FBB, Cond);
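
Most of the ARMConstantIslandPass changes above are mechanical: the LLVM-local llvm::next and llvm::prior iterator helpers give way to the C++11 standard ones. For reference, a tiny standalone use of std::next and std::prev (a plain std::list here rather than the MachineFunction block list):

#include <iterator>
#include <list>

int main() {
  std::list<int> Blocks = {10, 20, 30};
  auto It = Blocks.begin();
  auto Next = std::next(It);           // points at 20
  auto Last = std::prev(Blocks.end()); // points at 30
  return (*Next == 20 && *Last == 30) ? 0 : 1;
}
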
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index 7ae7bf4..c7a8415 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -103,12 +103,12 @@ public:
bool isLSDA() const { return Kind == ARMCP::CPLSDA; }
bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; }
- virtual unsigned getRelocationInfo() const { return 2; }
+ unsigned getRelocationInfo() const override { return 2; }
- virtual int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment);
+ int getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment) override;
- virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+ void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
/// hasSameValue - Return true if this ARM constpool value can share the same
/// constantpool entry as another ARM constpool value.
@@ -120,7 +120,7 @@ public:
this->Modifier == A->Modifier;
}
- virtual void print(raw_ostream &O) const;
+ void print(raw_ostream &O) const override;
void print(raw_ostream *O) const { if (O) print(*O); }
void dump() const;
};
@@ -164,16 +164,16 @@ public:
const GlobalValue *getGV() const;
const BlockAddress *getBlockAddress() const;
- virtual int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment);
+ int getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment) override;
/// hasSameValue - Return true if this ARM constpool value can share the same
/// constantpool entry as another ARM constpool value.
- virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+ bool hasSameValue(ARMConstantPoolValue *ACPV) override;
- virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+ void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
- virtual void print(raw_ostream &O) const;
+ void print(raw_ostream &O) const override;
static bool classof(const ARMConstantPoolValue *APV) {
return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA();
}
@@ -198,16 +198,16 @@ public:
const char *getSymbol() const { return S.c_str(); }
- virtual int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment);
+ int getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment) override;
- virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+ void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
/// hasSameValue - Return true if this ARM constpool value can share the same
/// constantpool entry as another ARM constpool value.
- virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+ bool hasSameValue(ARMConstantPoolValue *ACPV) override;
- virtual void print(raw_ostream &O) const;
+ void print(raw_ostream &O) const override;
static bool classof(const ARMConstantPoolValue *ACPV) {
return ACPV->isExtSymbol();
@@ -234,16 +234,16 @@ public:
const MachineBasicBlock *getMBB() const { return MBB; }
- virtual int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment);
+ int getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment) override;
- virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+ void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
/// hasSameValue - Return true if this ARM constpool value can share the same
/// constantpool entry as another ARM constpool value.
- virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+ bool hasSameValue(ARMConstantPoolValue *ACPV) override;
- virtual void print(raw_ostream &O) const;
+ void print(raw_ostream &O) const override;
static bool classof(const ARMConstantPoolValue *ACPV) {
return ACPV->isMachineBasicBlock();
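
The ARMConstantPoolValue.h hunks apply the same C++11 cleanup seen throughout the commit: 'virtual' is dropped from overriding declarations and 'override' is added, so a signature that no longer matches a base-class virtual becomes a compile error rather than a silent new virtual. A minimal illustration of the pattern:

#include <cstdio>

struct Base {
  virtual void print() const { std::puts("base"); }
  virtual ~Base() = default;
};

struct Derived : Base {
  // 'override' makes a signature mismatch (e.g. dropping 'const') a
  // compile-time error instead of silently introducing a new virtual.
  void print() const override { std::puts("derived"); }
};

int main() {
  Derived D;
  const Base &B = D;
  B.print(); // prints "derived"
  return 0;
}
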
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index e6f7f86..bd4ee44 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -18,11 +18,13 @@
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove!
#include "llvm/Target/TargetFrameLowering.h"
@@ -44,9 +46,9 @@ namespace {
const ARMSubtarget *STI;
ARMFunctionInfo *AFI;
- virtual bool runOnMachineFunction(MachineFunction &Fn);
+ bool runOnMachineFunction(MachineFunction &Fn) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "ARM pseudo instruction expansion pass";
}
@@ -136,7 +138,9 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, true, true, EvenDblSpc, 1, 8 ,true},
{ ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, false, SingleSpc, 4, 1 ,false},
+{ ARM::VLD1d64QPseudoWB_fixed, ARM::VLD1d64Qwb_fixed, true, true, false, SingleSpc, 4, 1 ,false},
{ ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, false, SingleSpc, 3, 1 ,false},
+{ ARM::VLD1d64TPseudoWB_fixed, ARM::VLD1d64Twb_fixed, true, true, false, SingleSpc, 3, 1 ,false},
{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true},
{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true},
@@ -477,6 +481,8 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg.
MIB->addRegisterKilled(SrcReg, TRI, true);
+ else if (!SrcIsUndef)
+ MIB.addReg(SrcReg, RegState::Implicit); // Add implicit uses for src reg.
TransferImpOps(MI, MIB, MIB);
// Transfer memoperands.
@@ -602,8 +608,8 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
MIB.addOperand(MI.getOperand(OpIdx++));
MIB.addOperand(MI.getOperand(OpIdx++));
- if (SrcIsKill) // Add an implicit kill for the super-reg.
- MIB->addRegisterKilled(SrcReg, TRI, true);
+ // Add an implicit kill and use for the super-reg.
+ MIB.addReg(SrcReg, RegState::Implicit | getKillRegState(SrcIsKill));
TransferImpOps(MI, MIB, MIB);
MI.eraseFromParent();
}
@@ -898,10 +904,61 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
return true;
}
- case ARM::MOV_ga_dyn:
+ case ARM::LDRLIT_ga_abs:
+ case ARM::LDRLIT_ga_pcrel:
+ case ARM::LDRLIT_ga_pcrel_ldr:
+ case ARM::tLDRLIT_ga_abs:
+ case ARM::tLDRLIT_ga_pcrel: {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ const MachineOperand &MO1 = MI.getOperand(1);
+ const GlobalValue *GV = MO1.getGlobal();
+ bool IsARM =
+ Opcode != ARM::tLDRLIT_ga_pcrel && Opcode != ARM::tLDRLIT_ga_abs;
+ bool IsPIC =
+ Opcode != ARM::LDRLIT_ga_abs && Opcode != ARM::tLDRLIT_ga_abs;
+ unsigned LDRLITOpc = IsARM ? ARM::LDRi12 : ARM::tLDRpci;
+ unsigned PICAddOpc =
+ IsARM
+ ? (Opcode == ARM::LDRLIT_ga_pcrel_ldr ? ARM::PICADD : ARM::PICLDR)
+ : ARM::tPICADD;
+
+ // We need a new const-pool entry to load from.
+ MachineConstantPool *MCP = MBB.getParent()->getConstantPool();
+ unsigned ARMPCLabelIndex = 0;
+ MachineConstantPoolValue *CPV;
+
+ if (IsPIC) {
+ unsigned PCAdj = IsARM ? 8 : 4;
+ ARMPCLabelIndex = AFI->createPICLabelUId();
+ CPV = ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex,
+ ARMCP::CPValue, PCAdj);
+ } else
+ CPV = ARMConstantPoolConstant::Create(GV, ARMCP::no_modifier);
+
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LDRLITOpc), DstReg)
+ .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
+ if (IsARM)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
+
+ if (IsPIC) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(PICAddOpc))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addImm(ARMPCLabelIndex);
+
+ if (IsARM)
+ AddDefaultPred(MIB);
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
case ARM::MOV_ga_pcrel:
case ARM::MOV_ga_pcrel_ldr:
- case ARM::t2MOV_ga_dyn:
case ARM::t2MOV_ga_pcrel: {
// Expand into movw + movw. Also "add pc" / ldr [pc] in PIC mode.
unsigned LabelId = AFI->createPICLabelUId();
@@ -910,14 +967,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
const MachineOperand &MO1 = MI.getOperand(1);
const GlobalValue *GV = MO1.getGlobal();
unsigned TF = MO1.getTargetFlags();
- bool isARM = (Opcode != ARM::t2MOV_ga_pcrel && Opcode!=ARM::t2MOV_ga_dyn);
- bool isPIC = (Opcode != ARM::MOV_ga_dyn && Opcode != ARM::t2MOV_ga_dyn);
+ bool isARM = Opcode != ARM::t2MOV_ga_pcrel;
unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel;
unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel :ARM::t2MOVTi16_ga_pcrel;
- unsigned LO16TF = isPIC
- ? ARMII::MO_LO16_NONLAZY_PIC : ARMII::MO_LO16_NONLAZY;
- unsigned HI16TF = isPIC
- ? ARMII::MO_HI16_NONLAZY_PIC : ARMII::MO_HI16_NONLAZY;
+ unsigned LO16TF = TF | ARMII::MO_LO16;
+ unsigned HI16TF = TF | ARMII::MO_HI16;
unsigned PICAddOpc = isARM
? (Opcode == ARM::MOV_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD)
: ARM::tPICADD;
@@ -925,16 +979,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
TII->get(LO16Opc), DstReg)
.addGlobalAddress(GV, MO1.getOffset(), TF | LO16TF)
.addImm(LabelId);
- MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(HI16Opc), DstReg)
+
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc), DstReg)
.addReg(DstReg)
.addGlobalAddress(GV, MO1.getOffset(), TF | HI16TF)
.addImm(LabelId);
- if (!isPIC) {
- TransferImpOps(MI, MIB1, MIB2);
- MI.eraseFromParent();
- return true;
- }
MachineInstrBuilder MIB3 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(PICAddOpc))
@@ -1030,33 +1079,6 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
- case ARM::VDUPfqf:
- case ARM::VDUPfdf:{
- unsigned NewOpc = Opcode == ARM::VDUPfqf ? ARM::VDUPLN32q :
- ARM::VDUPLN32d;
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
- unsigned OpIdx = 0;
- unsigned SrcReg = MI.getOperand(1).getReg();
- unsigned Lane = TRI->getEncodingValue(SrcReg) & 1;
- unsigned DReg = TRI->getMatchingSuperReg(SrcReg,
- Lane & 1 ? ARM::ssub_1 : ARM::ssub_0,
- &ARM::DPR_VFP2RegClass);
- // The lane is [0,1] for the containing DReg superregister.
- // Copy the dst/src register operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addReg(DReg);
- ++OpIdx;
- // Add the lane select operand.
- MIB.addImm(Lane);
- // Add the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
-
- TransferImpOps(MI, MIB, MIB);
- MI.eraseFromParent();
- return true;
- }
case ARM::VLD2q8Pseudo:
case ARM::VLD2q16Pseudo:
@@ -1071,6 +1093,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VLD3d16Pseudo:
case ARM::VLD3d32Pseudo:
case ARM::VLD1d64TPseudo:
+ case ARM::VLD1d64TPseudoWB_fixed:
case ARM::VLD3d8Pseudo_UPD:
case ARM::VLD3d16Pseudo_UPD:
case ARM::VLD3d32Pseudo_UPD:
@@ -1087,6 +1110,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VLD4d16Pseudo:
case ARM::VLD4d32Pseudo:
case ARM::VLD1d64QPseudo:
+ case ARM::VLD1d64QPseudoWB_fixed:
case ARM::VLD4d8Pseudo_UPD:
case ARM::VLD4d16Pseudo_UPD:
case ARM::VLD4d32Pseudo_UPD:
@@ -1249,7 +1273,7 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
while (MBBI != E) {
- MachineBasicBlock::iterator NMBBI = llvm::next(MBBI);
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
Modified |= ExpandMI(MBB, MBBI);
MBBI = NMBBI;
}
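
The new LDRLIT_ga_* expansion above builds the classic ARM position-independent literal idiom: load an offset from a constant pool, then (in PIC mode) add the PC to it, with the pool entry pre-biased by the pipeline offset PCAdj (8 in ARM state, 4 in Thumb). Roughly, the arithmetic works out as in this small sketch (symbol and instruction addresses are invented numbers; in reality they come from relocations and layout):

#include <cstdint>

// For a PIC literal-pool reference, the pool roughly holds
// Sym - (PicAddInsnAddr + PCAdj), where PCAdj is the pipeline offset seen by
// the 'add rD, pc, rD': 8 in ARM state, 4 in Thumb state.
static uint32_t literalValue(uint32_t SymAddr, uint32_t PicAddInsnAddr,
                             bool IsThumb) {
  uint32_t PCAdj = IsThumb ? 4 : 8;
  return SymAddr - (PicAddInsnAddr + PCAdj);
}

int main() {
  uint32_t Sym = 0x2000, PicAdd = 0x1000;
  // Runtime: pc (0x1000 + 8) + pool value reconstructs Sym.
  return ((PicAdd + 8) + literalValue(Sym, PicAdd, false)) == Sym ? 0 : 1;
}
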
diff --git a/lib/Target/ARM/ARMFPUName.def b/lib/Target/ARM/ARMFPUName.def
index 9a1bbe7..1fef3b3 100644
--- a/lib/Target/ARM/ARMFPUName.def
+++ b/lib/Target/ARM/ARMFPUName.def
@@ -28,5 +28,6 @@ ARM_FPU_NAME("neon", NEON)
ARM_FPU_NAME("neon-vfpv4", NEON_VFPV4)
ARM_FPU_NAME("neon-fp-armv8", NEON_FP_ARMV8)
ARM_FPU_NAME("crypto-neon-fp-armv8", CRYPTO_NEON_FP_ARMV8)
+ARM_FPU_NAME("softvfp", SOFTVFP)
#undef ARM_FPU_NAME
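
ARMFPUName.def is an X-macro table: whoever includes it defines ARM_FPU_NAME(NAME, ID) first, and each line, including the newly added "softvfp" entry, expands under that definition; the trailing #undef resets the macro for the next consumer. A generic sketch of the pattern (the table macro and names here are illustrative, not the ones LLVM uses):

#include <cstring>

// Stand-in for the contents of a .def file; in real code the list would live
// in a separate file and be pulled in with #include after defining the macro.
#define FPU_TABLE(X) \
  X("vfpv2", VFPV2)  \
  X("neon", NEON)    \
  X("softvfp", SOFTVFP)

// First expansion: build an enum of IDs.
enum class FPUKind {
#define FPU_ENUM(NAME, ID) ID,
  FPU_TABLE(FPU_ENUM)
#undef FPU_ENUM
  INVALID
};

// Second expansion: map names to IDs.
static FPUKind parseFPU(const char *Name) {
#define FPU_MATCH(NAME, ID) \
  if (std::strcmp(Name, NAME) == 0) return FPUKind::ID;
  FPU_TABLE(FPU_MATCH)
#undef FPU_MATCH
  return FPUKind::INVALID;
}

int main() {
  return parseFPU("softvfp") == FPUKind::SOFTVFP ? 0 : 1;
}
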
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index a4004f3..c442444 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -14,11 +14,12 @@
//===----------------------------------------------------------------------===//
#include "ARM.h"
-#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
+#include "ARMISelLowering.h"
+#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
-#include "ARMTargetMachine.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/Analysis.h"
@@ -30,18 +31,18 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
-#include "llvm/Support/CallSite.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -73,11 +74,12 @@ namespace {
}
} Address;
-class ARMFastISel : public FastISel {
+class ARMFastISel final : public FastISel {
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
/// make the right decision when generating code for different targets.
const ARMSubtarget *Subtarget;
+ Module &M;
const TargetMachine &TM;
const TargetInstrInfo &TII;
const TargetLowering &TLI;
@@ -91,6 +93,7 @@ class ARMFastISel : public FastISel {
explicit ARMFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo)
: FastISel(funcInfo, libInfo),
+ M(const_cast<Module&>(*funcInfo.Fn->getParent())),
TM(funcInfo.MF->getTarget()),
TII(*TM.getInstrInfo()),
TLI(*TM.getTargetLowering()) {
@@ -102,8 +105,6 @@ class ARMFastISel : public FastISel {
// Code from FastISel.cpp.
private:
- unsigned FastEmitInst_(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC);
unsigned FastEmitInst_r(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill);
@@ -120,10 +121,6 @@ class ARMFastISel : public FastISel {
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill,
uint64_t Imm);
- unsigned FastEmitInst_rf(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- const ConstantFP *FPImm);
unsigned FastEmitInst_rri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill,
@@ -132,22 +129,15 @@ class ARMFastISel : public FastISel {
unsigned FastEmitInst_i(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
uint64_t Imm);
- unsigned FastEmitInst_ii(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- uint64_t Imm1, uint64_t Imm2);
-
- unsigned FastEmitInst_extractsubreg(MVT RetVT,
- unsigned Op0, bool Op0IsKill,
- uint32_t Idx);
// Backend specific FastISel code.
private:
- virtual bool TargetSelectInstruction(const Instruction *I);
- virtual unsigned TargetMaterializeConstant(const Constant *C);
- virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI);
- virtual bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
- const LoadInst *LI);
- virtual bool FastLowerArguments();
+ bool TargetSelectInstruction(const Instruction *I) override;
+ unsigned TargetMaterializeConstant(const Constant *C) override;
+ unsigned TargetMaterializeAlloca(const AllocaInst *AI) override;
+ bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI) override;
+ bool FastLowerArguments() override;
private:
#include "ARMGenFastISel.inc"
@@ -302,7 +292,7 @@ unsigned ARMFastISel::constrainOperandRegClass(const MCInstrDesc &II,
// If it's not legal to COPY between the register classes, something
// has gone very wrong before we got here.
unsigned NewOp = createResultReg(RegClass);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), NewOp).addReg(Op));
return NewOp;
}
@@ -310,15 +300,6 @@ unsigned ARMFastISel::constrainOperandRegClass(const MCInstrDesc &II,
return Op;
}
-unsigned ARMFastISel::FastEmitInst_(unsigned MachineInstOpcode,
- const TargetRegisterClass* RC) {
- unsigned ResultReg = createResultReg(RC);
- const MCInstrDesc &II = TII.get(MachineInstOpcode);
-
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg));
- return ResultReg;
-}
-
unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill) {
@@ -329,12 +310,12 @@ unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
// for this instruction.
Op0 = constrainOperandRegClass(II, Op0, 1);
if (II.getNumDefs() >= 1) {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
- .addReg(Op0, Op0IsKill * RegState::Kill));
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II,
+ ResultReg).addReg(Op0, Op0IsKill * RegState::Kill));
} else {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addReg(Op0, Op0IsKill * RegState::Kill));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(II.ImplicitDefs[0]));
}
@@ -354,14 +335,15 @@ unsigned ARMFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
Op1 = constrainOperandRegClass(II, Op1, 2);
if (II.getNumDefs() >= 1) {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
- .addReg(Op0, Op0IsKill * RegState::Kill)
- .addReg(Op1, Op1IsKill * RegState::Kill));
+ AddOptionalDefs(
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(Op0, Op0IsKill * RegState::Kill)
+ .addReg(Op1, Op1IsKill * RegState::Kill));
} else {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addReg(Op0, Op0IsKill * RegState::Kill)
.addReg(Op1, Op1IsKill * RegState::Kill));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(II.ImplicitDefs[0]));
}
@@ -383,16 +365,17 @@ unsigned ARMFastISel::FastEmitInst_rrr(unsigned MachineInstOpcode,
Op2 = constrainOperandRegClass(II, Op1, 3);
if (II.getNumDefs() >= 1) {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
- .addReg(Op0, Op0IsKill * RegState::Kill)
- .addReg(Op1, Op1IsKill * RegState::Kill)
- .addReg(Op2, Op2IsKill * RegState::Kill));
+ AddOptionalDefs(
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(Op0, Op0IsKill * RegState::Kill)
+ .addReg(Op1, Op1IsKill * RegState::Kill)
+ .addReg(Op2, Op2IsKill * RegState::Kill));
} else {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addReg(Op0, Op0IsKill * RegState::Kill)
.addReg(Op1, Op1IsKill * RegState::Kill)
.addReg(Op2, Op2IsKill * RegState::Kill));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(II.ImplicitDefs[0]));
}
@@ -410,39 +393,15 @@ unsigned ARMFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
// for this instruction.
Op0 = constrainOperandRegClass(II, Op0, 1);
if (II.getNumDefs() >= 1) {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
- .addReg(Op0, Op0IsKill * RegState::Kill)
- .addImm(Imm));
+ AddOptionalDefs(
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(Op0, Op0IsKill * RegState::Kill)
+ .addImm(Imm));
} else {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addReg(Op0, Op0IsKill * RegState::Kill)
.addImm(Imm));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(II.ImplicitDefs[0]));
- }
- return ResultReg;
-}
-
-unsigned ARMFastISel::FastEmitInst_rf(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- const ConstantFP *FPImm) {
- unsigned ResultReg = createResultReg(RC);
- const MCInstrDesc &II = TII.get(MachineInstOpcode);
-
- // Make sure the input operand is sufficiently constrained to be legal
- // for this instruction.
- Op0 = constrainOperandRegClass(II, Op0, 1);
- if (II.getNumDefs() >= 1) {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
- .addReg(Op0, Op0IsKill * RegState::Kill)
- .addFPImm(FPImm));
- } else {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
- .addReg(Op0, Op0IsKill * RegState::Kill)
- .addFPImm(FPImm));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(II.ImplicitDefs[0]));
}
@@ -462,16 +421,17 @@ unsigned ARMFastISel::FastEmitInst_rri(unsigned MachineInstOpcode,
Op0 = constrainOperandRegClass(II, Op0, 1);
Op1 = constrainOperandRegClass(II, Op1, 2);
if (II.getNumDefs() >= 1) {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
- .addReg(Op0, Op0IsKill * RegState::Kill)
- .addReg(Op1, Op1IsKill * RegState::Kill)
- .addImm(Imm));
+ AddOptionalDefs(
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(Op0, Op0IsKill * RegState::Kill)
+ .addReg(Op1, Op1IsKill * RegState::Kill)
+ .addImm(Imm));
} else {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addReg(Op0, Op0IsKill * RegState::Kill)
.addReg(Op1, Op1IsKill * RegState::Kill)
.addImm(Imm));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(II.ImplicitDefs[0]));
}
@@ -485,58 +445,25 @@ unsigned ARMFastISel::FastEmitInst_i(unsigned MachineInstOpcode,
const MCInstrDesc &II = TII.get(MachineInstOpcode);
if (II.getNumDefs() >= 1) {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
- .addImm(Imm));
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II,
+ ResultReg).addImm(Imm));
} else {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addImm(Imm));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(II.ImplicitDefs[0]));
}
return ResultReg;
}
-unsigned ARMFastISel::FastEmitInst_ii(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- uint64_t Imm1, uint64_t Imm2) {
- unsigned ResultReg = createResultReg(RC);
- const MCInstrDesc &II = TII.get(MachineInstOpcode);
-
- if (II.getNumDefs() >= 1) {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
- .addImm(Imm1).addImm(Imm2));
- } else {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
- .addImm(Imm1).addImm(Imm2));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(TargetOpcode::COPY),
- ResultReg)
- .addReg(II.ImplicitDefs[0]));
- }
- return ResultReg;
-}
-
-unsigned ARMFastISel::FastEmitInst_extractsubreg(MVT RetVT,
- unsigned Op0, bool Op0IsKill,
- uint32_t Idx) {
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
- assert(TargetRegisterInfo::isVirtualRegister(Op0) &&
- "Cannot yet extract from physregs");
-
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DL, TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(Op0, getKillRegState(Op0IsKill), Idx));
- return ResultReg;
-}
-
// TODO: Don't worry about 64-bit now, but when this is fixed remove the
// checks from the various callers.
unsigned ARMFastISel::ARMMoveToFPReg(MVT VT, unsigned SrcReg) {
if (VT == MVT::f64) return 0;
unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VMOVSR), MoveReg)
.addReg(SrcReg));
return MoveReg;
@@ -546,7 +473,7 @@ unsigned ARMFastISel::ARMMoveToIntReg(MVT VT, unsigned SrcReg) {
if (VT == MVT::i64) return 0;
unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VMOVRS), MoveReg)
.addReg(SrcReg));
return MoveReg;
@@ -572,9 +499,8 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
Opc = ARM::FCONSTS;
}
unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
- DestReg)
- .addImm(Imm));
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), DestReg).addImm(Imm));
return DestReg;
}
@@ -582,20 +508,20 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
if (!Subtarget->hasVFP2()) return false;
// MachineConstantPool wants an explicit alignment.
- unsigned Align = TD.getPrefTypeAlignment(CFP->getType());
+ unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
if (Align == 0) {
// TODO: Figure out if this is correct.
- Align = TD.getTypeAllocSize(CFP->getType());
+ Align = DL.getTypeAllocSize(CFP->getType());
}
unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS;
// The extra reg is for addrmode5.
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
- DestReg)
- .addConstantPoolIndex(Idx)
- .addReg(0));
+ AddOptionalDefs(
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+ .addConstantPoolIndex(Idx)
+ .addReg(0));
return DestReg;
}
@@ -612,7 +538,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass :
&ARM::GPRRegClass;
unsigned ImmReg = createResultReg(RC);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ImmReg)
.addImm(CI->getZExtValue()));
return ImmReg;
@@ -626,7 +552,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
if (UseImm) {
unsigned Opc = isThumb2 ? ARM::t2MVNi : ARM::MVNi;
unsigned ImmReg = createResultReg(TLI.getRegClassFor(MVT::i32));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ImmReg)
.addImm(Imm));
return ImmReg;
@@ -640,24 +566,25 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
// MachineConstantPool wants an explicit alignment.
- unsigned Align = TD.getPrefTypeAlignment(C->getType());
+ unsigned Align = DL.getPrefTypeAlignment(C->getType());
if (Align == 0) {
// TODO: Figure out if this is correct.
- Align = TD.getTypeAllocSize(C->getType());
+ Align = DL.getTypeAllocSize(C->getType());
}
unsigned Idx = MCP.getConstantPoolIndex(C, Align);
if (isThumb2)
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::t2LDRpci), DestReg)
.addConstantPoolIndex(Idx));
- else
+ else {
// The extra immediate is for addrmode2.
DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::LDRcp), DestReg)
.addConstantPoolIndex(Idx)
.addImm(0));
+ }
return DestReg;
}
@@ -673,37 +600,36 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
(const TargetRegisterClass*)&ARM::GPRRegClass;
unsigned DestReg = createResultReg(RC);
- // FastISel TLS support on non-Darwin is broken, punt to SelectionDAG.
+ // FastISel TLS support on non-MachO is broken, punt to SelectionDAG.
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
bool IsThreadLocal = GVar && GVar->isThreadLocal();
- if (!Subtarget->isTargetDarwin() && IsThreadLocal) return 0;
+ if (!Subtarget->isTargetMachO() && IsThreadLocal) return 0;
// Use movw+movt when possible, it avoids constant pool entries.
- // Darwin targets don't support movt with Reloc::Static, see
- // ARMTargetLowering::LowerGlobalAddressDarwin. Other targets only support
- // static movt relocations.
+ // Non-darwin targets only support static movt relocations in FastISel.
if (Subtarget->useMovt() &&
- Subtarget->isTargetDarwin() == (RelocM != Reloc::Static)) {
+ (Subtarget->isTargetMachO() || RelocM == Reloc::Static)) {
unsigned Opc;
+ unsigned char TF = 0;
+ if (Subtarget->isTargetMachO())
+ TF = ARMII::MO_NONLAZY;
+
switch (RelocM) {
case Reloc::PIC_:
Opc = isThumb2 ? ARM::t2MOV_ga_pcrel : ARM::MOV_ga_pcrel;
break;
- case Reloc::DynamicNoPIC:
- Opc = isThumb2 ? ARM::t2MOV_ga_dyn : ARM::MOV_ga_dyn;
- break;
default:
Opc = isThumb2 ? ARM::t2MOVi32imm : ARM::MOVi32imm;
break;
}
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
- DestReg).addGlobalAddress(GV));
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), DestReg).addGlobalAddress(GV, 0, TF));
} else {
// MachineConstantPool wants an explicit alignment.
- unsigned Align = TD.getPrefTypeAlignment(GV->getType());
+ unsigned Align = DL.getPrefTypeAlignment(GV->getType());
if (Align == 0) {
// TODO: Figure out if this is correct.
- Align = TD.getTypeAllocSize(GV->getType());
+ Align = DL.getTypeAllocSize(GV->getType());
}
if (Subtarget->isTargetELF() && RelocM == Reloc::PIC_)
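
The ARMMaterializeGV hunk above prefers movw+movt over a literal-pool load whenever the subtarget and relocation model allow it, because the pair materializes a 32-bit address without a constant-pool entry. Numerically the pair just splits the value into halves, as in this sketch (real global addresses are filled in by relocations rather than computed like this):

#include <cstdint>
#include <utility>

// movw/movt materialize a 32-bit value in two halves:
//   movw rD, #lo16   ; writes the low half, zeroing the top
//   movt rD, #hi16   ; writes the high half, keeping the low half
static std::pair<uint16_t, uint16_t> splitMovwMovt(uint32_t Value) {
  return std::make_pair(static_cast<uint16_t>(Value & 0xffffu),
                        static_cast<uint16_t>(Value >> 16));
}

int main() {
  std::pair<uint16_t, uint16_t> Halves = splitMovwMovt(0x12345678u);
  uint32_t Rebuilt = (uint32_t(Halves.second) << 16) | Halves.first;
  return (Rebuilt == 0x12345678u) ? 0 : 1;
}
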
@@ -722,18 +648,18 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
MachineInstrBuilder MIB;
if (isThumb2) {
unsigned Opc = (RelocM!=Reloc::PIC_) ? ARM::t2LDRpci : ARM::t2LDRpci_pic;
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
- .addConstantPoolIndex(Idx);
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ DestReg).addConstantPoolIndex(Idx);
if (RelocM == Reloc::PIC_)
MIB.addImm(Id);
AddOptionalDefs(MIB);
} else {
// The extra immediate is for addrmode2.
DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp),
- DestReg)
- .addConstantPoolIndex(Idx)
- .addImm(0);
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(ARM::LDRcp), DestReg)
+ .addConstantPoolIndex(Idx)
+ .addImm(0);
AddOptionalDefs(MIB);
if (RelocM == Reloc::PIC_) {
@@ -741,7 +667,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DL, TII.get(Opc), NewDestReg)
+ DbgLoc, TII.get(Opc), NewDestReg)
.addReg(DestReg)
.addImm(Id);
AddOptionalDefs(MIB);
@@ -754,15 +680,15 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
MachineInstrBuilder MIB;
unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
if (isThumb2)
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::t2LDRi12), NewDestReg)
.addReg(DestReg)
.addImm(0);
else
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRi12),
- NewDestReg)
- .addReg(DestReg)
- .addImm(0);
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(ARM::LDRi12), NewDestReg)
+ .addReg(DestReg)
+ .addImm(0);
DestReg = NewDestReg;
AddOptionalDefs(MIB);
}
@@ -802,10 +728,12 @@ unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
// This will get lowered later into the correct offsets and registers
// via rewriteXFrameIndex.
if (SI != FuncInfo.StaticAllocaMap.end()) {
+ unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
const TargetRegisterClass* RC = TLI.getRegClassFor(VT);
unsigned ResultReg = createResultReg(RC);
- unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ ResultReg = constrainOperandRegClass(TII.get(Opc), ResultReg, 0);
+
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg)
.addFrameIndex(SI->second)
.addImm(0));
@@ -889,11 +817,11 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
i != e; ++i, ++GTI) {
const Value *Op = *i;
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
- const StructLayout *SL = TD.getStructLayout(STy);
+ const StructLayout *SL = DL.getStructLayout(STy);
unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
TmpOffset += SL->getElementOffset(Idx);
} else {
- uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
for (;;) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
@@ -979,7 +907,7 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) {
(const TargetRegisterClass*)&ARM::GPRRegClass;
unsigned ResultReg = createResultReg(RC);
unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg)
.addFrameIndex(Addr.Base.FI)
.addImm(0));
@@ -1130,7 +1058,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
if (allocReg)
ResultReg = createResultReg(RC);
assert (ResultReg > 255 && "Expected an allocated virtual register.");
- MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg);
AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad, useAM3);
@@ -1138,7 +1066,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
// load. Now we must move from the GRP to the FP register.
if (needVMOV) {
unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::f32));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VMOVSR), MoveReg)
.addReg(ResultReg));
ResultReg = MoveReg;
@@ -1180,7 +1108,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
(const TargetRegisterClass*)&ARM::GPRRegClass);
unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
SrcReg = constrainOperandRegClass(TII.get(Opc), SrcReg, 1);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), Res)
.addReg(SrcReg).addImm(1));
SrcReg = Res;
@@ -1227,7 +1155,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
// Unaligned stores need special handling. Floats require word-alignment.
if (Alignment && Alignment < 4) {
unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VMOVRS), MoveReg)
.addReg(SrcReg));
SrcReg = MoveReg;
@@ -1252,7 +1180,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
// Create the base instruction, then add the operands.
SrcReg = constrainOperandRegClass(TII.get(StrOpc), SrcReg, 0);
- MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(StrOpc))
.addReg(SrcReg);
AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOStore, useAM3);
@@ -1363,9 +1291,9 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
return false;
unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
.addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR);
- FastEmitBranch(FBB, DL);
+ FastEmitBranch(FBB, DbgLoc);
FuncInfo.MBB->addSuccessor(TBB);
return true;
}
@@ -1376,7 +1304,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
unsigned OpReg = getRegForValue(TI->getOperand(0));
OpReg = constrainOperandRegClass(TII.get(TstOpc), OpReg, 0);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TstOpc))
.addReg(OpReg).addImm(1));
@@ -1387,10 +1315,10 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
}
unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
.addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
- FastEmitBranch(FBB, DL);
+ FastEmitBranch(FBB, DbgLoc);
FuncInfo.MBB->addSuccessor(TBB);
return true;
}
@@ -1398,7 +1326,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
dyn_cast<ConstantInt>(BI->getCondition())) {
uint64_t Imm = CI->getZExtValue();
MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
- FastEmitBranch(Target, DL);
+ FastEmitBranch(Target, DbgLoc);
return true;
}
@@ -1414,8 +1342,10 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
// the one-bit value left in the virtual register.
unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
CmpReg = constrainOperandRegClass(TII.get(TstOpc), CmpReg, 0);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TstOpc))
- .addReg(CmpReg).addImm(1));
+ AddOptionalDefs(
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TstOpc))
+ .addReg(CmpReg)
+ .addImm(1));
unsigned CCMode = ARMCC::NE;
if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
@@ -1424,9 +1354,9 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
}
unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
.addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
- FastEmitBranch(FBB, DL);
+ FastEmitBranch(FBB, DbgLoc);
FuncInfo.MBB->addSuccessor(TBB);
return true;
}
@@ -1436,8 +1366,8 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
if (AddrReg == 0) return false;
unsigned Opc = isThumb2 ? ARM::tBRIND : ARM::BX;
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
- .addReg(AddrReg));
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc)).addReg(AddrReg));
const IndirectBrInst *IB = cast<IndirectBrInst>(I);
for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i)
@@ -1542,11 +1472,11 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
SrcReg1 = constrainOperandRegClass(II, SrcReg1, 0);
if (!UseImm) {
SrcReg2 = constrainOperandRegClass(II, SrcReg2, 1);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addReg(SrcReg1).addReg(SrcReg2));
} else {
MachineInstrBuilder MIB;
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addReg(SrcReg1);
// Only add immediate for icmp as the immediate for fcmp is an implicit 0.0.
@@ -1558,7 +1488,7 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
// For floating point we need to move the result to a comparison register
// that we can then use for branches.
if (Ty->isFloatTy() || Ty->isDoubleTy())
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::FMSTAT)));
return true;
}
@@ -1586,7 +1516,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0);
unsigned ZeroReg = TargetMaterializeConstant(Zero);
// ARMEmitCmp emits a FMSTAT when necessary, so it's always safe to use CPSR.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), DestReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc), DestReg)
.addReg(ZeroReg).addImm(1)
.addImm(ARMPred).addReg(ARM::CPSR);
@@ -1606,7 +1536,7 @@ bool ARMFastISel::SelectFPExt(const Instruction *I) {
if (Op == 0) return false;
unsigned Result = createResultReg(&ARM::DPRRegClass);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VCVTDS), Result)
.addReg(Op));
UpdateValueMap(I, Result);
@@ -1625,7 +1555,7 @@ bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
if (Op == 0) return false;
unsigned Result = createResultReg(&ARM::SPRRegClass);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VCVTSD), Result)
.addReg(Op));
UpdateValueMap(I, Result);
@@ -1670,9 +1600,8 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
else return false;
unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
- ResultReg)
- .addReg(FP));
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg).addReg(FP));
UpdateValueMap(I, ResultReg);
return true;
}
@@ -1697,9 +1626,8 @@ bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
// f64->s32/u32 or f32->s32/u32 both need an intermediate f32 reg.
unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
- ResultReg)
- .addReg(Op));
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg).addReg(Op));
// This result needs to be in an integer register, but the conversion only
// takes place in fp-regs.
@@ -1746,8 +1674,10 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
unsigned CmpOpc = isThumb2 ? ARM::t2CMPri : ARM::CMPri;
CondReg = constrainOperandRegClass(TII.get(CmpOpc), CondReg, 0);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
- .addReg(CondReg).addImm(0));
+ AddOptionalDefs(
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+ .addReg(CondReg)
+ .addImm(0));
unsigned MovCCOpc;
const TargetRegisterClass *RC;
@@ -1765,12 +1695,20 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
if (!UseImm) {
Op2Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op2Reg, 1);
Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 2);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg)
- .addReg(Op2Reg).addReg(Op1Reg).addImm(ARMCC::NE).addReg(ARM::CPSR);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc),
+ ResultReg)
+ .addReg(Op2Reg)
+ .addReg(Op1Reg)
+ .addImm(ARMCC::NE)
+ .addReg(ARM::CPSR);
} else {
Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 1);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg)
- .addReg(Op1Reg).addImm(Imm).addImm(ARMCC::EQ).addReg(ARM::CPSR);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc),
+ ResultReg)
+ .addReg(Op1Reg)
+ .addImm(Imm)
+ .addImm(ARMCC::EQ)
+ .addReg(ARM::CPSR);
}
UpdateValueMap(I, ResultReg);
return true;
@@ -1859,7 +1797,7 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
SrcReg1 = constrainOperandRegClass(TII.get(Opc), SrcReg1, 1);
SrcReg2 = constrainOperandRegClass(TII.get(Opc), SrcReg2, 2);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg)
.addReg(SrcReg1).addReg(SrcReg2));
UpdateValueMap(I, ResultReg);
@@ -1901,7 +1839,7 @@ bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
if (Op2 == 0) return false;
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg)
.addReg(Op1).addReg(Op2));
UpdateValueMap(I, ResultReg);
@@ -2013,7 +1951,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AdjStackDown))
.addImm(NumBytes));
@@ -2058,9 +1996,8 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
// Now copy/store arg to correct locations.
if (VA.isRegLoc() && !VA.needsCustom()) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
- VA.getLocReg())
- .addReg(Arg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
RegArgs.push_back(VA.getLocReg());
} else if (VA.needsCustom()) {
// TODO: We need custom lowering for vector (v2f64) args.
@@ -2072,7 +2009,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"We only handle register args!");
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VMOVRRD), VA.getLocReg())
.addReg(NextVA.getLocReg(), RegState::Define)
.addReg(Arg));
@@ -2099,7 +2036,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
unsigned &NumBytes, bool isVarArg) {
// Issue CALLSEQ_END
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AdjStackUp))
.addImm(NumBytes).addImm(0));
@@ -2116,7 +2053,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
MVT DestVT = RVLocs[0].getValVT();
const TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT);
unsigned ResultReg = createResultReg(DstRC);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VMOVDRR), ResultReg)
.addReg(RVLocs[0].getLocReg())
.addReg(RVLocs[1].getLocReg()));
@@ -2137,7 +2074,8 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
const TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
unsigned ResultReg = createResultReg(DstRC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY),
ResultReg).addReg(RVLocs[0].getLocReg());
UsedRegs.push_back(RVLocs[0].getLocReg());
@@ -2214,15 +2152,15 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
// Avoid a cross-class copy. This is very unlikely.
if (!SrcRC->contains(DstReg))
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
- DstReg).addReg(SrcReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
// Add register to return instruction.
RetRegs.push_back(VA.getLocReg());
}
unsigned RetOpc = isThumb2 ? ARM::tBX_RET : ARM::BX_RET;
- MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(RetOpc));
AddOptionalDefs(MIB);
for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
@@ -2243,7 +2181,7 @@ unsigned ARMFastISel::getLibcallReg(const Twine &Name) {
EVT LCREVT = TLI.getValueType(GVTy);
if (!LCREVT.isSimple()) return 0;
- GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false,
+ GlobalValue *GV = new GlobalVariable(M, Type::getInt32Ty(*Context), false,
GlobalValue::ExternalLinkage, 0, Name);
assert(GV->getType() == GVTy && "We miscomputed the type for the global!");
return ARMMaterializeGV(GV, LCREVT.getSimpleVT());
@@ -2295,7 +2233,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
if (!isTypeLegal(ArgTy, ArgVT)) return false;
ISD::ArgFlagsTy Flags;
- unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+ unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
Flags.setOrigAlign(OriginalAlignment);
Args.push_back(Op);
@@ -2320,7 +2258,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
// Issue the call.
unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DL, TII.get(CallOpc));
+ DbgLoc, TII.get(CallOpc));
// BL / BLX don't take a predicate, but tBL / tBLX do.
if (isThumb2)
AddDefaultPred(MIB);
@@ -2428,7 +2366,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
if (Arg == 0)
return false;
- unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+ unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
Flags.setOrigAlign(OriginalAlignment);
Args.push_back(*i);
@@ -2461,7 +2399,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// Issue the call.
unsigned CallOpc = ARMSelectCallOp(UseReg);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DL, TII.get(CallOpc));
+ DbgLoc, TII.get(CallOpc));
unsigned char OpFlags = 0;
@@ -2578,7 +2516,7 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue();
while (Depth--) {
DestReg = createResultReg(RC);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(LdrOpc), DestReg)
.addReg(SrcReg).addImm(0));
SrcReg = DestReg;
@@ -2635,7 +2573,7 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
return SelectCall(&I, "memset");
}
case Intrinsic::trap: {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(
Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP));
return true;
}
@@ -2788,7 +2726,7 @@ unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
unsigned ImmEnc = ImmIsSO ? ARM_AM::getSORegOpc(ShiftAM, Imm) : Imm;
bool isKill = 1 == Instr;
MachineInstrBuilder MIB = BuildMI(
- *FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opcode), ResultReg);
+ *FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opcode), ResultReg);
if (setsCPSR)
MIB.addReg(ARM::CPSR, RegState::Define);
SrcReg = constrainOperandRegClass(TII.get(Opcode), SrcReg, 1 + setsCPSR);
@@ -2866,7 +2804,7 @@ bool ARMFastISel::SelectShift(const Instruction *I,
unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
if(ResultReg == 0) return false;
- MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg)
.addReg(Reg1);
@@ -3027,7 +2965,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
// Load value.
if (isThumb2) {
DestReg1 = constrainOperandRegClass(TII.get(ARM::t2LDRpci), DestReg1, 0);
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::t2LDRpci), DestReg1)
.addConstantPoolIndex(Idx));
Opc = UseGOTOFF ? ARM::t2ADDrr : ARM::t2LDRs;
@@ -3035,7 +2973,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
// The extra immediate is for addrmode2.
DestReg1 = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg1, 0);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DL, TII.get(ARM::LDRcp), DestReg1)
+ DbgLoc, TII.get(ARM::LDRcp), DestReg1)
.addConstantPoolIndex(Idx).addImm(0));
Opc = UseGOTOFF ? ARM::ADDrr : ARM::LDRrs;
}
@@ -3051,7 +2989,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
DestReg1 = constrainOperandRegClass(TII.get(Opc), DestReg1, 1);
GlobalBaseReg = constrainOperandRegClass(TII.get(Opc), GlobalBaseReg, 2);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DL, TII.get(Opc), DestReg2)
+ DbgLoc, TII.get(Opc), DestReg2)
.addReg(DestReg1)
.addReg(GlobalBaseReg);
if (!UseGOTOFF)
@@ -3125,7 +3063,8 @@ bool ARMFastISel::FastLowerArguments() {
// Without this, EmitLiveInCopies may eliminate the livein if its only
// use is a bitcast (which isn't turned into an instruction).
unsigned ResultReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY),
ResultReg).addReg(DstReg, getKillRegState(true));
UpdateValueMap(I, ResultReg);
}
@@ -3141,7 +3080,7 @@ namespace llvm {
const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>();
// Thumb2 support on iOS; ARM support on iOS, Linux and NaCl.
bool UseFastISel = false;
- UseFastISel |= Subtarget->isTargetIOS() && !Subtarget->isThumb1Only();
+ UseFastISel |= Subtarget->isTargetMachO() && !Subtarget->isThumb1Only();
UseFastISel |= Subtarget->isTargetLinux() && !Subtarget->isThumb();
UseFastISel |= Subtarget->isTargetNaCl() && !Subtarget->isThumb();
diff --git a/lib/Target/ARM/ARMFeatures.h b/lib/Target/ARM/ARMFeatures.h
index dafc4b3..a30f4cd 100644
--- a/lib/Target/ARM/ARMFeatures.h
+++ b/lib/Target/ARM/ARMFeatures.h
@@ -14,37 +14,27 @@
#ifndef TARGET_ARM_FEATURES_H
#define TARGET_ARM_FEATURES_H
-#include "ARM.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+
+template<typename InstrType> // could be MachineInstr or MCInst
+bool IsCPSRDead(InstrType *Instr);
namespace llvm {
template<typename InstrType> // could be MachineInstr or MCInst
-inline bool isV8EligibleForIT(InstrType *Instr, int BLXOperandIndex = 0) {
+inline bool isV8EligibleForIT(InstrType *Instr) {
switch (Instr->getOpcode()) {
default:
return false;
case ARM::tADC:
case ARM::tADDi3:
case ARM::tADDi8:
- case ARM::tADDrSPi:
case ARM::tADDrr:
case ARM::tAND:
case ARM::tASRri:
case ARM::tASRrr:
case ARM::tBIC:
- case ARM::tCMNz:
- case ARM::tCMPi8:
- case ARM::tCMPr:
case ARM::tEOR:
- case ARM::tLDRBi:
- case ARM::tLDRBr:
- case ARM::tLDRHi:
- case ARM::tLDRHr:
- case ARM::tLDRSB:
- case ARM::tLDRSH:
- case ARM::tLDRi:
- case ARM::tLDRr:
- case ARM::tLDRspi:
case ARM::tLSLri:
case ARM::tLSLrr:
case ARM::tLSRri:
@@ -56,6 +46,24 @@ inline bool isV8EligibleForIT(InstrType *Instr, int BLXOperandIndex = 0) {
case ARM::tROR:
case ARM::tRSB:
case ARM::tSBC:
+ case ARM::tSUBi3:
+ case ARM::tSUBi8:
+ case ARM::tSUBrr:
+ // Outside of an IT block, these set CPSR.
+ return IsCPSRDead(Instr);
+ case ARM::tADDrSPi:
+ case ARM::tCMNz:
+ case ARM::tCMPi8:
+ case ARM::tCMPr:
+ case ARM::tLDRBi:
+ case ARM::tLDRBr:
+ case ARM::tLDRHi:
+ case ARM::tLDRHr:
+ case ARM::tLDRSB:
+ case ARM::tLDRSH:
+ case ARM::tLDRi:
+ case ARM::tLDRr:
+ case ARM::tLDRspi:
case ARM::tSTRBi:
case ARM::tSTRBr:
case ARM::tSTRHi:
@@ -63,21 +71,17 @@ inline bool isV8EligibleForIT(InstrType *Instr, int BLXOperandIndex = 0) {
case ARM::tSTRi:
case ARM::tSTRr:
case ARM::tSTRspi:
- case ARM::tSUBi3:
- case ARM::tSUBi8:
- case ARM::tSUBrr:
case ARM::tTST:
return true;
// there are some "conditionally deprecated" opcodes
case ARM::tADDspr:
+ case ARM::tBLXr:
return Instr->getOperand(2).getReg() != ARM::PC;
// ADD PC, SP and BLX PC were always unpredictable,
// now on top of it they're deprecated
case ARM::tADDrSP:
case ARM::tBX:
return Instr->getOperand(0).getReg() != ARM::PC;
- case ARM::tBLXr:
- return Instr->getOperand(BLXOperandIndex).getReg() != ARM::PC;
case ARM::tADDhirr:
return Instr->getOperand(0).getReg() != ARM::PC &&
Instr->getOperand(2).getReg() != ARM::PC;
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index d32bdbc..36ecfca 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -14,15 +14,18 @@
#include "ARMFrameLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetOptions.h"
@@ -129,11 +132,24 @@ static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB,
MIFlags, Pred, PredReg);
}
+static int sizeOfSPAdjustment(const MachineInstr *MI) {
+ assert(MI->getOpcode() == ARM::VSTMDDB_UPD);
+ int count = 0;
+ // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
+ // pred) so the list starts at 4.
+ for (int i = MI->getNumOperands() - 1; i >= 4; --i)
+ count += 8;
+ return count;
+}
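A quick aside on sizeOfSPAdjustment above: a VSTMDDB_UPD (vpush) instruction carries four fixed operands (SP def, SP use, predicate immediate, predicate register) followed by one operand per stored D register, and each D register occupies 8 bytes, which is exactly what the loop counts. A standalone restatement of that rule, purely illustrative and not part of the patch (the helper name is hypothetical):

    // Hypothetical helper mirroring sizeOfSPAdjustment's counting rule: the SP
    // adjustment of a vpush is 8 bytes per D-register operand, i.e. per operand
    // beyond the four fixed ones.
    static int spAdjustmentForVpush(int NumOperands) {
      const int FixedOperands = 4; // sp (def), sp (use), predicate imm, predicate reg
      return (NumOperands - FixedOperands) * 8;
    }
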
+
void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock &MBB = MF.front();
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo *MFI = MF.getFrameInfo();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ MachineModuleInfo &MMI = MF.getMMI();
+ MCContext &Context = MMI.getContext();
+ const MCRegisterInfo *MRI = Context.getRegisterInfo();
const ARMBaseRegisterInfo *RegInfo =
static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
const ARMBaseInstrInfo &TII =
@@ -147,6 +163,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ int CFAOffset = 0;
// Determine the sizes of each callee-save spill areas and record which frame
// belongs to which callee-save spill areas.
@@ -159,22 +176,45 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
return;
- // Allocate the vararg register save area. This is not counted in NumBytes.
- if (ArgRegsSaveSize)
+ // Allocate the vararg register save area.
+ if (ArgRegsSaveSize) {
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize,
MachineInstr::FrameSetup);
+ CFAOffset -= ArgRegsSaveSize;
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
if (!AFI->hasStackFrame()) {
- if (NumBytes != 0)
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
+ if (NumBytes - ArgRegsSaveSize != 0) {
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -(NumBytes - ArgRegsSaveSize),
MachineInstr::FrameSetup);
+ CFAOffset -= NumBytes - ArgRegsSaveSize;
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
return;
}
+ // Determine spill area sizes.
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
int FI = CSI[i].getFrameIdx();
switch (Reg) {
+ case ARM::R8:
+ case ARM::R9:
+ case ARM::R10:
+ case ARM::R11:
+ case ARM::R12:
+ if (STI.isTargetMachO()) {
+ GPRCS2Size += 4;
+ break;
+ }
+ // fallthrough
case ARM::R0:
case ARM::R1:
case ARM::R2:
@@ -188,18 +228,6 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
FramePtrSpillFI = FI;
GPRCS1Size += 4;
break;
- case ARM::R8:
- case ARM::R9:
- case ARM::R10:
- case ARM::R11:
- case ARM::R12:
- if (Reg == FramePtr)
- FramePtrSpillFI = FI;
- if (STI.isTargetIOS())
- GPRCS2Size += 4;
- else
- GPRCS1Size += 4;
- break;
default:
// This is a DPR. Exclude the aligned DPRCS2 spills.
if (Reg == ARM::D8)
@@ -210,18 +238,21 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
}
// Move past area 1.
- MachineBasicBlock::iterator LastPush = MBB.end(), FramePtrPush;
+ MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push,
+ DPRCSPush;
if (GPRCS1Size > 0)
- FramePtrPush = LastPush = MBBI++;
+ GPRCS1Push = LastPush = MBBI++;
// Determine starting offsets of spill areas.
bool HasFP = hasFP(MF);
- unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+ unsigned DPRCSOffset = NumBytes - (ArgRegsSaveSize + GPRCS1Size
+ + GPRCS2Size + DPRCSSize);
unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
int FramePtrOffsetInPush = 0;
if (HasFP) {
- FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
+ FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI)
+ + GPRCS1Size + ArgRegsSaveSize;
AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
NumBytes);
}
@@ -230,13 +261,12 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
// Move past area 2.
- if (GPRCS2Size > 0) {
- LastPush = MBBI++;
- }
+ if (GPRCS2Size > 0)
+ GPRCS2Push = LastPush = MBBI++;
// Move past area 3.
if (DPRCSSize > 0) {
- LastPush = MBBI++;
+ DPRCSPush = MBBI;
// Since vpush register list cannot have gaps, there may be multiple vpush
// instructions in the prologue.
while (MBBI->getOpcode() == ARM::VSTMDDB_UPD)
@@ -254,11 +284,15 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
} else
NumBytes = DPRCSOffset;
+ unsigned adjustedGPRCS1Size = GPRCS1Size;
if (NumBytes) {
// Adjust SP after all the callee-save spills.
- if (tryFoldSPUpdateIntoPushPop(MF, LastPush, NumBytes)) {
- if (LastPush == FramePtrPush)
+ if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes)) {
+ if (LastPush == GPRCS1Push) {
FramePtrOffsetInPush += NumBytes;
+ adjustedGPRCS1Size += NumBytes;
+ NumBytes = 0;
+ }
} else
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
MachineInstr::FrameSetup);
@@ -275,17 +309,141 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
AFI->setShouldRestoreSPFromFP(true);
}
+ if (adjustedGPRCS1Size > 0) {
+ CFAOffset -= adjustedGPRCS1Size;
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ MachineBasicBlock::iterator Pos = ++GPRCS1Push;
+ BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+ E = CSI.end(); I != E; ++I) {
+ unsigned Reg = I->getReg();
+ int FI = I->getFrameIdx();
+ switch (Reg) {
+ case ARM::R8:
+ case ARM::R9:
+ case ARM::R10:
+ case ARM::R11:
+ case ARM::R12:
+ if (STI.isTargetMachO())
+ break;
+ // fallthrough
+ case ARM::R0:
+ case ARM::R1:
+ case ARM::R2:
+ case ARM::R3:
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ case ARM::LR:
+ CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(Reg, true), MFI->getObjectOffset(FI)));
+ BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ break;
+ }
+ }
+ }
+
// Set FP to point to the stack slot that contains the previous FP.
// For iOS, FP is R7, which has now been stored in spill area 1.
// Otherwise, if this is not iOS, all the callee-saved registers go
// into spill area 1, including the FP in R11. In either case, it
// is in area one and the adjustment needs to take place just after
// that push.
- if (HasFP)
- emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, ++FramePtrPush, dl, TII,
+ if (HasFP) {
+ emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, GPRCS1Push, dl, TII,
FramePtr, ARM::SP, FramePtrOffsetInPush,
MachineInstr::FrameSetup);
+ if (FramePtrOffsetInPush) {
+ CFAOffset += FramePtrOffsetInPush;
+ unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa(
+ nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset));
+ BuildMI(MBB, GPRCS1Push, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ } else {
+ unsigned CFIIndex =
+ MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+ nullptr, MRI->getDwarfRegNum(FramePtr, true)));
+ BuildMI(MBB, GPRCS1Push, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
+
+ if (GPRCS2Size > 0) {
+ MachineBasicBlock::iterator Pos = ++GPRCS2Push;
+ if (!HasFP) {
+ CFAOffset -= GPRCS2Size;
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+ E = CSI.end(); I != E; ++I) {
+ unsigned Reg = I->getReg();
+ int FI = I->getFrameIdx();
+ switch (Reg) {
+ case ARM::R8:
+ case ARM::R9:
+ case ARM::R10:
+ case ARM::R11:
+ case ARM::R12:
+ if (STI.isTargetMachO()) {
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ unsigned Offset = MFI->getObjectOffset(FI);
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+ BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ break;
+ }
+ }
+ }
+ if (DPRCSSize > 0) {
+ // Since vpush register list cannot have gaps, there may be multiple vpush
+ // instructions in the prologue.
+ do {
+ MachineBasicBlock::iterator Push = DPRCSPush++;
+ if (!HasFP) {
+ CFAOffset -= sizeOfSPAdjustment(Push);
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ BuildMI(MBB, DPRCSPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ } while (DPRCSPush->getOpcode() == ARM::VSTMDDB_UPD);
+
+ for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+ E = CSI.end(); I != E; ++I) {
+ unsigned Reg = I->getReg();
+ int FI = I->getFrameIdx();
+ if ((Reg >= ARM::D0 && Reg <= ARM::D31) &&
+ (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) {
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ unsigned Offset = MFI->getObjectOffset(FI);
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+ BuildMI(MBB, DPRCSPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
+ }
+
+ if (NumBytes) {
+ if (!HasFP) {
+ CFAOffset -= NumBytes;
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
if (STI.isTargetELF() && hasFP(MF))
MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
@@ -378,8 +536,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
return;
if (!AFI->hasStackFrame()) {
- if (NumBytes != 0)
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+ if (NumBytes - ArgRegsSaveSize != 0)
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize);
} else {
// Unwind MBBI to point to first LDR / VLDRD.
const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
@@ -392,7 +550,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
}
// Move SP to start of FP callee save spill area.
- NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+ NumBytes -= (ArgRegsSaveSize +
+ AFI->getGPRCalleeSavedArea1Size() +
AFI->getGPRCalleeSavedArea2Size() +
AFI->getDPRCalleeSavedAreaSize());
@@ -430,7 +589,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
ARM::SP)
.addReg(FramePtr));
}
- } else if (NumBytes && !tryFoldSPUpdateIntoPushPop(MF, MBBI, NumBytes))
+ } else if (NumBytes &&
+ !tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes))
emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
// Increment past our save areas.
@@ -453,7 +613,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
// Jump to label or value in register.
if (RetOpcode == ARM::TCRETURNdi) {
unsigned TCOpcode = STI.isThumb() ?
- (STI.isTargetIOS() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) :
+ (STI.isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) :
ARM::TAILJMPd;
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode));
if (JumpTarget.isGlobal())
@@ -473,7 +633,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
addReg(JumpTarget.getReg(), RegState::Kill);
}
- MachineInstr *NewMI = prior(MBBI);
+ MachineInstr *NewMI = std::prev(MBBI);
for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
NewMI->addOperand(MBBI->getOperand(i));
@@ -598,7 +758,7 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
unsigned LastReg = 0;
for (; i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
- if (!(Func)(Reg, STI.isTargetIOS())) continue;
+ if (!(Func)(Reg, STI.isTargetMachO())) continue;
// D-registers in the aligned area DPRCS2 are NOT spilled here.
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
@@ -644,6 +804,11 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
AddDefaultPred(MIB);
}
Regs.clear();
+
+ // Put any subsequent vpush instructions before this one: they will refer to
+ // higher register numbers so need to be pushed first in order to preserve
+ // monotonicity.
+ --MI;
}
}
@@ -671,7 +836,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
bool DeleteRet = false;
for (; i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
- if (!(Func)(Reg, STI.isTargetIOS())) continue;
+ if (!(Func)(Reg, STI.isTargetMachO())) continue;
// The aligned reloads from area DPRCS2 are not inserted here.
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
@@ -727,6 +892,10 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
AddDefaultPred(MIB);
}
Regs.clear();
+
+ // Put any subsequent vpop instructions after this one: they will refer to
+ // higher register numbers so need to be popped afterwards.
+ ++MI;
}
}
@@ -858,7 +1027,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
}
// The last spill instruction inserted should kill the scratch register r4.
- llvm::prior(MI)->addRegisterKilled(ARM::R4, TRI);
+ std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
}
/// Skip past the code inserted by emitAlignedDPRCS2Spills, and return an
@@ -968,7 +1137,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
.addReg(ARM::R4).addImm(2*(NextReg-R4BaseReg)));
// Last store kills r4.
- llvm::prior(MI)->addRegisterKilled(ARM::R4, TRI);
+ std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
}
bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
@@ -1220,7 +1389,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
if (Spilled) {
NumGPRSpills++;
- if (!STI.isTargetIOS()) {
+ if (!STI.isTargetMachO()) {
if (Reg == ARM::LR)
LRSpilled = true;
CS1Spilled = true;
@@ -1242,7 +1411,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
break;
}
} else {
- if (!STI.isTargetIOS()) {
+ if (!STI.isTargetMachO()) {
UnspilledCS1GPRs.push_back(Reg);
continue;
}
@@ -1444,3 +1613,366 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MBB.erase(I);
}
+/// Get the minimum constant for ARM that is greater than or equal to the
+/// argument. In ARM, constants can have any value that can be produced by
+/// rotating an 8-bit value to the right by an even number of bits within a
+/// 32-bit word.
+static uint32_t alignToARMConstant(uint32_t Value) {
+ unsigned Shifted = 0;
+
+ if (Value == 0)
+ return 0;
+
+ while (!(Value & 0xC0000000)) {
+ Value = Value << 2;
+ Shifted += 2;
+ }
+
+ bool Carry = (Value & 0x00FFFFFF);
+ Value = ((Value & 0xFF000000) >> 24) + Carry;
+
+ if (Value & 0x0000100)
+ Value = Value & 0x000001FC;
+
+ if (Shifted > 24)
+ Value = Value >> (Shifted - 24);
+ else
+ Value = Value << (24 - Shifted);
+
+ return Value;
+}
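For reference, the encodability rule the comment above appeals to can be checked directly: a 32-bit value is a valid ARM modified immediate if some even right-rotation of an 8-bit payload produces it, and alignToARMConstant is meant to return the smallest such encodable value that is not below its argument. A small standalone checker, given only as an illustration (it is not part of the patch):

    #include <cstdint>

    // Returns true if V is encodable as an ARM modified immediate, i.e. as an
    // 8-bit value rotated right by an even number of bit positions.
    static bool isARMModifiedImm(uint32_t V) {
      for (unsigned Rot = 0; Rot < 32; Rot += 2) {
        // Rotating V left by Rot undoes a right rotation of the 8-bit payload.
        uint32_t Payload = Rot ? ((V << Rot) | (V >> (32 - Rot))) : V;
        if (Payload <= 0xFF)
          return true;
      }
      return false;
    }
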
+
+// The stack limit in the TCB is set to this many bytes above the actual
+// stack limit.
+static const uint64_t kSplitStackAvailable = 256;
+
+// Adjust the function prologue to enable split stacks. This currently only
+// supports android and linux.
+//
+// The ABI of the segmented stack prologue is somewhat arbitrary, but
+// must be well defined in order to allow for consistent implementations of the
+// __morestack helper function. The ABI is also not a normal ABI in that it
+// doesn't follow the normal calling conventions because this allows the
+// prologue of each function to be optimized further.
+//
+// Currently, the ABI looks like (when calling __morestack)
+//
+// * r4 holds the minimum stack size requested for this function call
+// * r5 holds the stack size of the arguments to the function
+// * the beginning of the function is 3 instructions after the call to
+// __morestack
+//
+// Implementations of __morestack should use r4 to allocate a new stack, r5 to
+// place the arguments onto the new stack, and the 3-instruction knowledge to
+// jump directly to the body of the function when working on the new stack.
+//
+// An old (and possibly no longer compatible) implementation of __morestack for
+// ARM can be found at [1].
+//
+// [1] - https://github.com/mozilla/rust/blob/86efd9/src/rt/arch/arm/morestack.S
+void ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
+ unsigned Opcode;
+ unsigned CFIIndex;
+ const ARMSubtarget *ST = &MF.getTarget().getSubtarget<ARMSubtarget>();
+ bool Thumb = ST->isThumb();
+
+ // Sadly, this currently doesn't support varargs or platforms other than
+ // android/linux. Note that thumb1/thumb2 are supported for android/linux.
+ if (MF.getFunction()->isVarArg())
+ report_fatal_error("Segmented stacks do not support vararg functions.");
+ if (!ST->isTargetAndroid() && !ST->isTargetLinux())
+ report_fatal_error("Segmented stacks not supported on this platform.");
+
+ MachineBasicBlock &prologueMBB = MF.front();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ MCContext &Context = MMI.getContext();
+ const MCRegisterInfo *MRI = Context.getRegisterInfo();
+ const ARMBaseInstrInfo &TII =
+ *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+ ARMFunctionInfo *ARMFI = MF.getInfo<ARMFunctionInfo>();
+ DebugLoc DL;
+
+ // Use R4 and R5 as scratch registers.
+ // We save R4 and R5 before use and restore them before leaving the function.
+ unsigned ScratchReg0 = ARM::R4;
+ unsigned ScratchReg1 = ARM::R5;
+ uint64_t AlignedStackSize;
+
+ MachineBasicBlock *PrevStackMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *PostStackMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *AllocMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *GetMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *McrMBB = MF.CreateMachineBasicBlock();
+
+ for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
+ e = prologueMBB.livein_end();
+ i != e; ++i) {
+ AllocMBB->addLiveIn(*i);
+ GetMBB->addLiveIn(*i);
+ McrMBB->addLiveIn(*i);
+ PrevStackMBB->addLiveIn(*i);
+ PostStackMBB->addLiveIn(*i);
+ }
+
+ MF.push_front(PostStackMBB);
+ MF.push_front(AllocMBB);
+ MF.push_front(GetMBB);
+ MF.push_front(McrMBB);
+ MF.push_front(PrevStackMBB);
+
+ // The required stack size, aligned to the ARM constant-encoding criterion.
+ uint64_t StackSize = MFI->getStackSize();
+
+ AlignedStackSize = alignToARMConstant(StackSize);
+
+ // When the frame size is less than 256 we just compare the stack
+ // boundary directly to the value of the stack pointer, per gcc.
+ bool CompareStackPointer = AlignedStackSize < kSplitStackAvailable;
+
+ // We will use two of the callee save registers as scratch registers so we
+ // need to save those registers onto the stack.
+ // We will use SR0 to hold stack limit and SR1 to hold the stack size
+ // requested and arguments for __morestack().
+ // SR0: Scratch Register #0
+ // SR1: Scratch Register #1
+ // push {SR0, SR1}
+ if (Thumb) {
+ AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::tPUSH)))
+ .addReg(ScratchReg0).addReg(ScratchReg1);
+ } else {
+ AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::STMDB_UPD))
+ .addReg(ARM::SP, RegState::Define).addReg(ARM::SP))
+ .addReg(ScratchReg0).addReg(ScratchReg1);
+ }
+
+ // Emit the relevant DWARF information about the change in stack pointer as
+ // well as where to find both r4 and r5 (the callee-save registers)
+ CFIIndex =
+ MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -8));
+ BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(ScratchReg1, true), -4));
+ BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(ScratchReg0, true), -8));
+ BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // mov SR1, sp
+ if (Thumb) {
+ AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), ScratchReg1)
+ .addReg(ARM::SP));
+ } else if (CompareStackPointer) {
+ AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MOVr), ScratchReg1)
+ .addReg(ARM::SP)).addReg(0);
+ }
+
+ // sub SR1, sp, #StackSize
+ if (!CompareStackPointer && Thumb) {
+ AddDefaultPred(
+ AddDefaultCC(BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1))
+ .addReg(ScratchReg1).addImm(AlignedStackSize));
+ } else if (!CompareStackPointer) {
+ AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1)
+ .addReg(ARM::SP).addImm(AlignedStackSize)).addReg(0);
+ }
+
+ if (Thumb && ST->isThumb1Only()) {
+ unsigned PCLabelId = ARMFI->createPICLabelUId();
+ ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create(
+ MF.getFunction()->getContext(), "__STACK_LIMIT", PCLabelId, 0);
+ MachineConstantPool *MCP = MF.getConstantPool();
+ unsigned CPI = MCP->getConstantPoolIndex(NewCPV, MF.getAlignment());
+
+ // ldr SR0, [pc, offset(STACK_LIMIT)]
+ AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
+ .addConstantPoolIndex(CPI));
+
+ // ldr SR0, [SR0]
+ AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRi), ScratchReg0)
+ .addReg(ScratchReg0).addImm(0));
+ } else {
+ // Get TLS base address from the coprocessor
+ // mrc p15, #0, SR0, c13, c0, #3
+ AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MRC), ScratchReg0)
+ .addImm(15)
+ .addImm(0)
+ .addImm(13)
+ .addImm(0)
+ .addImm(3));
+
+ // Use the last TLS slot on android and a private field of the TCB on linux.
+ assert(ST->isTargetAndroid() || ST->isTargetLinux());
+ unsigned TlsOffset = ST->isTargetAndroid() ? 63 : 1;
+
+ // Get the stack limit from the right offset
+ // ldr SR0, [sr0, #4 * TlsOffset]
+ AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ScratchReg0)
+ .addReg(ScratchReg0).addImm(4 * TlsOffset));
+ }
+
+ // Compare stack limit with stack size requested.
+ // cmp SR0, SR1
+ Opcode = Thumb ? ARM::tCMPr : ARM::CMPrr;
+ AddDefaultPred(BuildMI(GetMBB, DL, TII.get(Opcode))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1));
+
+ // This jump is taken if StackLimit < SP - stack required.
+ Opcode = Thumb ? ARM::tBcc : ARM::Bcc;
+ BuildMI(GetMBB, DL, TII.get(Opcode)).addMBB(PostStackMBB)
+ .addImm(ARMCC::LO)
+ .addReg(ARM::CPSR);
+
+
+ // Calling __morestack(StackSize, Size of stack arguments).
+ // __morestack knows that the stack size requested is in SR0 (r4)
+ // and the size of the stack arguments is in SR1 (r5).
+
+ // Pass the first argument to __morestack in Scratch Register #0:
+ // the amount of stack required.
+ if (Thumb) {
+ AddDefaultPred(AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8),
+ ScratchReg0)).addImm(AlignedStackSize));
+ } else {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0)
+ .addImm(AlignedStackSize)).addReg(0);
+ }
+ // Pass the second argument to __morestack in Scratch Register #1:
+ // the amount of stack consumed to save the function arguments.
+ if (Thumb) {
+ AddDefaultPred(
+ AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1))
+ .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())));
+ } else {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1)
+ .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())))
+ .addReg(0);
+ }
+
+ // push {lr} - Save return address of this function.
+ if (Thumb) {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPUSH)))
+ .addReg(ARM::LR);
+ } else {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::STMDB_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP))
+ .addReg(ARM::LR);
+ }
+
+ // Emit the DWARF info about the change in stack as well as where to find the
+ // previous link register
+ CFIIndex =
+ MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -12));
+ BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(ARM::LR, true), -12));
+ BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Call __morestack().
+ if (Thumb) {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tBL)))
+ .addExternalSymbol("__morestack");
+ } else {
+ BuildMI(AllocMBB, DL, TII.get(ARM::BL))
+ .addExternalSymbol("__morestack");
+ }
+
+ // pop {lr} - Restore return address of this original function.
+ if (Thumb) {
+ if (ST->isThumb1Only()) {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
+ .addReg(ScratchReg0);
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::LR)
+ .addReg(ScratchReg0));
+ } else {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::t2LDR_POST))
+ .addReg(ARM::LR, RegState::Define)
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .addImm(4));
+ }
+ } else {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP))
+ .addReg(ARM::LR);
+ }
+
+ // Restore SR0 and SR1 in case __morestack() was called.
+ // __morestack() will skip PostStackMBB block so we need to restore
+ // scratch registers from here.
+ // pop {SR0, SR1}
+ if (Thumb) {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
+ } else {
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
+ }
+
+ // Update the CFA offset now that we've popped
+ CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+ BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // bx lr - Return from this function.
+ Opcode = Thumb ? ARM::tBX_RET : ARM::BX_RET;
+ AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(Opcode)));
+
+ // Restore SR0 and SR1 in case __morestack() was not called.
+ // pop {SR0, SR1}
+ if (Thumb) {
+ AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::tPOP)))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
+ } else {
+ AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::LDMIA_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
+ }
+
+ // Update the CFA offset now that we've popped
+ CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+ BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Tell debuggers that r4 and r5 are now the same as they were in the
+ // previous function, that they're the "Same Value".
+ CFIIndex = MMI.addFrameInst(MCCFIInstruction::createSameValue(
+ nullptr, MRI->getDwarfRegNum(ScratchReg0, true)));
+ BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ CFIIndex = MMI.addFrameInst(MCCFIInstruction::createSameValue(
+ nullptr, MRI->getDwarfRegNum(ScratchReg1, true)));
+ BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Organizing MBB lists
+ PostStackMBB->addSuccessor(&prologueMBB);
+
+ AllocMBB->addSuccessor(PostStackMBB);
+
+ GetMBB->addSuccessor(PostStackMBB);
+ GetMBB->addSuccessor(AllocMBB);
+
+ McrMBB->addSuccessor(GetMBB);
+
+ PrevStackMBB->addSuccessor(McrMBB);
+
+#ifdef XDEBUG
+ MF.verify();
+#endif
+}
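Stepping back from adjustForSegmentedStacks: combining the ABI comment with the emitted compare-and-branch, the decision the prologue makes can be restated compactly. The sketch below is plain C++ for illustration only (the real code works on r4/r5 and the TLS stack-limit slot), and the parameter names simply mirror the locals used above:

    #include <cstdint>

    // Illustrative restatement of the emitted check, assuming a downward-growing
    // stack and a StackLimit already loaded from the TCB. CompareStackPointer is
    // true for frames smaller than kSplitStackAvailable.
    static bool mustCallMorestack(uintptr_t SP, uintptr_t StackLimit,
                                  uint32_t AlignedStackSize,
                                  bool CompareStackPointer) {
      uintptr_t Boundary = CompareStackPointer ? SP : SP - AlignedStackSize;
      // The emitted branch skips the __morestack call when StackLimit < Boundary.
      return !(StackLimit < Boundary);
    }
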
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index efa255a..524ee36 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -14,7 +14,6 @@
#ifndef ARM_FRAMEINFO_H
#define ARM_FRAMEINFO_H
-#include "ARM.h"
#include "ARMSubtarget.h"
#include "llvm/Target/TargetFrameLowering.h"
@@ -33,31 +32,32 @@ public:
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
- void emitPrologue(MachineFunction &MF) const;
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ void emitPrologue(MachineFunction &MF) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const;
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
- bool hasFP(const MachineFunction &MF) const;
- bool hasReservedCallFrame(const MachineFunction &MF) const;
- bool canSimplifyCallFramePseudos(const MachineFunction &MF) const;
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const;
- int ResolveFrameIndexReference(const MachineFunction &MF,
- int FI,
+ unsigned &FrameReg) const override;
+ int ResolveFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg, int SPAdj) const;
- int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+ int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const;
+ RegScavenger *RS) const override;
+
+ void adjustForSegmentedStacks(MachineFunction &MF) const;
private:
void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
@@ -71,10 +71,10 @@ public:
bool(*Func)(unsigned, bool),
unsigned NumAlignedDPRCS2Regs) const;
- virtual void eliminateCallFramePseudoInstr(
- MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const;
+ void
+ eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
};
} // End llvm namespace
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index c69d313..61d4e12 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -57,7 +57,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
(LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
MachineBasicBlock::iterator I = LastMI;
if (I != LastMI->getParent()->begin()) {
- I = llvm::prior(I);
+ I = std::prev(I);
DefMI = &*I;
}
}
diff --git a/lib/Target/ARM/ARMHazardRecognizer.h b/lib/Target/ARM/ARMHazardRecognizer.h
index e1dcec3..e88cd0d 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.h
+++ b/lib/Target/ARM/ARMHazardRecognizer.h
@@ -37,11 +37,11 @@ public:
: ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"),
LastMI(0) {}
- virtual HazardType getHazardType(SUnit *SU, int Stalls);
- virtual void Reset();
- virtual void EmitInstruction(SUnit *SU);
- virtual void AdvanceCycle();
- virtual void RecedeCycle();
+ HazardType getHazardType(SUnit *SU, int Stalls) override;
+ void Reset() override;
+ void EmitInstruction(SUnit *SU) override;
+ void AdvanceCycle() override;
+ void RecedeCycle() override;
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 87d1522..70e11c5 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -32,7 +32,6 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
@@ -73,11 +72,11 @@ public:
Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
}
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "ARM Instruction Selection";
}
- virtual void PreprocessISelDAG();
+ void PreprocessISelDAG() override;
/// getI32Imm - Return a target constant of type i32 with the specified
/// value.
@@ -85,7 +84,7 @@ public:
return CurDAG->getTargetConstant(Imm, MVT::i32);
}
- SDNode *Select(SDNode *N);
+ SDNode *Select(SDNode *N) override;
bool hasNoVMLxHazardUse(SDNode *N) const;
@@ -253,13 +252,10 @@ private:
SDNode *SelectConcatVector(SDNode *N);
- SDNode *SelectAtomic(SDNode *N, unsigned Op8, unsigned Op16, unsigned Op32, unsigned Op64);
-
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
- virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
- char ConstraintCode,
- std::vector<SDValue> &OutOps);
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+ std::vector<SDValue> &OutOps) override;
// Form pairs of consecutive R, S, D, or Q registers.
SDNode *createGPRPairNode(EVT VT, SDValue V0, SDValue V1);
@@ -414,8 +410,8 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
if (!CheckVMLxHazard)
return true;
- if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9() &&
- !Subtarget->isSwift())
+ if (!Subtarget->isCortexA7() && !Subtarget->isCortexA8() &&
+ !Subtarget->isCortexA9() && !Subtarget->isSwift())
return true;
if (!N->hasOneUse())
@@ -534,8 +530,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
}
if (N.getOpcode() == ARMISD::Wrapper &&
- !(Subtarget->useMovt() &&
- N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
Base = N.getOperand(0);
} else
Base = N;
@@ -702,8 +697,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
Base = CurDAG->getTargetFrameIndex(FI,
getTargetLowering()->getPointerTy());
} else if (N.getOpcode() == ARMISD::Wrapper &&
- !(Subtarget->useMovt() &&
- N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
Base = N.getOperand(0);
}
Offset = CurDAG->getRegister(0, MVT::i32);
@@ -963,8 +957,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
Base = CurDAG->getTargetFrameIndex(FI,
getTargetLowering()->getPointerTy());
} else if (N.getOpcode() == ARMISD::Wrapper &&
- !(Subtarget->useMovt() &&
- N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
Base = N.getOperand(0);
}
Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
@@ -1141,8 +1134,7 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
if (!CurDAG->isBaseWithConstantOffset(N)) {
if (N.getOpcode() == ARMISD::Wrapper &&
- !(Subtarget->useMovt() &&
- N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
Base = N.getOperand(0);
} else {
Base = N;
@@ -1278,8 +1270,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
}
if (N.getOpcode() == ARMISD::Wrapper &&
- !(Subtarget->useMovt() &&
- N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+ N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::TargetConstantPool)
return false; // We want to select t2LDRpci instead.
@@ -1412,7 +1403,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base,
SDValue &OffImm) {
- // This *must* succeed since it's used for the irreplacable ldrex and strex
+ // This *must* succeed since it's used for the irreplaceable ldrex and strex
// instructions.
Base = N;
OffImm = CurDAG->getTargetConstant(0, MVT::i32);
@@ -1673,9 +1664,61 @@ SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, unsigned NumVecs,
return CurDAG->getTargetConstant(Alignment, MVT::i32);
}
+static bool isVLDfixed(unsigned Opc)
+{
+ switch (Opc) {
+ default: return false;
+ case ARM::VLD1d8wb_fixed : return true;
+ case ARM::VLD1d16wb_fixed : return true;
+ case ARM::VLD1d64Qwb_fixed : return true;
+ case ARM::VLD1d32wb_fixed : return true;
+ case ARM::VLD1d64wb_fixed : return true;
+ case ARM::VLD1d64TPseudoWB_fixed : return true;
+ case ARM::VLD1d64QPseudoWB_fixed : return true;
+ case ARM::VLD1q8wb_fixed : return true;
+ case ARM::VLD1q16wb_fixed : return true;
+ case ARM::VLD1q32wb_fixed : return true;
+ case ARM::VLD1q64wb_fixed : return true;
+ case ARM::VLD2d8wb_fixed : return true;
+ case ARM::VLD2d16wb_fixed : return true;
+ case ARM::VLD2d32wb_fixed : return true;
+ case ARM::VLD2q8PseudoWB_fixed : return true;
+ case ARM::VLD2q16PseudoWB_fixed : return true;
+ case ARM::VLD2q32PseudoWB_fixed : return true;
+ case ARM::VLD2DUPd8wb_fixed : return true;
+ case ARM::VLD2DUPd16wb_fixed : return true;
+ case ARM::VLD2DUPd32wb_fixed : return true;
+ }
+}
+
+static bool isVSTfixed(unsigned Opc)
+{
+ switch (Opc) {
+ default: return false;
+ case ARM::VST1d8wb_fixed : return true;
+ case ARM::VST1d16wb_fixed : return true;
+ case ARM::VST1d32wb_fixed : return true;
+ case ARM::VST1d64wb_fixed : return true;
+ case ARM::VST1q8wb_fixed : return true;
+ case ARM::VST1q16wb_fixed : return true;
+ case ARM::VST1q32wb_fixed : return true;
+ case ARM::VST1q64wb_fixed : return true;
+ case ARM::VST1d64TPseudoWB_fixed : return true;
+ case ARM::VST1d64QPseudoWB_fixed : return true;
+ case ARM::VST2d8wb_fixed : return true;
+ case ARM::VST2d16wb_fixed : return true;
+ case ARM::VST2d32wb_fixed : return true;
+ case ARM::VST2q8PseudoWB_fixed : return true;
+ case ARM::VST2q16PseudoWB_fixed : return true;
+ case ARM::VST2q32PseudoWB_fixed : return true;
+ }
+}
+
// Get the register stride update opcode of a VLD/VST instruction that
// is otherwise equivalent to the given fixed stride updating instruction.
static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
+ assert((isVLDfixed(Opc) || isVSTfixed(Opc))
+ && "Incorrect fixed stride updating instruction.");
switch (Opc) {
default: break;
case ARM::VLD1d8wb_fixed: return ARM::VLD1d8wb_register;
@@ -1686,6 +1729,10 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
case ARM::VLD1q16wb_fixed: return ARM::VLD1q16wb_register;
case ARM::VLD1q32wb_fixed: return ARM::VLD1q32wb_register;
case ARM::VLD1q64wb_fixed: return ARM::VLD1q64wb_register;
+ case ARM::VLD1d64Twb_fixed: return ARM::VLD1d64Twb_register;
+ case ARM::VLD1d64Qwb_fixed: return ARM::VLD1d64Qwb_register;
+ case ARM::VLD1d64TPseudoWB_fixed: return ARM::VLD1d64TPseudoWB_register;
+ case ARM::VLD1d64QPseudoWB_fixed: return ARM::VLD1d64QPseudoWB_register;
case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register;
case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register;
@@ -1785,11 +1832,11 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
SDValue Inc = N->getOperand(AddrOpIdx + 1);
// FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0
// case entirely when the rest are updated to that form, too.
- if ((NumVecs == 1 || NumVecs == 2) && !isa<ConstantSDNode>(Inc.getNode()))
+ if ((NumVecs <= 2) && !isa<ConstantSDNode>(Inc.getNode()))
Opc = getVLDSTRegisterUpdateOpcode(Opc);
- // We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
+ // FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
// check for that explicitly too. Horribly hacky, but temporary.
- if ((NumVecs != 1 && NumVecs != 2 && Opc != ARM::VLD1q64wb_fixed) ||
+ if ((NumVecs > 2 && !isVLDfixed(Opc)) ||
!isa<ConstantSDNode>(Inc.getNode()))
Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
}
@@ -1937,11 +1984,12 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
// case entirely when the rest are updated to that form, too.
if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode()))
Opc = getVLDSTRegisterUpdateOpcode(Opc);
- // We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
+ // FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
// check for that explicitly too. Horribly hacky, but temporary.
- if ((NumVecs > 2 && Opc != ARM::VST1q64wb_fixed) ||
- !isa<ConstantSDNode>(Inc.getNode()))
- Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+ if (!isa<ConstantSDNode>(Inc.getNode()))
+ Ops.push_back(Inc);
+ else if (NumVecs > 2 && !isVSTfixed(Opc))
+ Ops.push_back(Reg0);
}
Ops.push_back(SrcReg);
Ops.push_back(Pred);
@@ -2361,38 +2409,6 @@ SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
return createDRegPairNode(VT, N->getOperand(0), N->getOperand(1));
}
-SDNode *ARMDAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
- unsigned Op16,unsigned Op32,
- unsigned Op64) {
- // Mostly direct translation to the given operations, except that we preserve
- // the AtomicOrdering for use later on.
- AtomicSDNode *AN = cast<AtomicSDNode>(Node);
- EVT VT = AN->getMemoryVT();
-
- unsigned Op;
- SDVTList VTs = CurDAG->getVTList(AN->getValueType(0), MVT::Other);
- if (VT == MVT::i8)
- Op = Op8;
- else if (VT == MVT::i16)
- Op = Op16;
- else if (VT == MVT::i32)
- Op = Op32;
- else if (VT == MVT::i64) {
- Op = Op64;
- VTs = CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other);
- } else
- llvm_unreachable("Unexpected atomic operation");
-
- SmallVector<SDValue, 6> Ops;
- for (unsigned i = 1; i < AN->getNumOperands(); ++i)
- Ops.push_back(AN->getOperand(i));
-
- Ops.push_back(CurDAG->getTargetConstant(AN->getOrdering(), MVT::i32));
- Ops.push_back(AN->getOperand(0)); // Chain moves to the end
-
- return CurDAG->SelectNodeTo(Node, Op, VTs, &Ops[0], Ops.size());
-}
-
SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
@@ -2420,19 +2436,21 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
case ISD::Constant: {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
bool UseCP = true;
- if (Subtarget->hasThumb2())
+ if (Subtarget->useMovt())
// Thumb2-aware targets have the MOVT instruction, so all immediates can
// be done with MOV + MOVT, at worst.
- UseCP = 0;
+ UseCP = false;
else {
if (Subtarget->isThumb()) {
- UseCP = (Val > 255 && // MOV
- ~Val > 255 && // MOV + MVN
- !ARM_AM::isThumbImmShiftedVal(Val)); // MOV + LSL
+ UseCP = (Val > 255 && // MOV
+ ~Val > 255 && // MOV + MVN
+ !ARM_AM::isThumbImmShiftedVal(Val) && // MOV + LSL
+ !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW
} else
- UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV
- ARM_AM::getSOImmVal(~Val) == -1 && // MVN
- !ARM_AM::isSOImmTwoPartVal(Val)); // two instrs.
+ UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV
+ ARM_AM::getSOImmVal(~Val) == -1 && // MVN
+ !ARM_AM::isSOImmTwoPartVal(Val) && // two instrs.
+ !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW
}
if (UseCP) {
@@ -2442,7 +2460,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
getTargetLowering()->getPointerTy());
SDNode *ResNode;
- if (Subtarget->isThumb1Only()) {
+ if (Subtarget->isThumb()) {
SDValue Pred = getAL(CurDAG);
SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() };
@@ -2834,7 +2852,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo_UPD,
ARM::VLD3d16Pseudo_UPD,
ARM::VLD3d32Pseudo_UPD,
- ARM::VLD1q64wb_fixed};
+ ARM::VLD1d64TPseudoWB_fixed};
static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
ARM::VLD3q16Pseudo_UPD,
ARM::VLD3q32Pseudo_UPD };
@@ -2848,7 +2866,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD,
ARM::VLD4d16Pseudo_UPD,
ARM::VLD4d32Pseudo_UPD,
- ARM::VLD1q64wb_fixed};
+ ARM::VLD1d64QPseudoWB_fixed};
static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
ARM::VLD4q16Pseudo_UPD,
ARM::VLD4q32Pseudo_UPD };
@@ -2970,13 +2988,16 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
default:
break;
+ case Intrinsic::arm_ldaexd:
case Intrinsic::arm_ldrexd: {
- SDValue MemAddr = N->getOperand(2);
SDLoc dl(N);
SDValue Chain = N->getOperand(0);
-
+ SDValue MemAddr = N->getOperand(2);
bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
- unsigned NewOpc = isThumb ? ARM::t2LDREXD :ARM::LDREXD;
+
+ bool IsAcquire = IntNo == Intrinsic::arm_ldaexd;
+ unsigned NewOpc = isThumb ? (IsAcquire ? ARM::t2LDAEXD : ARM::t2LDREXD)
+ : (IsAcquire ? ARM::LDAEXD : ARM::LDREXD);
// arm_ldrexd returns a i64 value in {i32, i32}
std::vector<EVT> ResTys;
@@ -3028,7 +3049,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ReplaceUses(SDValue(N, 2), OutChain);
return NULL;
}
-
+ case Intrinsic::arm_stlexd:
case Intrinsic::arm_strexd: {
SDLoc dl(N);
SDValue Chain = N->getOperand(0);
@@ -3054,7 +3075,9 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
Ops.push_back(CurDAG->getRegister(0, MVT::i32));
Ops.push_back(Chain);
- unsigned NewOpc = isThumb ? ARM::t2STREXD : ARM::STREXD;
+ bool IsRelease = IntNo == Intrinsic::arm_stlexd;
+ unsigned NewOpc = isThumb ? (IsRelease ? ARM::t2STLEXD : ARM::t2STREXD)
+ : (IsRelease ? ARM::STLEXD : ARM::STREXD);
SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
// Transfer memoperands.
@@ -3263,91 +3286,6 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
case ISD::CONCAT_VECTORS:
return SelectConcatVector(N);
-
- case ISD::ATOMIC_LOAD:
- if (cast<AtomicSDNode>(N)->getMemoryVT() == MVT::i64)
- return SelectAtomic(N, 0, 0, 0, ARM::ATOMIC_LOAD_I64);
- else
- break;
-
- case ISD::ATOMIC_STORE:
- if (cast<AtomicSDNode>(N)->getMemoryVT() == MVT::i64)
- return SelectAtomic(N, 0, 0, 0, ARM::ATOMIC_STORE_I64);
- else
- break;
-
- case ISD::ATOMIC_LOAD_ADD:
- return SelectAtomic(N,
- ARM::ATOMIC_LOAD_ADD_I8,
- ARM::ATOMIC_LOAD_ADD_I16,
- ARM::ATOMIC_LOAD_ADD_I32,
- ARM::ATOMIC_LOAD_ADD_I64);
- case ISD::ATOMIC_LOAD_SUB:
- return SelectAtomic(N,
- ARM::ATOMIC_LOAD_SUB_I8,
- ARM::ATOMIC_LOAD_SUB_I16,
- ARM::ATOMIC_LOAD_SUB_I32,
- ARM::ATOMIC_LOAD_SUB_I64);
- case ISD::ATOMIC_LOAD_AND:
- return SelectAtomic(N,
- ARM::ATOMIC_LOAD_AND_I8,
- ARM::ATOMIC_LOAD_AND_I16,
- ARM::ATOMIC_LOAD_AND_I32,
- ARM::ATOMIC_LOAD_AND_I64);
- case ISD::ATOMIC_LOAD_OR:
- return SelectAtomic(N,
- ARM::ATOMIC_LOAD_OR_I8,
- ARM::ATOMIC_LOAD_OR_I16,
- ARM::ATOMIC_LOAD_OR_I32,
- ARM::ATOMIC_LOAD_OR_I64);
- case ISD::ATOMIC_LOAD_XOR:
- return SelectAtomic(N,
- ARM::ATOMIC_LOAD_XOR_I8,
- ARM::ATOMIC_LOAD_XOR_I16,
- ARM::ATOMIC_LOAD_XOR_I32,
- ARM::ATOMIC_LOAD_XOR_I64);
- case ISD::ATOMIC_LOAD_NAND:
- return SelectAtomic(N,
- ARM::ATOMIC_LOAD_NAND_I8,
- ARM::ATOMIC_LOAD_NAND_I16,
- ARM::ATOMIC_LOAD_NAND_I32,
- ARM::ATOMIC_LOAD_NAND_I64);
- case ISD::ATOMIC_LOAD_MIN:
- return SelectAtomic(N,
- ARM::ATOMIC_LOAD_MIN_I8,
- ARM::ATOMIC_LOAD_MIN_I16,
- ARM::ATOMIC_LOAD_MIN_I32,
- ARM::ATOMIC_LOAD_MIN_I64);
- case ISD::ATOMIC_LOAD_MAX:
- return SelectAtomic(N,
- ARM::ATOMIC_LOAD_MAX_I8,
- ARM::ATOMIC_LOAD_MAX_I16,
- ARM::ATOMIC_LOAD_MAX_I32,
- ARM::ATOMIC_LOAD_MAX_I64);
- case ISD::ATOMIC_LOAD_UMIN:
- return SelectAtomic(N,
- ARM::ATOMIC_LOAD_UMIN_I8,
- ARM::ATOMIC_LOAD_UMIN_I16,
- ARM::ATOMIC_LOAD_UMIN_I32,
- ARM::ATOMIC_LOAD_UMIN_I64);
- case ISD::ATOMIC_LOAD_UMAX:
- return SelectAtomic(N,
- ARM::ATOMIC_LOAD_UMAX_I8,
- ARM::ATOMIC_LOAD_UMAX_I16,
- ARM::ATOMIC_LOAD_UMAX_I32,
- ARM::ATOMIC_LOAD_UMAX_I64);
- case ISD::ATOMIC_SWAP:
- return SelectAtomic(N,
- ARM::ATOMIC_SWAP_I8,
- ARM::ATOMIC_SWAP_I16,
- ARM::ATOMIC_SWAP_I32,
- ARM::ATOMIC_SWAP_I64);
- case ISD::ATOMIC_CMP_SWAP:
- return SelectAtomic(N,
- ARM::ATOMIC_CMP_SWAP_I8,
- ARM::ATOMIC_CMP_SWAP_I16,
- ARM::ATOMIC_CMP_SWAP_I32,
- ARM::ATOMIC_CMP_SWAP_I64);
}
return SelectCode(N);
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 76a0a83..2ebad8e 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -14,7 +14,6 @@
#define DEBUG_TYPE "arm-isel"
#include "ARMISelLowering.h"
-#include "ARM.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
@@ -46,7 +45,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
#include <utility>
using namespace llvm;
@@ -55,12 +53,6 @@ STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
-// This option should go away when tail calls fully work.
-static cl::opt<bool>
-EnableARMTailCalls("arm-tail-calls", cl::Hidden,
- cl::desc("Generate tail calls (TEMPORARY OPTION)."),
- cl::init(false));
-
cl::opt<bool>
EnableARMLongCalls("arm-long-calls", cl::Hidden,
cl::desc("Generate calls via indirect call instructions"),
@@ -156,12 +148,12 @@ void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
}
void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
- addRegisterClass(VT, &ARM::QPRRegClass);
+ addRegisterClass(VT, &ARM::DPairRegClass);
addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}
static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
- if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin())
+ if (TM.getSubtarget<ARMSubtarget>().isTargetMachO())
return new TargetLoweringObjectFileMachO();
return new ARMElfTargetObjectFile();
@@ -175,7 +167,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
- if (Subtarget->isTargetIOS()) {
+ if (Subtarget->isTargetMachO()) {
// Uses VFP for Thumb libfuncs if available.
if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
Subtarget->hasARMOps()) {
@@ -258,7 +250,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setLibcallName(RTLIB::SRL_I128, 0);
setLibcallName(RTLIB::SRA_I128, 0);
- if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetDarwin()) {
+ if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO() &&
+ !Subtarget->isTargetWindows()) {
// Double-precision floating-point arithmetic helper functions
// RTABI chapter 4.1.2, Table 2
setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd");
@@ -733,8 +726,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- if (!Subtarget->isTargetDarwin()) {
- // Non-Darwin platforms may return values in these registers via the
+ if (!Subtarget->isTargetMachO()) {
+ // Non-MachO platforms may return values in these registers via the
// personality function.
setExceptionPointerRegister(ARM::R0);
setExceptionSelectorRegister(ARM::R1);
@@ -744,28 +737,16 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
// ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
// the default expansion.
if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
- // ATOMIC_FENCE needs custom lowering; the other 32-bit ones are legal and
- // handled normally.
+ // ATOMIC_FENCE needs custom lowering; the others should have been expanded
+ // to ldrex/strex loops already.
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
- // Custom lowering for 64-bit ops
- setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
// On v8, we have particularly efficient implementations of atomic fences
// if they can be combined with nearby atomic loads and stores.
if (!Subtarget->hasV8Ops()) {
// Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
setInsertFencesForAtomic(true);
}
- setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
} else {
// If there's anything we can use as a barrier, go through custom lowering
// for ATOMIC_FENCE.
@@ -920,44 +901,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}
-static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
- bool isThumb2, unsigned &LdrOpc,
- unsigned &StrOpc) {
- static const unsigned LoadBares[4][2] = {{ARM::LDREXB, ARM::t2LDREXB},
- {ARM::LDREXH, ARM::t2LDREXH},
- {ARM::LDREX, ARM::t2LDREX},
- {ARM::LDREXD, ARM::t2LDREXD}};
- static const unsigned LoadAcqs[4][2] = {{ARM::LDAEXB, ARM::t2LDAEXB},
- {ARM::LDAEXH, ARM::t2LDAEXH},
- {ARM::LDAEX, ARM::t2LDAEX},
- {ARM::LDAEXD, ARM::t2LDAEXD}};
- static const unsigned StoreBares[4][2] = {{ARM::STREXB, ARM::t2STREXB},
- {ARM::STREXH, ARM::t2STREXH},
- {ARM::STREX, ARM::t2STREX},
- {ARM::STREXD, ARM::t2STREXD}};
- static const unsigned StoreRels[4][2] = {{ARM::STLEXB, ARM::t2STLEXB},
- {ARM::STLEXH, ARM::t2STLEXH},
- {ARM::STLEX, ARM::t2STLEX},
- {ARM::STLEXD, ARM::t2STLEXD}};
-
- const unsigned (*LoadOps)[2], (*StoreOps)[2];
- if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
- LoadOps = LoadAcqs;
- else
- LoadOps = LoadBares;
-
- if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
- StoreOps = StoreRels;
- else
- StoreOps = StoreBares;
-
- assert(isPowerOf2_32(Size) && Size <= 8 &&
- "unsupported size for atomic binary op!");
-
- LdrOpc = LoadOps[Log2_32(Size)][isThumb2];
- StrOpc = StoreOps[Log2_32(Size)][isThumb2];
-}
-
// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
@@ -1009,7 +952,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
default: return 0;
case ARMISD::Wrapper: return "ARMISD::Wrapper";
- case ARMISD::WrapperDYN: return "ARMISD::WrapperDYN";
case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC";
case ARMISD::WrapperJT: return "ARMISD::WrapperJT";
case ARMISD::CALL: return "ARMISD::CALL";
@@ -1079,10 +1021,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VSHL: return "ARMISD::VSHL";
case ARMISD::VSHRs: return "ARMISD::VSHRs";
case ARMISD::VSHRu: return "ARMISD::VSHRu";
- case ARMISD::VSHLLs: return "ARMISD::VSHLLs";
- case ARMISD::VSHLLu: return "ARMISD::VSHLLu";
- case ARMISD::VSHLLi: return "ARMISD::VSHLLi";
- case ARMISD::VSHRN: return "ARMISD::VSHRN";
case ARMISD::VRSHRs: return "ARMISD::VRSHRs";
case ARMISD::VRSHRu: return "ARMISD::VRSHRu";
case ARMISD::VRSHRN: return "ARMISD::VRSHRN";
@@ -1450,9 +1388,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool isThisReturn = false;
bool isSibCall = false;
+
// Disable tail calls if they're not supported.
- if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
+ if (!Subtarget->supportsTailCall() || MF.getTarget().Options.DisableTailCalls)
isTailCall = false;
+
if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
@@ -1695,25 +1635,16 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const GlobalValue *GV = G->getGlobal();
isDirect = true;
bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
- bool isStub = (isExt && Subtarget->isTargetDarwin()) &&
+ bool isStub = (isExt && Subtarget->isTargetMachO()) &&
getTargetMachine().getRelocationModel() != Reloc::Static;
isARMFunc = !Subtarget->isThumb() || isStub;
// ARM call to a local ARM function is predicable.
isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
// tBX takes a register source operand.
- if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
- unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
- ARMConstantPoolValue *CPV =
- ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 4);
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
- CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- Callee = DAG.getLoad(getPointerTy(), dl,
- DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
- SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
- Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
- getPointerTy(), Callee, PICLabel);
+ if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+ assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
+ Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(),
+ DAG.getTargetGlobalAddress(GV, dl, getPointerTy()));
} else {
// On ELF targets for PIC code, direct calls should go through the PLT
unsigned OpFlags = 0;
@@ -1724,7 +1655,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
isDirect = true;
- bool isStub = Subtarget->isTargetDarwin() &&
+ bool isStub = Subtarget->isTargetMachO() &&
getTargetMachine().getRelocationModel() != Reloc::Static;
isARMFunc = !Subtarget->isThumb() || isStub;
// tBX takes a register source operand.
@@ -1755,8 +1686,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// FIXME: handle tail calls differently.
unsigned CallOpc;
- bool HasMinSizeAttr = MF.getFunction()->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ bool HasMinSizeAttr = Subtarget->isMinSize();
if (Subtarget->isThumb()) {
if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
@@ -1841,22 +1771,6 @@ ARMTargetLowering::HandleByVal(
State->getCallOrPrologue() == Call) &&
"unhandled ParmContext");
- // For in-prologue parameters handling, we also introduce stack offset
- // for byval registers: see CallingConvLower.cpp, CCState::HandleByVal.
- // This behaviour outsides AAPCS rules (5.5 Parameters Passing) of how
- // NSAA should be evaluted (NSAA means "next stacked argument address").
- // So: NextStackOffset = NSAAOffset + SizeOfByValParamsStoredInRegs.
- // Then: NSAAOffset = NextStackOffset - SizeOfByValParamsStoredInRegs.
- unsigned NSAAOffset = State->getNextStackOffset();
- if (State->getCallOrPrologue() != Call) {
- for (unsigned i = 0, e = State->getInRegsParamsCount(); i != e; ++i) {
- unsigned RB, RE;
- State->getInRegsParamInfo(i, RB, RE);
- assert(NSAAOffset >= (RE-RB)*4 &&
- "Stack offset for byval regs doesn't introduced anymore?");
- NSAAOffset -= (RE-RB)*4;
- }
- }
if ((ARM::R0 <= reg) && (reg <= ARM::R3)) {
if (Subtarget->isAAPCS_ABI() && Align > 4) {
unsigned AlignInRegs = Align / 4;
@@ -1871,6 +1785,7 @@ ARMTargetLowering::HandleByVal(
// all remained GPR regs. In that case we can't split parameter, we must
// send it to stack. We also must set NCRN to R4, so waste all
// remained registers.
+ const unsigned NSAAOffset = State->getNextStackOffset();
if (Subtarget->isAAPCS_ABI() && NSAAOffset != 0 && size > excess) {
while (State->AllocateReg(GPRArgRegs, 4))
;
@@ -1890,18 +1805,14 @@ ARMTargetLowering::HandleByVal(
// allocate remained amount of registers we need.
for (unsigned i = reg+1; i != ByValRegEnd; ++i)
State->AllocateReg(GPRArgRegs, 4);
- // At a call site, a byval parameter that is split between
- // registers and memory needs its size truncated here. In a
- // function prologue, such byval parameters are reassembled in
- // memory, and are not truncated.
- if (State->getCallOrPrologue() == Call) {
- // Make remained size equal to 0 in case, when
- // the whole structure may be stored into registers.
- if (size < excess)
- size = 0;
- else
- size -= excess;
- }
+ // A byval parameter that is split between registers and memory needs its
+ // size truncated here.
+ // In the case where the entire structure fits in registers, we set the
+ // size in memory to zero.
+ if (size < excess)
+ size = 0;
+ else
+ size -= excess;
}
}
}
@@ -2310,10 +2221,10 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
}
bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
- if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
+ if (!Subtarget->supportsTailCall())
return false;
- if (!CI->isTailCall())
+ if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
return false;
return !Subtarget->isThumb1Only();
@@ -2538,56 +2449,20 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
- // FIXME: Enable this for static codegen when tool issues are fixed. Also
- // update ARMFastISel::ARMMaterializeGV.
- if (Subtarget->useMovt() && RelocM != Reloc::Static) {
+ if (Subtarget->useMovt())
++NumMovwMovt;
- // FIXME: Once remat is capable of dealing with instructions with register
- // operands, expand this into two nodes.
- if (RelocM == Reloc::Static)
- return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
- DAG.getTargetGlobalAddress(GV, dl, PtrVT));
-
- unsigned Wrapper = (RelocM == Reloc::PIC_)
- ? ARMISD::WrapperPIC : ARMISD::WrapperDYN;
- SDValue Result = DAG.getNode(Wrapper, dl, PtrVT,
- DAG.getTargetGlobalAddress(GV, dl, PtrVT));
- if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
- Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(),
- false, false, false, 0);
- return Result;
- }
-
- unsigned ARMPCLabelIndex = 0;
- SDValue CPAddr;
- if (RelocM == Reloc::Static) {
- CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
- } else {
- ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
- ARMPCLabelIndex = AFI->createPICLabelUId();
- unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
- ARMConstantPoolValue *CPV =
- ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue,
- PCAdj);
- CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
- }
- CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 0);
- SDValue Chain = Result.getValue(1);
+ // FIXME: Once remat is capable of dealing with instructions with register
+ // operands, expand this into multiple nodes
+ unsigned Wrapper =
+ RelocM == Reloc::PIC_ ? ARMISD::WrapperPIC : ARMISD::Wrapper;
- if (RelocM == Reloc::PIC_) {
- SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
- Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
- }
+ SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
+ SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
- Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(),
- false, false, false, 0);
-
+ Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(), false, false, false, 0);
return Result;
}
@@ -2807,11 +2682,11 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
ArgRegsSize = NumGPRs * 4;
// If parameter is split between stack and GPRs...
- if (NumGPRs && Align == 8 &&
+ if (NumGPRs && Align > 4 &&
(ArgRegsSize < ArgSize ||
InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount())) {
- // Add padding for part of param recovered from GPRs, so
- // its last byte must be at address K*8 - 1.
+ // Add padding for part of param recovered from GPRs. For example,
+ // if Align == 8, its last byte must be at address K*8 - 1.
// We need to do it, since remained (stack) part of parameter has
// stack alignment, and we need to "attach" "GPRs head" without gaps
// to it:
@@ -2821,8 +2696,7 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
//
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
unsigned Padding =
- ((ArgRegsSize + AFI->getArgRegsSaveSize() + Align - 1) & ~(Align-1)) -
- (ArgRegsSize + AFI->getArgRegsSaveSize());
+ OffsetToAlignment(ArgRegsSize + AFI->getArgRegsSaveSize(), Align);
ArgRegsSaveSize = ArgRegsSize + Padding;
} else
// We don't need to extend regs save size for byval parameters if they
@@ -2846,10 +2720,12 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
unsigned OffsetFromOrigArg,
unsigned ArgOffset,
unsigned ArgSize,
- bool ForceMutable) const {
+ bool ForceMutable,
+ unsigned ByValStoreOffset,
+ unsigned TotalArgRegsSaveSize) const {
// Currently, two use-cases possible:
- // Case #1. Non var-args function, and we meet first byval parameter.
+ // Case #1. Non-var-args function, and we meet first byval parameter.
// Setup first unallocated register as first byval register;
// eat all remained registers
// (these two actions are performed by HandleByVal method).
@@ -2883,7 +2759,6 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
// Note: once stack area for byval/varargs registers
// was initialized, it can't be initialized again.
if (ArgRegsSaveSize) {
-
unsigned Padding = ArgRegsSaveSize - ArgRegsSize;
if (Padding) {
@@ -2892,11 +2767,18 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
AFI->setStoredByValParamsPadding(Padding);
}
- int FrameIndex = MFI->CreateFixedObject(
- ArgRegsSaveSize,
- Padding + ArgOffset,
- false);
+ int FrameIndex = MFI->CreateFixedObject(ArgRegsSaveSize,
+ Padding +
+ ByValStoreOffset -
+ (int64_t)TotalArgRegsSaveSize,
+ false);
SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy());
+ if (Padding) {
+ MFI->CreateFixedObject(Padding,
+ ArgOffset + ByValStoreOffset -
+ (int64_t)ArgRegsSaveSize,
+ false);
+ }
SmallVector<SDValue, 4> MemOps;
for (unsigned i = 0; firstRegToSaveIndex < lastRegToSaveIndex;
@@ -2924,10 +2806,16 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOps[0], MemOps.size());
return FrameIndex;
- } else
+ } else {
+ if (ArgSize == 0) {
+ // We cannot allocate a zero-byte object for the first variadic argument,
+ // so just make up a size.
+ ArgSize = 4;
+ }
// This will point to the next argument passed via stack.
return MFI->CreateFixedObject(
- 4, AFI->getStoredByValParamsPadding() + ArgOffset, !ForceMutable);
+ ArgSize, ArgOffset, !ForceMutable);
+ }
}
// Setup stack frame, the va_list pointer will start from.
@@ -2935,6 +2823,7 @@ void
ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
SDLoc dl, SDValue &Chain,
unsigned ArgOffset,
+ unsigned TotalArgRegsSaveSize,
bool ForceMutable) const {
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -2946,7 +2835,7 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
// argument passed via stack.
int FrameIndex =
StoreByValRegs(CCInfo, DAG, dl, Chain, 0, CCInfo.getInRegsParamsCount(),
- 0, ArgOffset, 0, ForceMutable);
+ 0, ArgOffset, 0, ForceMutable, 0, TotalArgRegsSaveSize);
AFI->setVarArgsFrameIndex(FrameIndex);
}
@@ -2983,6 +2872,51 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
// We also increase this value in case of varargs function.
AFI->setArgRegsSaveSize(0);
+ unsigned ByValStoreOffset = 0;
+ unsigned TotalArgRegsSaveSize = 0;
+ unsigned ArgRegsSaveSizeMaxAlign = 4;
+
+ // Calculate the amount of stack space that we need to allocate to store
+ // byval and variadic arguments that are passed in registers.
+ // We need to know this before we allocate the first byval or variadic
+ // argument, as they will be allocated a stack slot below the CFA (Canonical
+ // Frame Address, the stack pointer at entry to the function).
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isMemLoc()) {
+ int index = VA.getValNo();
+ if (index != lastInsIndex) {
+ ISD::ArgFlagsTy Flags = Ins[index].Flags;
+ if (Flags.isByVal()) {
+ unsigned ExtraArgRegsSize;
+ unsigned ExtraArgRegsSaveSize;
+ computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsProceed(),
+ Flags.getByValSize(),
+ ExtraArgRegsSize, ExtraArgRegsSaveSize);
+
+ TotalArgRegsSaveSize += ExtraArgRegsSaveSize;
+ if (Flags.getByValAlign() > ArgRegsSaveSizeMaxAlign)
+ ArgRegsSaveSizeMaxAlign = Flags.getByValAlign();
+ CCInfo.nextInRegsParam();
+ }
+ lastInsIndex = index;
+ }
+ }
+ }
+ CCInfo.rewindByValRegsInfo();
+ lastInsIndex = -1;
+ if (isVarArg) {
+ unsigned ExtraArgRegsSize;
+ unsigned ExtraArgRegsSaveSize;
+ computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsCount(), 0,
+ ExtraArgRegsSize, ExtraArgRegsSaveSize);
+ TotalArgRegsSaveSize += ExtraArgRegsSaveSize;
+ }
+ // If the arg regs save area contains N-byte aligned values, the
+ // bottom of it must be at least N-byte aligned.
+ TotalArgRegsSaveSize = RoundUpToAlignment(TotalArgRegsSaveSize, ArgRegsSaveSizeMaxAlign);
+ TotalArgRegsSaveSize = std::min(TotalArgRegsSaveSize, 16U);
+
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
@@ -3081,18 +3015,23 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
// a tail call.
if (Flags.isByVal()) {
unsigned CurByValIndex = CCInfo.getInRegsParamsProceed();
+
+ ByValStoreOffset = RoundUpToAlignment(ByValStoreOffset, Flags.getByValAlign());
int FrameIndex = StoreByValRegs(
CCInfo, DAG, dl, Chain, CurOrigArg,
CurByValIndex,
Ins[VA.getValNo()].PartOffset,
VA.getLocMemOffset(),
Flags.getByValSize(),
- true /*force mutable frames*/);
+ true /*force mutable frames*/,
+ ByValStoreOffset,
+ TotalArgRegsSaveSize);
+ ByValStoreOffset += Flags.getByValSize();
+ ByValStoreOffset = std::min(ByValStoreOffset, 16U);
InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy()));
CCInfo.nextInRegsParam();
} else {
- unsigned FIOffset = VA.getLocMemOffset() +
- AFI->getStoredByValParamsPadding();
+ unsigned FIOffset = VA.getLocMemOffset();
int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
FIOffset, true);
@@ -3110,7 +3049,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
// varargs
if (isVarArg)
VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
- CCInfo.getNextStackOffset());
+ CCInfo.getNextStackOffset(),
+ TotalArgRegsSaveSize);
+
+ AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
return Chain;
}
@@ -3279,7 +3221,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
static ISD::CondCode getInverseCCForVSEL(ISD::CondCode CC) {
if (CC == ISD::SETNE)
return ISD::SETEQ;
- return ISD::getSetCCSwappedOperands(CC);
+ return ISD::getSetCCInverse(CC, true);
}
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
@@ -3799,6 +3741,9 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
MachineFrameInfo *MFI = MF.getFrameInfo();
MFI->setReturnAddressIsTaken(true);
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
EVT VT = Op.getValueType();
SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -3822,7 +3767,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
+ unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetMachO())
? ARM::R7 : ARM::R11;
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
@@ -4380,7 +4325,6 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
// Value = 0x0000nnff: Op=x, Cmode=1100.
OpCmode = 0xc;
Imm = SplatBits >> 8;
- SplatBits |= 0xff;
break;
}
@@ -4389,7 +4333,6 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
// Value = 0x00nnffff: Op=x, Cmode=1101.
OpCmode = 0xd;
Imm = SplatBits >> 16;
- SplatBits |= 0xffff;
break;
}
@@ -4420,7 +4363,6 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
}
// Op=1, Cmode=1110.
OpCmode = 0x1e;
- SplatBits = Val;
VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
break;
}
@@ -6031,40 +5973,11 @@ static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
return Op;
- // Aquire/Release load/store is not legal for targets without a
+ // Acquire/Release load/store is not legal for targets without a
// dmb or equivalent available.
return SDValue();
}
-static void
-ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
- SelectionDAG &DAG) {
- SDLoc dl(Node);
- assert (Node->getValueType(0) == MVT::i64 &&
- "Only know how to expand i64 atomics");
- AtomicSDNode *AN = cast<AtomicSDNode>(Node);
-
- SmallVector<SDValue, 6> Ops;
- Ops.push_back(Node->getOperand(0)); // Chain
- Ops.push_back(Node->getOperand(1)); // Ptr
- for(unsigned i=2; i<Node->getNumOperands(); i++) {
- // Low part
- Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Node->getOperand(i), DAG.getIntPtrConstant(0)));
- // High part
- Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Node->getOperand(i), DAG.getIntPtrConstant(1)));
- }
- SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
- SDValue Result =
- DAG.getAtomic(Node->getOpcode(), dl, MVT::i64, Tys, Ops.data(), Ops.size(),
- cast<MemSDNode>(Node)->getMemOperand(), AN->getOrdering(),
- AN->getSynchScope());
- SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
- Results.push_back(Result.getValue(2));
-}
-
static void ReplaceREADCYCLECOUNTER(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG,
@@ -6109,7 +6022,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::GlobalAddress:
- return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
+ return Subtarget->isTargetMachO() ? LowerGlobalAddressDarwin(Op, DAG) :
LowerGlobalAddressELF(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
@@ -6182,22 +6095,6 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::READCYCLECOUNTER:
ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
return;
- case ISD::ATOMIC_STORE:
- case ISD::ATOMIC_LOAD:
- case ISD::ATOMIC_LOAD_ADD:
- case ISD::ATOMIC_LOAD_AND:
- case ISD::ATOMIC_LOAD_NAND:
- case ISD::ATOMIC_LOAD_OR:
- case ISD::ATOMIC_LOAD_SUB:
- case ISD::ATOMIC_LOAD_XOR:
- case ISD::ATOMIC_SWAP:
- case ISD::ATOMIC_CMP_SWAP:
- case ISD::ATOMIC_LOAD_MIN:
- case ISD::ATOMIC_LOAD_UMIN:
- case ISD::ATOMIC_LOAD_MAX:
- case ISD::ATOMIC_LOAD_UMAX:
- ReplaceATOMIC_OP_64(N, Results, DAG);
- return;
}
if (Res.getNode())
Results.push_back(Res);
@@ -6207,538 +6104,6 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
// ARM Scheduler Hooks
//===----------------------------------------------------------------------===//
-MachineBasicBlock *
-ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Size) const {
- unsigned dest = MI->getOperand(0).getReg();
- unsigned ptr = MI->getOperand(1).getReg();
- unsigned oldval = MI->getOperand(2).getReg();
- unsigned newval = MI->getOperand(3).getReg();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
- DebugLoc dl = MI->getDebugLoc();
- bool isThumb2 = Subtarget->isThumb2();
-
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- unsigned scratch = MRI.createVirtualRegister(isThumb2 ?
- (const TargetRegisterClass*)&ARM::rGPRRegClass :
- (const TargetRegisterClass*)&ARM::GPRRegClass);
-
- if (isThumb2) {
- MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
- MRI.constrainRegClass(oldval, &ARM::rGPRRegClass);
- MRI.constrainRegClass(newval, &ARM::rGPRRegClass);
- }
-
- unsigned ldrOpc, strOpc;
- getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc);
-
- MachineFunction *MF = BB->getParent();
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It; // insert the new blocks after the current block
-
- MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(It, loop1MBB);
- MF->insert(It, loop2MBB);
- MF->insert(It, exitMBB);
-
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- // thisMBB:
- // ...
- // fallthrough --> loop1MBB
- BB->addSuccessor(loop1MBB);
-
- // loop1MBB:
- // ldrex dest, [ptr]
- // cmp dest, oldval
- // bne exitMBB
- BB = loop1MBB;
- MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
- if (ldrOpc == ARM::t2LDREX)
- MIB.addImm(0);
- AddDefaultPred(MIB);
- AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
- .addReg(dest).addReg(oldval));
- BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
- .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
- BB->addSuccessor(loop2MBB);
- BB->addSuccessor(exitMBB);
-
- // loop2MBB:
- // strex scratch, newval, [ptr]
- // cmp scratch, #0
- // bne loop1MBB
- BB = loop2MBB;
- MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr);
- if (strOpc == ARM::t2STREX)
- MIB.addImm(0);
- AddDefaultPred(MIB);
- AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
- .addReg(scratch).addImm(0));
- BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
- .addMBB(loop1MBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
- BB->addSuccessor(loop1MBB);
- BB->addSuccessor(exitMBB);
-
- // exitMBB:
- // ...
- BB = exitMBB;
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
-}
-
-MachineBasicBlock *
-ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned Size, unsigned BinOpcode) const {
- // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction *MF = BB->getParent();
- MachineFunction::iterator It = BB;
- ++It;
-
- unsigned dest = MI->getOperand(0).getReg();
- unsigned ptr = MI->getOperand(1).getReg();
- unsigned incr = MI->getOperand(2).getReg();
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
- DebugLoc dl = MI->getDebugLoc();
- bool isThumb2 = Subtarget->isThumb2();
-
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- if (isThumb2) {
- MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
- MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
- MRI.constrainRegClass(incr, &ARM::rGPRRegClass);
- }
-
- unsigned ldrOpc, strOpc;
- getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc);
-
- MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(It, loopMBB);
- MF->insert(It, exitMBB);
-
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- const TargetRegisterClass *TRC = isThumb2 ?
- (const TargetRegisterClass*)&ARM::rGPRRegClass :
- (const TargetRegisterClass*)&ARM::GPRRegClass;
- unsigned scratch = MRI.createVirtualRegister(TRC);
- unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
-
- // thisMBB:
- // ...
- // fallthrough --> loopMBB
- BB->addSuccessor(loopMBB);
-
- // loopMBB:
- // ldrex dest, ptr
- // <binop> scratch2, dest, incr
- // strex scratch, scratch2, ptr
- // cmp scratch, #0
- // bne- loopMBB
- // fallthrough --> exitMBB
- BB = loopMBB;
- MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
- if (ldrOpc == ARM::t2LDREX)
- MIB.addImm(0);
- AddDefaultPred(MIB);
- if (BinOpcode) {
- // operand order needs to go the other way for NAND
- if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr)
- AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
- addReg(incr).addReg(dest)).addReg(0);
- else
- AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
- addReg(dest).addReg(incr)).addReg(0);
- }
-
- MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
- if (strOpc == ARM::t2STREX)
- MIB.addImm(0);
- AddDefaultPred(MIB);
- AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
- .addReg(scratch).addImm(0));
- BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
- .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
-
- BB->addSuccessor(loopMBB);
- BB->addSuccessor(exitMBB);
-
- // exitMBB:
- // ...
- BB = exitMBB;
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
-}
-
-MachineBasicBlock *
-ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Size,
- bool signExtend,
- ARMCC::CondCodes Cond) const {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction *MF = BB->getParent();
- MachineFunction::iterator It = BB;
- ++It;
-
- unsigned dest = MI->getOperand(0).getReg();
- unsigned ptr = MI->getOperand(1).getReg();
- unsigned incr = MI->getOperand(2).getReg();
- unsigned oldval = dest;
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
- DebugLoc dl = MI->getDebugLoc();
- bool isThumb2 = Subtarget->isThumb2();
-
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- if (isThumb2) {
- MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
- MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
- MRI.constrainRegClass(incr, &ARM::rGPRRegClass);
- }
-
- unsigned ldrOpc, strOpc, extendOpc;
- getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc);
- switch (Size) {
- default: llvm_unreachable("unsupported size for AtomicBinaryMinMax!");
- case 1:
- extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
- break;
- case 2:
- extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
- break;
- case 4:
- extendOpc = 0;
- break;
- }
-
- MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(It, loopMBB);
- MF->insert(It, exitMBB);
-
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- const TargetRegisterClass *TRC = isThumb2 ?
- (const TargetRegisterClass*)&ARM::rGPRRegClass :
- (const TargetRegisterClass*)&ARM::GPRRegClass;
- unsigned scratch = MRI.createVirtualRegister(TRC);
- unsigned scratch2 = MRI.createVirtualRegister(TRC);
-
- // thisMBB:
- // ...
- // fallthrough --> loopMBB
- BB->addSuccessor(loopMBB);
-
- // loopMBB:
- // ldrex dest, ptr
- // (sign extend dest, if required)
- // cmp dest, incr
- // cmov.cond scratch2, incr, dest
- // strex scratch, scratch2, ptr
- // cmp scratch, #0
- // bne- loopMBB
- // fallthrough --> exitMBB
- BB = loopMBB;
- MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
- if (ldrOpc == ARM::t2LDREX)
- MIB.addImm(0);
- AddDefaultPred(MIB);
-
- // Sign extend the value, if necessary.
- if (signExtend && extendOpc) {
- oldval = MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass
- : &ARM::GPRnopcRegClass);
- if (!isThumb2)
- MRI.constrainRegClass(dest, &ARM::GPRnopcRegClass);
- AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
- .addReg(dest)
- .addImm(0));
- }
-
- // Build compare and cmov instructions.
- AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
- .addReg(oldval).addReg(incr));
- BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
- .addReg(incr).addReg(oldval).addImm(Cond).addReg(ARM::CPSR);
-
- MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
- if (strOpc == ARM::t2STREX)
- MIB.addImm(0);
- AddDefaultPred(MIB);
- AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
- .addReg(scratch).addImm(0));
- BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
- .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
-
- BB->addSuccessor(loopMBB);
- BB->addSuccessor(exitMBB);
-
- // exitMBB:
- // ...
- BB = exitMBB;
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
-}
-
-MachineBasicBlock *
-ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned Op1, unsigned Op2,
- bool NeedsCarry, bool IsCmpxchg,
- bool IsMinMax, ARMCC::CondCodes CC) const {
- // This also handles ATOMIC_SWAP and ATOMIC_STORE, indicated by Op1==0.
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction *MF = BB->getParent();
- MachineFunction::iterator It = BB;
- ++It;
-
- bool isStore = (MI->getOpcode() == ARM::ATOMIC_STORE_I64);
- unsigned offset = (isStore ? -2 : 0);
- unsigned destlo = MI->getOperand(0).getReg();
- unsigned desthi = MI->getOperand(1).getReg();
- unsigned ptr = MI->getOperand(offset+2).getReg();
- unsigned vallo = MI->getOperand(offset+3).getReg();
- unsigned valhi = MI->getOperand(offset+4).getReg();
- unsigned OrdIdx = offset + (IsCmpxchg ? 7 : 5);
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(OrdIdx).getImm());
- DebugLoc dl = MI->getDebugLoc();
- bool isThumb2 = Subtarget->isThumb2();
-
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- if (isThumb2) {
- MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
- MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
- MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
- MRI.constrainRegClass(vallo, &ARM::rGPRRegClass);
- MRI.constrainRegClass(valhi, &ARM::rGPRRegClass);
- }
-
- unsigned ldrOpc, strOpc;
- getExclusiveOperation(8, Ord, isThumb2, ldrOpc, strOpc);
-
- MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *contBB = 0, *cont2BB = 0;
- if (IsCmpxchg || IsMinMax)
- contBB = MF->CreateMachineBasicBlock(LLVM_BB);
- if (IsCmpxchg)
- cont2BB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-
- MF->insert(It, loopMBB);
- if (IsCmpxchg || IsMinMax) MF->insert(It, contBB);
- if (IsCmpxchg) MF->insert(It, cont2BB);
- MF->insert(It, exitMBB);
-
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- const TargetRegisterClass *TRC = isThumb2 ?
- (const TargetRegisterClass*)&ARM::tGPRRegClass :
- (const TargetRegisterClass*)&ARM::GPRRegClass;
- unsigned storesuccess = MRI.createVirtualRegister(TRC);
-
- // thisMBB:
- // ...
- // fallthrough --> loopMBB
- BB->addSuccessor(loopMBB);
-
- // loopMBB:
- // ldrexd r2, r3, ptr
- // <binopa> r0, r2, incr
- // <binopb> r1, r3, incr
- // strexd storesuccess, r0, r1, ptr
- // cmp storesuccess, #0
- // bne- loopMBB
- // fallthrough --> exitMBB
- BB = loopMBB;
-
- if (!isStore) {
- // Load
- if (isThumb2) {
- AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
- .addReg(destlo, RegState::Define)
- .addReg(desthi, RegState::Define)
- .addReg(ptr));
- } else {
- unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
- AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
- .addReg(GPRPair0, RegState::Define).addReg(ptr));
- // Copy r2/r3 into dest. (This copy will normally be coalesced.)
- BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
- .addReg(GPRPair0, 0, ARM::gsub_0);
- BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
- .addReg(GPRPair0, 0, ARM::gsub_1);
- }
- }
-
- unsigned StoreLo, StoreHi;
- if (IsCmpxchg) {
- // Add early exit
- for (unsigned i = 0; i < 2; i++) {
- AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr :
- ARM::CMPrr))
- .addReg(i == 0 ? destlo : desthi)
- .addReg(i == 0 ? vallo : valhi));
- BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
- .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
- BB->addSuccessor(exitMBB);
- BB->addSuccessor(i == 0 ? contBB : cont2BB);
- BB = (i == 0 ? contBB : cont2BB);
- }
-
- // Copy to physregs for strexd
- StoreLo = MI->getOperand(5).getReg();
- StoreHi = MI->getOperand(6).getReg();
- } else if (Op1) {
- // Perform binary operation
- unsigned tmpRegLo = MRI.createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), tmpRegLo)
- .addReg(destlo).addReg(vallo))
- .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry));
- unsigned tmpRegHi = MRI.createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), tmpRegHi)
- .addReg(desthi).addReg(valhi))
- .addReg(IsMinMax ? ARM::CPSR : 0, getDefRegState(IsMinMax));
-
- StoreLo = tmpRegLo;
- StoreHi = tmpRegHi;
- } else {
- // Copy to physregs for strexd
- StoreLo = vallo;
- StoreHi = valhi;
- }
- if (IsMinMax) {
- // Compare and branch to exit block.
- BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
- .addMBB(exitMBB).addImm(CC).addReg(ARM::CPSR);
- BB->addSuccessor(exitMBB);
- BB->addSuccessor(contBB);
- BB = contBB;
- StoreLo = vallo;
- StoreHi = valhi;
- }
-
- // Store
- if (isThumb2) {
- MRI.constrainRegClass(StoreLo, &ARM::rGPRRegClass);
- MRI.constrainRegClass(StoreHi, &ARM::rGPRRegClass);
- AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
- .addReg(StoreLo).addReg(StoreHi).addReg(ptr));
- } else {
- // Marshal a pair...
- unsigned StorePair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
- unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
- unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
- BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair);
- BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
- .addReg(UndefPair)
- .addReg(StoreLo)
- .addImm(ARM::gsub_0);
- BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), StorePair)
- .addReg(r1)
- .addReg(StoreHi)
- .addImm(ARM::gsub_1);
-
- // ...and store it
- AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
- .addReg(StorePair).addReg(ptr));
- }
- // Cmp+jump
- AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
- .addReg(storesuccess).addImm(0));
- BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
- .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
-
- BB->addSuccessor(loopMBB);
- BB->addSuccessor(exitMBB);
-
- // exitMBB:
- // ...
- BB = exitMBB;
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
-}
-
-MachineBasicBlock *
-ARMTargetLowering::EmitAtomicLoad64(MachineInstr *MI, MachineBasicBlock *BB) const {
-
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
- unsigned destlo = MI->getOperand(0).getReg();
- unsigned desthi = MI->getOperand(1).getReg();
- unsigned ptr = MI->getOperand(2).getReg();
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
- DebugLoc dl = MI->getDebugLoc();
- bool isThumb2 = Subtarget->isThumb2();
-
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- if (isThumb2) {
- MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
- MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
- MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
- }
- unsigned ldrOpc, strOpc;
- getExclusiveOperation(8, Ord, isThumb2, ldrOpc, strOpc);
-
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(ldrOpc));
-
- if (isThumb2) {
- MIB.addReg(destlo, RegState::Define)
- .addReg(desthi, RegState::Define)
- .addReg(ptr);
-
- } else {
- unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
- MIB.addReg(GPRPair0, RegState::Define).addReg(ptr);
-
- // Copy GPRPair0 into dest. (This copy will normally be coalesced.)
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), destlo)
- .addReg(GPRPair0, 0, ARM::gsub_0);
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), desthi)
- .addReg(GPRPair0, 0, ARM::gsub_1);
- }
- AddDefaultPred(MIB);
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
-}
-
/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
/// registers the function context.
void ARMTargetLowering::
@@ -7493,8 +6858,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
// Load an immediate to varEnd.
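Note on this and the later hunks: the llvm::next helper is replaced by C++11's std::next throughout. A minimal standalone sketch of the same splice-after-MI idiom, using a plain std::list in place of a MachineBasicBlock instruction list:

#include <cassert>
#include <iterator>
#include <list>

int main() {
  std::list<int> block = {1, 2, 3, 4};
  auto MI = block.begin();        // stands in for the current instruction
  auto AfterMI = std::next(MI);   // first element after MI
  assert(*AfterMI == 2);

  // Move everything after MI into a second list, the way exitMBB->splice()
  // transfers the tail of BB in the hunk above.
  std::list<int> exitBlock;
  exitBlock.splice(exitBlock.begin(), block, AfterMI, block.end());
  assert(block.size() == 1 && exitBlock.size() == 3);
  return 0;
}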
@@ -7670,131 +7034,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MI->eraseFromParent();
return BB;
}
- case ARM::ATOMIC_LOAD_ADD_I8:
- return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
- case ARM::ATOMIC_LOAD_ADD_I16:
- return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
- case ARM::ATOMIC_LOAD_ADD_I32:
- return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
-
- case ARM::ATOMIC_LOAD_AND_I8:
- return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
- case ARM::ATOMIC_LOAD_AND_I16:
- return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
- case ARM::ATOMIC_LOAD_AND_I32:
- return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
-
- case ARM::ATOMIC_LOAD_OR_I8:
- return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
- case ARM::ATOMIC_LOAD_OR_I16:
- return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
- case ARM::ATOMIC_LOAD_OR_I32:
- return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
-
- case ARM::ATOMIC_LOAD_XOR_I8:
- return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
- case ARM::ATOMIC_LOAD_XOR_I16:
- return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
- case ARM::ATOMIC_LOAD_XOR_I32:
- return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
-
- case ARM::ATOMIC_LOAD_NAND_I8:
- return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
- case ARM::ATOMIC_LOAD_NAND_I16:
- return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
- case ARM::ATOMIC_LOAD_NAND_I32:
- return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
-
- case ARM::ATOMIC_LOAD_SUB_I8:
- return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
- case ARM::ATOMIC_LOAD_SUB_I16:
- return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
- case ARM::ATOMIC_LOAD_SUB_I32:
- return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
-
- case ARM::ATOMIC_LOAD_MIN_I8:
- return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::LT);
- case ARM::ATOMIC_LOAD_MIN_I16:
- return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::LT);
- case ARM::ATOMIC_LOAD_MIN_I32:
- return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::LT);
-
- case ARM::ATOMIC_LOAD_MAX_I8:
- return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::GT);
- case ARM::ATOMIC_LOAD_MAX_I16:
- return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::GT);
- case ARM::ATOMIC_LOAD_MAX_I32:
- return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::GT);
-
- case ARM::ATOMIC_LOAD_UMIN_I8:
- return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::LO);
- case ARM::ATOMIC_LOAD_UMIN_I16:
- return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::LO);
- case ARM::ATOMIC_LOAD_UMIN_I32:
- return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::LO);
-
- case ARM::ATOMIC_LOAD_UMAX_I8:
- return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::HI);
- case ARM::ATOMIC_LOAD_UMAX_I16:
- return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::HI);
- case ARM::ATOMIC_LOAD_UMAX_I32:
- return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::HI);
-
- case ARM::ATOMIC_SWAP_I8: return EmitAtomicBinary(MI, BB, 1, 0);
- case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0);
- case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0);
-
- case ARM::ATOMIC_CMP_SWAP_I8: return EmitAtomicCmpSwap(MI, BB, 1);
- case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
- case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
-
- case ARM::ATOMIC_LOAD_I64:
- return EmitAtomicLoad64(MI, BB);
-
- case ARM::ATOMIC_LOAD_ADD_I64:
- return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr,
- isThumb2 ? ARM::t2ADCrr : ARM::ADCrr,
- /*NeedsCarry*/ true);
- case ARM::ATOMIC_LOAD_SUB_I64:
- return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
- isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
- /*NeedsCarry*/ true);
- case ARM::ATOMIC_LOAD_OR_I64:
- return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr,
- isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
- case ARM::ATOMIC_LOAD_XOR_I64:
- return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr,
- isThumb2 ? ARM::t2EORrr : ARM::EORrr);
- case ARM::ATOMIC_LOAD_AND_I64:
- return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr,
- isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
- case ARM::ATOMIC_STORE_I64:
- case ARM::ATOMIC_SWAP_I64:
- return EmitAtomicBinary64(MI, BB, 0, 0, false);
- case ARM::ATOMIC_CMP_SWAP_I64:
- return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
- isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
- /*NeedsCarry*/ false, /*IsCmpxchg*/true);
- case ARM::ATOMIC_LOAD_MIN_I64:
- return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
- isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
- /*NeedsCarry*/ true, /*IsCmpxchg*/false,
- /*IsMinMax*/ true, ARMCC::LT);
- case ARM::ATOMIC_LOAD_MAX_I64:
- return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
- isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
- /*NeedsCarry*/ true, /*IsCmpxchg*/false,
- /*IsMinMax*/ true, ARMCC::GE);
- case ARM::ATOMIC_LOAD_UMIN_I64:
- return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
- isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
- /*NeedsCarry*/ true, /*IsCmpxchg*/false,
- /*IsMinMax*/ true, ARMCC::LO);
- case ARM::ATOMIC_LOAD_UMAX_I64:
- return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
- isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
- /*NeedsCarry*/ true, /*IsCmpxchg*/false,
- /*IsMinMax*/ true, ARMCC::HS);
case ARM::tMOVCCr_pseudo: {
// To "insert" a SELECT_CC instruction, we actually have to insert the
@@ -7820,8 +7059,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(copy0MBB);
@@ -7854,7 +7092,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case ARM::BCCi64:
case ARM::BCCZi64: {
// If there is an unconditional branch to the other successor, remove it.
- BB->erase(llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
+ BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
// Compare both parts that make up the double comparison separately for
// equality.
@@ -7939,8 +7177,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to sinkMBB.
SinkBB->splice(SinkBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
SinkBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(RSBBB);
@@ -8273,7 +7510,9 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
// Get widened type and narrowed type.
MVT widenType;
unsigned numElem = VT.getVectorNumElements();
- switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+
+ EVT inputLaneType = Vec.getValueType().getVectorElementType();
+ switch (inputLaneType.getSimpleVT().SimpleTy) {
case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
@@ -8283,7 +7522,8 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
widenType, &Ops[0], Ops.size());
- return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, tmp);
+ unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
+ return DAG.getNode(ExtOp, SDLoc(N), VT, tmp);
}
static SDValue findMUL_LOHI(SDValue V) {
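The AddCombineToVPADDL hunk above no longer truncates unconditionally; it any-extends when the requested type is wider than the vpaddl intermediate. A tiny sketch of that choice, with plain bit counts standing in for the EVTs:

#include <cassert>

enum class FixupOp { AnyExtend, Truncate };

// resultBits / tmpBits stand in for VT and tmp.getValueType() above.
FixupOp chooseFixup(unsigned resultBits, unsigned tmpBits) {
  return resultBits > tmpBits ? FixupOp::AnyExtend : FixupOp::Truncate;
}

int main() {
  assert(chooseFixup(64, 32) == FixupOp::AnyExtend); // wider result: extend
  assert(chooseFixup(16, 32) == FixupOp::Truncate);  // narrower result: truncate
  assert(chooseFixup(32, 32) == FixupOp::Truncate);  // equal widths: the truncate folds away
  return 0;
}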
@@ -9759,9 +8999,6 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
// loads from a constant pool.
case Intrinsic::arm_neon_vshifts:
case Intrinsic::arm_neon_vshiftu:
- case Intrinsic::arm_neon_vshiftls:
- case Intrinsic::arm_neon_vshiftlu:
- case Intrinsic::arm_neon_vshiftn:
case Intrinsic::arm_neon_vrshifts:
case Intrinsic::arm_neon_vrshiftu:
case Intrinsic::arm_neon_vrshiftn:
@@ -9792,12 +9029,6 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
}
return SDValue();
- case Intrinsic::arm_neon_vshiftls:
- case Intrinsic::arm_neon_vshiftlu:
- if (isVShiftLImm(N->getOperand(2), VT, true, Cnt))
- break;
- llvm_unreachable("invalid shift count for vshll intrinsic");
-
case Intrinsic::arm_neon_vrshifts:
case Intrinsic::arm_neon_vrshiftu:
if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
@@ -9815,7 +9046,6 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
break;
llvm_unreachable("invalid shift count for vqshlu intrinsic");
- case Intrinsic::arm_neon_vshiftn:
case Intrinsic::arm_neon_vrshiftn:
case Intrinsic::arm_neon_vqshiftns:
case Intrinsic::arm_neon_vqshiftnu:
@@ -9838,16 +9068,6 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
case Intrinsic::arm_neon_vshiftu:
// Opcode already set above.
break;
- case Intrinsic::arm_neon_vshiftls:
- case Intrinsic::arm_neon_vshiftlu:
- if (Cnt == VT.getVectorElementType().getSizeInBits())
- VShiftOpc = ARMISD::VSHLLi;
- else
- VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ?
- ARMISD::VSHLLs : ARMISD::VSHLLu);
- break;
- case Intrinsic::arm_neon_vshiftn:
- VShiftOpc = ARMISD::VSHRN; break;
case Intrinsic::arm_neon_vrshifts:
VShiftOpc = ARMISD::VRSHRs; break;
case Intrinsic::arm_neon_vrshiftu:
@@ -10211,7 +9431,8 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}
-bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, unsigned,
+ bool *Fast) const {
    // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
@@ -10233,7 +9454,7 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const
case MVT::v2f64: {
// For any little-endian targets with neon, we can support unaligned ld/st
// of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
- // A big-endian target may also explictly support unaligned accesses
+ // A big-endian target may also explicitly support unaligned accesses
if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
if (Fast)
*Fast = true;
@@ -10265,11 +9486,11 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
bool Fast;
if (Size >= 16 &&
(memOpAlign(SrcAlign, DstAlign, 16) ||
- (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) && Fast))) {
+ (allowsUnalignedMemoryAccesses(MVT::v2f64, 0, &Fast) && Fast))) {
return MVT::v2f64;
} else if (Size >= 8 &&
(memOpAlign(SrcAlign, DstAlign, 8) ||
- (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) && Fast))) {
+ (allowsUnalignedMemoryAccesses(MVT::f64, 0, &Fast) && Fast))) {
return MVT::f64;
}
}
@@ -10743,6 +9964,20 @@ void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
KnownOne &= KnownOneRHS;
return;
}
+ case ISD::INTRINSIC_W_CHAIN: {
+ ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+ Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+ switch (IntID) {
+ default: return;
+ case Intrinsic::arm_ldaex:
+ case Intrinsic::arm_ldrex: {
+ EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+ unsigned MemBits = VT.getScalarType().getSizeInBits();
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+ return;
+ }
+ }
+ }
}
}
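The new INTRINSIC_W_CHAIN case above records that a narrow ldrex/ldaex zero-fills the high bits of its 32-bit result. A self-contained sketch of the same mask computation, with a fixed 32-bit register width and plain integers in place of APInt (hypothetical helper, not the LLVM API):

#include <cassert>
#include <cstdint>

// High bits above MemBits are known zero after a zero-extending exclusive load;
// this mirrors APInt::getHighBitsSet(BitWidth, BitWidth - MemBits) for BitWidth == 32.
uint32_t knownZeroMask(unsigned MemBits) {
  if (MemBits >= 32)
    return 0;
  return ~uint32_t(0) << MemBits;
}

int main() {
  assert(knownZeroMask(8)  == 0xFFFFFF00u); // ldrexb: top 24 bits known zero
  assert(knownZeroMask(16) == 0xFFFF0000u); // ldrexh: top 16 bits known zero
  assert(knownZeroMask(32) == 0x00000000u); // ldrex: nothing extra known
  return 0;
}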
@@ -11191,6 +10426,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.writeMem = true;
return true;
}
+ case Intrinsic::arm_ldaex:
case Intrinsic::arm_ldrex: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -11203,6 +10439,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.writeMem = false;
return true;
}
+ case Intrinsic::arm_stlex:
case Intrinsic::arm_strex: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -11215,6 +10452,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.writeMem = true;
return true;
}
+ case Intrinsic::arm_stlexd:
case Intrinsic::arm_strexd: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i64;
@@ -11226,6 +10464,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.writeMem = true;
return true;
}
+ case Intrinsic::arm_ldaexd:
case Intrinsic::arm_ldrexd: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i64;
@@ -11243,3 +10482,15 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
+
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned Bits = Ty->getPrimitiveSizeInBits();
+ if (Bits == 0 || Bits > 32)
+ return false;
+ return true;
+}
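The new shouldConvertConstantLoadToIntImm hook says ARM would rather materialize an integer constant of at most 32 bits than load it from a constant pool. A standalone sketch of the same predicate, taking a plain bit count instead of an llvm::Type:

#include <cassert>

// Mirrors the body added above: profitable for 1..32-bit integers, never wider.
bool shouldConvertConstantLoadToIntImm(unsigned Bits) {
  return Bits != 0 && Bits <= 32;
}

int main() {
  assert(shouldConvertConstantLoadToIntImm(1));
  assert(shouldConvertConstantLoadToIntImm(32));
  assert(!shouldConvertConstantLoadToIntImm(64)); // i64 constants stay as loads
  return 0;
}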
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 90facdd..f33e6db 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -15,17 +15,15 @@
#ifndef ARMISELLOWERING_H
#define ARMISELLOWERING_H
-#include "ARM.h"
-#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <vector>
namespace llvm {
class ARMConstantPoolValue;
+ class ARMSubtarget;
namespace ARMISD {
// ARM Specific DAG Nodes
@@ -35,8 +33,6 @@ namespace llvm {
Wrapper, // Wrapper - A wrapper node for TargetConstantPool,
// TargetExternalSymbol, and TargetGlobalAddress.
- WrapperDYN, // WrapperDYN - A wrapper node for TargetGlobalAddress in
- // DYN mode.
WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in
// PIC mode.
WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable
@@ -115,10 +111,6 @@ namespace llvm {
VSHL, // ...left
VSHRs, // ...right (signed)
VSHRu, // ...right (unsigned)
- VSHLLs, // ...left long (signed)
- VSHLLu, // ...left long (unsigned)
- VSHLLi, // ...left long (with maximum shift count)
- VSHRN, // ...right narrow
// Vector rounding shift by immediate:
VRSHRs, // ...right (signed)
@@ -240,116 +232,114 @@ namespace llvm {
public:
explicit ARMTargetLowering(TargetMachine &TM);
- virtual unsigned getJumpTableEncoding() const;
+ unsigned getJumpTableEncoding() const override;
- virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
/// ReplaceNodeResults - Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
- virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
- SelectionDAG &DAG) const;
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const override;
- virtual const char *getTargetNodeName(unsigned Opcode) const;
+ const char *getTargetNodeName(unsigned Opcode) const override;
- virtual bool isSelectSupported(SelectSupportKind Kind) const {
+ bool isSelectSupported(SelectSupportKind Kind) const override {
// ARM does not support scalar condition selects on vectors.
return (Kind != ScalarCondVectorVal);
}
/// getSetCCResultType - Return the value type to use for ISD::SETCC.
- virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
+ EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
- virtual MachineBasicBlock *
+ MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *MBB) const override;
- virtual void
- AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const;
+ void AdjustInstrPostInstrSelection(MachineInstr *MI,
+ SDNode *Node) const override;
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
- virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
- bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const;
+ bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;
/// allowsUnalignedMemoryAccesses - Returns true if the target allows
/// unaligned memory accesses of the specified type. Returns whether it
/// is "fast" by reference in the second argument.
- virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+ bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+ bool *Fast) const override;
- virtual EVT getOptimalMemOpType(uint64_t Size,
- unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc,
- MachineFunction &MF) const;
+ EVT getOptimalMemOpType(uint64_t Size,
+ unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
using TargetLowering::isZExtFree;
- virtual bool isZExtFree(SDValue Val, EVT VT2) const;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
- virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const;
+ bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
- virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
+ bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can
/// compare a register against the immediate without having to materialize
/// the immediate into a register.
- virtual bool isLegalICmpImmediate(int64_t Imm) const;
+ bool isLegalICmpImmediate(int64_t Imm) const override;
/// isLegalAddImmediate - Return true if the specified immediate is legal
/// add immediate, that is the target has add instructions which can
/// add a register and the immediate without having to materialize
/// the immediate into a register.
- virtual bool isLegalAddImmediate(int64_t Imm) const;
+ bool isLegalAddImmediate(int64_t Imm) const override;
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
- virtual bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
- SDValue &Offset,
- ISD::MemIndexedMode &AM,
- SelectionDAG &DAG) const;
+ bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
/// getPostIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if this node can be
/// combined with a load / store to form a post-indexed load / store.
- virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
- SDValue &Base, SDValue &Offset,
- ISD::MemIndexedMode &AM,
- SelectionDAG &DAG) const;
+ bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+ SDValue &Offset, ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
- virtual void computeMaskedBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth) const;
+ void computeMaskedBitsForTargetNode(const SDValue Op, APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const override;
- virtual bool ExpandInlineAsm(CallInst *CI) const;
+ bool ExpandInlineAsm(CallInst *CI) const override;
- ConstraintType getConstraintType(const std::string &Constraint) const;
+ ConstraintType
+ getConstraintType(const std::string &Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
ConstraintWeight getSingleConstraintMatchWeight(
- AsmOperandInfo &info, const char *constraint) const;
+ AsmOperandInfo &info, const char *constraint) const override;
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ MVT VT) const override;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops. If hasMemory is
/// true it means one of the asm constraint of the inline asm instruction
/// being processed is 'm'.
- virtual void LowerAsmOperandForConstraint(SDValue Op,
- std::string &Constraint,
- std::vector<SDValue> &Ops,
- SelectionDAG &DAG) const;
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
const ARMSubtarget* getSubtarget() const {
return Subtarget;
@@ -357,39 +347,46 @@ namespace llvm {
/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
- virtual const TargetRegisterClass *getRegClassFor(MVT VT) const;
+ const TargetRegisterClass *getRegClassFor(MVT VT) const override;
/// getMaximalGlobalOffset - Returns the maximal possible offset which can
/// be used for loads / stores from the global.
- virtual unsigned getMaximalGlobalOffset() const;
+ unsigned getMaximalGlobalOffset() const override;
/// Returns true if a cast between SrcAS and DestAS is a noop.
- virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const {
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
// Addrspacecasts are always noops.
return true;
}
/// createFastISel - This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
- virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
- const TargetLibraryInfo *libInfo) const;
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const override;
- Sched::Preference getSchedulingPreference(SDNode *N) const;
+ Sched::Preference getSchedulingPreference(SDNode *N) const override;
- bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const;
- bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+ bool
+ isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
- virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &I,
+ unsigned Intrinsic) const override;
+
+ /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// to just the constant itself.
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
- virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info,
- const CallInst &I,
- unsigned Intrinsic) const;
protected:
std::pair<const TargetRegisterClass*, uint8_t>
- findRepresentativeClass(MVT VT) const;
+ findRepresentativeClass(MVT VT) const override;
private:
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
@@ -466,7 +463,7 @@ namespace llvm {
/// lower a pair of fmul and fadd to the latter so it's not clear that there
/// would be a gain or that the gain would be worthwhile enough to risk
/// correctness bugs.
- virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const { return false; }
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; }
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
@@ -477,12 +474,12 @@ namespace llvm {
SmallVectorImpl<SDValue> &InVals,
bool isThisReturn, SDValue ThisVal) const;
- virtual SDValue
+ SDValue
LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
+ SmallVectorImpl<SDValue> &InVals) const override;
int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
SDLoc dl, SDValue &Chain,
@@ -491,11 +488,14 @@ namespace llvm {
unsigned OffsetFromOrigArg,
unsigned ArgOffset,
unsigned ArgSize,
- bool ForceMutable) const;
+ bool ForceMutable,
+ unsigned ByValStoreOffset,
+ unsigned TotalArgRegsSaveSize) const;
void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
SDLoc dl, SDValue &Chain,
unsigned ArgOffset,
+ unsigned TotalArgRegsSaveSize,
bool ForceMutable = false) const;
void computeRegArea(CCState &CCInfo, MachineFunction &MF,
@@ -504,12 +504,12 @@ namespace llvm {
unsigned &ArgRegsSize,
unsigned &ArgRegsSaveSize) const;
- virtual SDValue
+ SDValue
LowerCall(TargetLowering::CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const;
+ SmallVectorImpl<SDValue> &InVals) const override;
/// HandleByVal - Target-specific cleanup for ByVal support.
- virtual void HandleByVal(CCState *, unsigned &, unsigned) const;
+ void HandleByVal(CCState *, unsigned &, unsigned) const override;
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
@@ -524,21 +524,21 @@ namespace llvm {
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const;
- virtual bool CanLowerReturn(CallingConv::ID CallConv,
- MachineFunction &MF, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- LLVMContext &Context) const;
+ bool CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
- virtual SDValue
+ SDValue
LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const override;
- virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
+ bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
- virtual bool mayBeEmittedAsTailCall(CallInst *CI) const;
+ bool mayBeEmittedAsTailCall(CallInst *CI) const override;
SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &ARMcc, SelectionDAG &DAG, SDLoc dl) const;
@@ -548,29 +548,6 @@ namespace llvm {
SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
- MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Size) const;
- MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Size,
- unsigned BinOpcode) const;
- MachineBasicBlock *EmitAtomicBinary64(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Op1,
- unsigned Op2,
- bool NeedsCarry = false,
- bool IsCmpxchg = false,
- bool IsMinMax = false,
- ARMCC::CondCodes CC = ARMCC::AL) const;
- MachineBasicBlock * EmitAtomicBinaryMinMax(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Size,
- bool signExtend,
- ARMCC::CondCodes Cond) const;
- MachineBasicBlock *EmitAtomicLoad64(MachineInstr *MI,
- MachineBasicBlock *BB) const;
-
void SetupEntryBlockForSjLj(MachineInstr *MI,
MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB, int FI) const;
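Most of the header churn above is mechanical: 'virtual ...;' declarations become '... override;', so the compiler rejects any declaration that no longer matches a base-class virtual. A minimal illustration with hypothetical class names (not the LLVM ones):

// Hypothetical base/derived pair showing what 'override' buys the header above.
struct TargetLoweringBase {
  virtual bool isLegalICmpImmediate(long long Imm) const { return Imm == 0; }
  virtual ~TargetLoweringBase() = default;
};

struct MyTargetLowering : TargetLoweringBase {
  // OK: signature matches the base-class virtual exactly.
  bool isLegalICmpImmediate(long long Imm) const override {
    return Imm >= -255 && Imm <= 255;
  }
  // Uncommenting this would fail to compile instead of silently introducing an
  // unrelated overload, because 'override' requires a matching base virtual:
  // bool isLegalICmpImmediate(int Imm) const override;
};

int main() {
  MyTargetLowering TL;
  const TargetLoweringBase &Base = TL;
  return Base.isLegalICmpImmediate(1000) ? 1 : 0; // the derived override is used
}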
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index f93504f..aafff98 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -212,25 +212,25 @@ def msr_mask : Operand<i32> {
// 32 imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0>
// 64 64 - <imm> is encoded in imm6<5:0>
def shr_imm8_asm_operand : ImmAsmOperand { let Name = "ShrImm8"; }
-def shr_imm8 : Operand<i32> {
+def shr_imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 8; }]> {
let EncoderMethod = "getShiftRight8Imm";
let DecoderMethod = "DecodeShiftRight8Imm";
let ParserMatchClass = shr_imm8_asm_operand;
}
def shr_imm16_asm_operand : ImmAsmOperand { let Name = "ShrImm16"; }
-def shr_imm16 : Operand<i32> {
+def shr_imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 16; }]> {
let EncoderMethod = "getShiftRight16Imm";
let DecoderMethod = "DecodeShiftRight16Imm";
let ParserMatchClass = shr_imm16_asm_operand;
}
def shr_imm32_asm_operand : ImmAsmOperand { let Name = "ShrImm32"; }
-def shr_imm32 : Operand<i32> {
+def shr_imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> {
let EncoderMethod = "getShiftRight32Imm";
let DecoderMethod = "DecodeShiftRight32Imm";
let ParserMatchClass = shr_imm32_asm_operand;
}
def shr_imm64_asm_operand : ImmAsmOperand { let Name = "ShrImm64"; }
-def shr_imm64 : Operand<i32> {
+def shr_imm64 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 64; }]> {
let EncoderMethod = "getShiftRight64Imm";
let DecoderMethod = "DecodeShiftRight64Imm";
let ParserMatchClass = shr_imm64_asm_operand;
@@ -329,10 +329,10 @@ class InstThumb<AddrMode am, int sz, IndexMode im,
// Pseudo-instructions for alternate assembly syntax (never used by codegen).
// These are aliases that require C++ handling to convert to the target
// instruction, while InstAliases can be handled directly by tblgen.
-class AsmPseudoInst<string asm, dag iops>
+class AsmPseudoInst<string asm, dag iops, dag oops = (outs)>
: InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
"", NoItinerary> {
- let OutOperandList = (outs);
+ let OutOperandList = oops;
let InOperandList = iops;
let Pattern = [];
let isCodeGenOnly = 0; // So we get asm matcher for it.
@@ -340,16 +340,16 @@ class AsmPseudoInst<string asm, dag iops>
let isPseudo = 1;
}
-class ARMAsmPseudo<string asm, dag iops> : AsmPseudoInst<asm, iops>,
- Requires<[IsARM]>;
-class tAsmPseudo<string asm, dag iops> : AsmPseudoInst<asm, iops>,
- Requires<[IsThumb]>;
-class t2AsmPseudo<string asm, dag iops> : AsmPseudoInst<asm, iops>,
- Requires<[IsThumb2]>;
-class VFP2AsmPseudo<string asm, dag iops> : AsmPseudoInst<asm, iops>,
- Requires<[HasVFP2]>;
-class NEONAsmPseudo<string asm, dag iops> : AsmPseudoInst<asm, iops>,
- Requires<[HasNEON]>;
+class ARMAsmPseudo<string asm, dag iops, dag oops = (outs)>
+ : AsmPseudoInst<asm, iops, oops>, Requires<[IsARM]>;
+class tAsmPseudo<string asm, dag iops, dag oops = (outs)>
+ : AsmPseudoInst<asm, iops, oops>, Requires<[IsThumb]>;
+class t2AsmPseudo<string asm, dag iops, dag oops = (outs)>
+ : AsmPseudoInst<asm, iops, oops>, Requires<[IsThumb2]>;
+class VFP2AsmPseudo<string asm, dag iops, dag oops = (outs)>
+ : AsmPseudoInst<asm, iops, oops>, Requires<[HasVFP2]>;
+class NEONAsmPseudo<string asm, dag iops, dag oops = (outs)>
+ : AsmPseudoInst<asm, iops, oops>, Requires<[HasNEON]>;
// Pseudo instructions for the code generator.
class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern>
@@ -477,6 +477,10 @@ class AXI<dag oops, dag iops, Format f, InstrItinClass itin,
string asm, list<dag> pattern>
: XI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
asm, "", pattern>;
+class AXIM<dag oops, dag iops, AddrMode am, Format f, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : XI<oops, iops, am, 4, IndexModeNone, f, itin,
+ asm, "", pattern>;
class AInoP<dag oops, dag iops, Format f, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: InoP<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
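The shr_imm* operands above gain ImmLeaf predicates so instruction selection only accepts shift-right immediates in the architecturally valid range 1..N for an N-bit element. A plain C++ sketch of that range check (the predicate body in the TableGen is the same expression):

#include <cassert>

// Mirrors the ImmLeaf predicates added to shr_imm8/16/32/64 above.
bool isValidShrImm(int Imm, int ElementBits) {
  return Imm > 0 && Imm <= ElementBits;
}

int main() {
  assert(isValidShrImm(8, 8));    // shifting 8-bit lanes right by 8 is legal
  assert(!isValidShrImm(0, 8));   // a shift of 0 is not a shr immediate
  assert(!isValidShrImm(33, 32)); // out of range for 32-bit lanes
  return 0;
}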
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index df867b4..f235ac2 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -97,7 +97,7 @@ namespace {
static char ID;
ARMCGBR() : MachineFunctionPass(ID) {}
- virtual bool runOnMachineFunction(MachineFunction &MF) {
+ bool runOnMachineFunction(MachineFunction &MF) override {
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
if (AFI->getGlobalBaseReg() == 0)
return false;
@@ -146,11 +146,11 @@ namespace {
return true;
}
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "ARM PIC Global Base Reg Initialization";
}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
index 5d3e059..b09958a 100644
--- a/lib/Target/ARM/ARMInstrInfo.h
+++ b/lib/Target/ARM/ARMInstrInfo.h
@@ -14,10 +14,8 @@
#ifndef ARMINSTRUCTIONINFO_H
#define ARMINSTRUCTIONINFO_H
-#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMRegisterInfo.h"
-#include "ARMSubtarget.h"
namespace llvm {
class ARMSubtarget;
@@ -28,17 +26,17 @@ public:
explicit ARMInstrInfo(const ARMSubtarget &STI);
/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
- void getNoopForMachoTarget(MCInst &NopInst) const;
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
// Return the non-pre/post incrementing version of 'Opc'. Return 0
  // if there is no such opcode.
- unsigned getUnindexedOpcode(unsigned Opc) const;
+ unsigned getUnindexedOpcode(unsigned Opc) const override;
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
/// always be able to get register info as well (through this method).
///
- const ARMRegisterInfo &getRegisterInfo() const { return RI; }
+ const ARMRegisterInfo &getRegisterInfo() const override { return RI; }
};
}
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 2042c04..75a109e 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -95,7 +95,6 @@ def ARMSmlal : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>;
// Node definitions.
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
-def ARMWrapperDYN : SDNode<"ARMISD::WrapperDYN", SDTIntUnaryOp>;
def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>;
def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>;
@@ -187,7 +186,8 @@ def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>;
def HasV4T : Predicate<"Subtarget->hasV4TOps()">,
AssemblerPredicate<"HasV4TOps", "armv4t">;
def NoV4T : Predicate<"!Subtarget->hasV4TOps()">;
-def HasV5T : Predicate<"Subtarget->hasV5TOps()">;
+def HasV5T : Predicate<"Subtarget->hasV5TOps()">,
+ AssemblerPredicate<"HasV5TOps", "armv5t">;
def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">,
AssemblerPredicate<"HasV5TEOps", "armv5te">;
def HasV6 : Predicate<"Subtarget->hasV6Ops()">,
@@ -244,6 +244,7 @@ def HasMP : Predicate<"Subtarget->hasMPExtension()">,
def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">,
AssemblerPredicate<"FeatureTrustZone",
"TrustZone">;
+def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
def IsThumb : Predicate<"Subtarget->isThumb()">,
@@ -261,6 +262,8 @@ def IsARM : Predicate<"!Subtarget->isThumb()">,
AssemblerPredicate<"!ModeThumb", "arm-mode">;
def IsIOS : Predicate<"Subtarget->isTargetIOS()">;
def IsNotIOS : Predicate<"!Subtarget->isTargetIOS()">;
+def IsMachO : Predicate<"Subtarget->isTargetMachO()">;
+def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
@@ -276,7 +279,8 @@ def UseMulOps : Predicate<"Subtarget->useMulOps()">;
// But only select them if more precision in FP computation is allowed.
// Do not use them for Darwin platforms.
def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
- " FPOpFusion::Fast) && "
+ " FPOpFusion::Fast && "
+ " Subtarget->hasVFP4()) && "
"!Subtarget->isTargetDarwin()">;
def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion =="
" FPOpFusion::Fast &&"
@@ -590,7 +594,7 @@ def so_imm2part : PatLeaf<(imm), [{
/// arm_i32imm - True for +V6T2, or true only if so_imm2part is true.
///
def arm_i32imm : PatLeaf<(imm), [{
- if (Subtarget->hasV6T2Ops())
+ if (Subtarget->useMovt())
return true;
return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
}]>;
@@ -1725,6 +1729,8 @@ def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
let Inst{31-28} = 0xe; // AL
let Inst{7-4} = 0b0111;
}
+// default immediate for breakpoint mnemonic
+def : InstAlias<"bkpt", (BKPT 0)>, Requires<[IsARM]>;
def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
"hlt", "\t$val", []>, Requires<[IsARM, HasV8]> {
@@ -1770,8 +1776,8 @@ let imod = 0, iflags = 0, M = 1 in
// Preload signals the memory system of possible future data/instruction access.
multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
- def i12 : AXI<(outs), (ins addrmode_imm12:$addr), MiscFrm, IIC_Preload,
- !strconcat(opc, "\t$addr"),
+ def i12 : AXIM<(outs), (ins addrmode_imm12:$addr), AddrMode_i12, MiscFrm,
+ IIC_Preload, !strconcat(opc, "\t$addr"),
[(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]>,
Sched<[WritePreLd]> {
bits<4> Rt;
@@ -2272,11 +2278,16 @@ def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
[(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>;
let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
-// Load doubleword
-def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2),
- (ins addrmode3:$addr), LdMiscFrm,
- IIC_iLoad_d_r, "ldrd", "\t$Rd, $dst2, $addr",
- []>, Requires<[IsARM, HasV5TE]>;
+ // Load doubleword
+ def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode3:$addr),
+ LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $Rt2, $addr", []>,
+ Requires<[IsARM, HasV5TE]>;
+
+ // GNU Assembler extension (compatibility)
+ let isAsmParserOnly = 1 in
+ def LDRD_PAIR : AI3ld<0b1101, 0, (outs GPRPairOp:$Rt), (ins addrmode3:$addr),
+ LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $addr", []>,
+ Requires<[IsARM, HasV5TE]>;
}
def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
@@ -2441,11 +2452,11 @@ def LDRT_POST_REG : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
-def LDRT_POST_IMM : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
- (ins addr_offset_none:$addr, am2offset_imm:$offset),
- IndexModePost, LdFrm, IIC_iLoad_ru,
- "ldrt", "\t$Rt, $addr, $offset",
- "$addr.base = $Rn_wb", []> {
+def LDRT_POST_IMM
+ : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am2offset_imm:$offset),
+ IndexModePost, LdFrm, IIC_iLoad_ru,
+ "ldrt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
@@ -2477,11 +2488,11 @@ def LDRBT_POST_REG : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
-def LDRBT_POST_IMM : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
- (ins addr_offset_none:$addr, am2offset_imm:$offset),
- IndexModePost, LdFrm, IIC_iLoad_bh_ru,
- "ldrbt", "\t$Rt, $addr, $offset",
- "$addr.base = $Rn_wb", []> {
+def LDRBT_POST_IMM
+ : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am2offset_imm:$offset),
+ IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+ "ldrbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
@@ -2524,6 +2535,14 @@ defm LDRHT : AI3ldrT<0b1011, "ldrht">;
defm LDRSHT : AI3ldrT<0b1111, "ldrsht">;
}
+def LDRT_POST
+ : ARMAsmPseudo<"ldrt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q),
+ (outs GPR:$Rt)>;
+
+def LDRBT_POST
+ : ARMAsmPseudo<"ldrbt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q),
+ (outs GPR:$Rt)>;
+
// Store
// Stores with truncate
@@ -2532,12 +2551,20 @@ def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm,
[(truncstorei16 GPR:$Rt, addrmode3:$addr)]>;
// Store doubleword
-let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
-def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$src2, addrmode3:$addr),
- StMiscFrm, IIC_iStore_d_r,
- "strd", "\t$Rt, $src2, $addr", []>,
- Requires<[IsARM, HasV5TE]> {
- let Inst{21} = 0;
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+ def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$Rt2, addrmode3:$addr),
+ StMiscFrm, IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", []>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{21} = 0;
+ }
+
+ // GNU Assembler extension (compatibility)
+ let isAsmParserOnly = 1 in
+ def STRD_PAIR : AI3str<0b1111, (outs), (ins GPRPairOp:$Rt, addrmode3:$addr),
+ StMiscFrm, IIC_iStore_d_r, "strd", "\t$Rt, $addr", []>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{21} = 0;
+ }
}
// Indexed stores
@@ -2746,11 +2773,11 @@ def STRBT_POST_REG : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
-def STRBT_POST_IMM : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
- (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
- IndexModePost, StFrm, IIC_iStore_bh_ru,
- "strbt", "\t$Rt, $addr, $offset",
- "$addr.base = $Rn_wb", []> {
+def STRBT_POST_IMM
+ : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+ IndexModePost, StFrm, IIC_iStore_bh_ru,
+ "strbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
@@ -2763,6 +2790,10 @@ def STRBT_POST_IMM : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
+def STRBT_POST
+ : ARMAsmPseudo<"strbt${q} $Rt, $addr",
+ (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;
+
let mayStore = 1, neverHasSideEffects = 1 in {
def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
(ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
@@ -2783,11 +2814,11 @@ def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
-def STRT_POST_IMM : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
- (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
- IndexModePost, StFrm, IIC_iStore_ru,
- "strt", "\t$Rt, $addr, $offset",
- "$addr.base = $Rn_wb", []> {
+def STRT_POST_IMM
+ : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+ IndexModePost, StFrm, IIC_iStore_ru,
+ "strt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
@@ -2801,6 +2832,9 @@ def STRT_POST_IMM : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
}
}
+def STRT_POST
+ : ARMAsmPseudo<"strt${q} $Rt, $addr",
+ (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;
multiclass AI3strT<bits<4> op, string opc> {
def i : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
@@ -3600,21 +3634,22 @@ def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm,
Requires<[IsARM, NoV6, UseMulOps]>;
}
-def MLA : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+def MLA : AsMul1I32<0b0000001, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra),
IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
- [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
- Requires<[IsARM, HasV6, UseMulOps]> {
+ [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))]>,
+ Requires<[IsARM, HasV6, UseMulOps]> {
bits<4> Ra;
let Inst{15-12} = Ra;
}
let Constraints = "@earlyclobber $Rd" in
-def MLAv5: ARMPseudoExpand<(outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s),
- 4, IIC_iMAC32,
- [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))],
- (MLA GPR:$Rd, GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+def MLAv5: ARMPseudoExpand<(outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
+ pred:$p, cc_out:$s), 4, IIC_iMAC32,
+ [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))],
+ (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra, pred:$p, cc_out:$s)>,
+ Requires<[IsARM, NoV6]>;
def MLS : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
@@ -3682,7 +3717,8 @@ def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
let Inst{3-0} = Rn;
}
-let Constraints = "$RLo = $RdLo,$RHi = $RdHi" in {
+let Constraints =
+ "@earlyclobber $RdLo,@earlyclobber $RdHi,$RLo = $RdLo,$RHi = $RdHi" in {
def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
4, IIC_iMAC64, [],
@@ -3697,14 +3733,6 @@ def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
Requires<[IsARM, NoV6]>;
}
-let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
-def UMAALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm, pred:$p),
- 4, IIC_iMAC64, [],
- (UMAAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p)>,
- Requires<[IsARM, NoV6]>;
-}
-
} // neverHasSideEffects
// Most significant word multiply
@@ -4308,214 +4336,6 @@ let usesCustomInserter = 1, Defs = [CPSR] in {
// Pseudo instruction that combines movs + predicated rsbmi
// to implement integer ABS
def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
-
-// Atomic pseudo-insts which will be lowered to ldrex/strex loops.
-// (64-bit pseudos use a hand-written selection code).
- let mayLoad = 1, mayStore = 1 in {
- def ATOMIC_LOAD_ADD_I8 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_SUB_I8 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_AND_I8 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_OR_I8 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_XOR_I8 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_NAND_I8 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_MIN_I8 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_MAX_I8 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_UMIN_I8 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_UMAX_I8 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_SWAP_I8 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$new, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_CMP_SWAP_I8 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_ADD_I16 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_SUB_I16 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_AND_I16 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_OR_I16 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_XOR_I16 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_NAND_I16 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_MIN_I16 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_MAX_I16 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_UMIN_I16 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_UMAX_I16 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_SWAP_I16 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$new, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_CMP_SWAP_I16 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_ADD_I32 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_SUB_I32 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_AND_I32 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_OR_I32 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_XOR_I32 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_NAND_I32 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_MIN_I32 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_MAX_I32 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_UMIN_I32 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_UMAX_I32 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_SWAP_I32 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$new, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_CMP_SWAP_I32 : PseudoInst<
- (outs GPR:$dst),
- (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_ADD_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_SUB_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_AND_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_OR_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_XOR_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_NAND_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_MIN_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_MAX_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_UMIN_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_LOAD_UMAX_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_SWAP_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
- NoItinerary, []>;
- def ATOMIC_CMP_SWAP_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2,
- GPR:$set1, GPR:$set2, i32imm:$ordering),
- NoItinerary, []>;
- }
- let mayLoad = 1 in
- def ATOMIC_LOAD_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, i32imm:$ordering),
- NoItinerary, []>;
- let mayStore = 1 in
- def ATOMIC_STORE_I64 : PseudoInst<
- (outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
- NoItinerary, []>;
}
let usesCustomInserter = 1 in {
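The ATOMIC_* pseudo-instructions deleted above were what the removed custom inserters expanded into ldrex/strex loops; with them gone, that lowering presumably happens before instruction selection (note the new ldaex/stlex support elsewhere in this patch). For reference, the retry loop they modelled is the ordinary compare-exchange idiom, sketched here with std::atomic rather than ARM intrinsics:

#include <atomic>
#include <cassert>

// The load/modify/store-exclusive retry loop the removed pseudos stood for,
// written with a weak compare-exchange (which maps onto ldrex/strex on ARM).
int atomicAdd(std::atomic<int> &Addr, int Incr) {
  int Old = Addr.load(std::memory_order_relaxed);
  while (!Addr.compare_exchange_weak(Old, Old + Incr,
                                     std::memory_order_seq_cst,
                                     std::memory_order_relaxed)) {
    // On failure compare_exchange_weak reloads Old; just retry.
  }
  return Old; // the old value, like ATOMIC_LOAD_ADD's $dst
}

int main() {
  std::atomic<int> Val{40};
  assert(atomicAdd(Val, 2) == 40);
  assert(Val.load() == 42);
  return 0;
}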
@@ -4552,6 +4372,33 @@ def strex_4 : PatFrag<(ops node:$val, node:$ptr),
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
+def ldaex_1 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldaex_2 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldaex_4 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stlex_1 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm_stlex node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stlex_2 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm_stlex node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stlex_4 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm_stlex node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
let mayLoad = 1 in {
def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "ldrexb", "\t$Rt, $addr",
@@ -4569,11 +4416,14 @@ def LDREXD : AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
}
def LDAEXB : AIldaex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
- NoItinerary, "ldaexb", "\t$Rt, $addr", []>;
+ NoItinerary, "ldaexb", "\t$Rt, $addr",
+ [(set GPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>;
def LDAEXH : AIldaex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
- NoItinerary, "ldaexh", "\t$Rt, $addr", []>;
+ NoItinerary, "ldaexh", "\t$Rt, $addr",
+ [(set GPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>;
def LDAEX : AIldaex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
- NoItinerary, "ldaex", "\t$Rt, $addr", []>;
+ NoItinerary, "ldaex", "\t$Rt, $addr",
+ [(set GPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>;
let hasExtraDefRegAllocReq = 1 in
def LDAEXD : AIldaex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
NoItinerary, "ldaexd", "\t$Rt, $addr", []> {
@@ -4584,13 +4434,16 @@ def LDAEXD : AIldaex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "strexb", "\t$Rd, $Rt, $addr",
- [(set GPR:$Rd, (strex_1 GPR:$Rt, addr_offset_none:$addr))]>;
+ [(set GPR:$Rd, (strex_1 GPR:$Rt,
+ addr_offset_none:$addr))]>;
def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "strexh", "\t$Rd, $Rt, $addr",
- [(set GPR:$Rd, (strex_2 GPR:$Rt, addr_offset_none:$addr))]>;
+ [(set GPR:$Rd, (strex_2 GPR:$Rt,
+ addr_offset_none:$addr))]>;
def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "strex", "\t$Rd, $Rt, $addr",
- [(set GPR:$Rd, (strex_4 GPR:$Rt, addr_offset_none:$addr))]>;
+ [(set GPR:$Rd, (strex_4 GPR:$Rt,
+ addr_offset_none:$addr))]>;
let hasExtraSrcRegAllocReq = 1 in
def STREXD : AIstrex<0b01, (outs GPR:$Rd),
(ins GPRPairOp:$Rt, addr_offset_none:$addr),
@@ -4599,13 +4452,16 @@ def STREXD : AIstrex<0b01, (outs GPR:$Rd),
}
def STLEXB: AIstlex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "stlexb", "\t$Rd, $Rt, $addr",
- []>;
+ [(set GPR:$Rd,
+ (stlex_1 GPR:$Rt, addr_offset_none:$addr))]>;
def STLEXH: AIstlex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "stlexh", "\t$Rd, $Rt, $addr",
- []>;
+ [(set GPR:$Rd,
+ (stlex_2 GPR:$Rt, addr_offset_none:$addr))]>;
def STLEX : AIstlex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "stlex", "\t$Rd, $Rt, $addr",
- []>;
+ [(set GPR:$Rd,
+ (stlex_4 GPR:$Rt, addr_offset_none:$addr))]>;
let hasExtraSrcRegAllocReq = 1 in
def STLEXD : AIstlex<0b01, (outs GPR:$Rd),
(ins GPRPairOp:$Rt, addr_offset_none:$addr),
@@ -4620,15 +4476,16 @@ def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
let Inst{31-0} = 0b11110101011111111111000000011111;
}
-def : ARMPat<(and (ldrex_1 addr_offset_none:$addr), 0xff),
- (LDREXB addr_offset_none:$addr)>;
-def : ARMPat<(and (ldrex_2 addr_offset_none:$addr), 0xffff),
- (LDREXH addr_offset_none:$addr)>;
def : ARMPat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
(STREXB GPR:$Rt, addr_offset_none:$addr)>;
def : ARMPat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
(STREXH GPR:$Rt, addr_offset_none:$addr)>;
+def : ARMPat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+ (STLEXB GPR:$Rt, addr_offset_none:$addr)>;
+def : ARMPat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+ (STLEXH GPR:$Rt, addr_offset_none:$addr)>;
+
class acquiring_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
@@ -5183,6 +5040,10 @@ def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
[(set GPR:$dst, (arm_i32imm:$src))]>,
Requires<[IsARM]>;
+def LDRLIT_ga_abs : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iLoad_i,
+ [(set GPR:$dst, (ARMWrapper tglobaladdr:$src))]>,
+ Requires<[IsARM, DontUseMovt]>;
+
// Pseudo instruction that combines movw + movt + add pc (if PIC).
// It also makes it possible to rematerialize the instructions.
// FIXME: Remove this when we can do generalized remat and when machine licm
@@ -5193,10 +5054,17 @@ def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
[(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
Requires<[IsARM, UseMovt]>;
-def MOV_ga_dyn : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
- IIC_iMOVix2,
- [(set GPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>,
- Requires<[IsARM, UseMovt]>;
+def LDRLIT_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+ IIC_iLoadiALU,
+ [(set GPR:$dst,
+ (ARMWrapperPIC tglobaladdr:$addr))]>,
+ Requires<[IsARM, DontUseMovt]>;
+
+def LDRLIT_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+ NoItinerary,
+ [(set GPR:$dst,
+ (load (ARMWrapperPIC tglobaladdr:$addr)))]>,
+ Requires<[IsARM, DontUseMovt]>;
let AddedComplexity = 10 in
def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
@@ -5206,8 +5074,6 @@ def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
} // isReMaterializable
// ConstantPool, GlobalAddress, and JumpTable
-def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>,
- Requires<[IsARM, DontUseMovt]>;
def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>;
def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>,
Requires<[IsARM, UseMovt]>;
@@ -5543,9 +5409,22 @@ def : ARMInstAlias<"neg${s}${p} $Rd, $Rm",
def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg)>,
Requires<[IsARM, NoV6]>;
-// UMULL/SMULL are available on all arches, but the instruction definitions
-// need difference constraints pre-v6. Use these aliases for the assembly
-// parsing on pre-v6.
+// MUL/UMLAL/SMLAL/UMULL/SMULL are available on all arches, but
+// the instruction definitions need different constraints pre-v6.
+// Use these aliases for the assembly parsing on pre-v6.
+def : InstAlias<"mul${s}${p} $Rd, $Rn, $Rm",
+ (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
+ Requires<[IsARM, NoV6]>;
+def : InstAlias<"mla${s}${p} $Rd, $Rn, $Rm, $Ra",
+ (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
+ pred:$p, cc_out:$s)>,
+ Requires<[IsARM, NoV6]>;
+def : InstAlias<"smlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+ (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ Requires<[IsARM, NoV6]>;
+def : InstAlias<"umlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+ (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ Requires<[IsARM, NoV6]>;
def : InstAlias<"smull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
(SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
Requires<[IsARM, NoV6]>;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 43bd4c2..0d46c49 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -466,9 +466,6 @@ def SDTARMVSHINS : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
def NEONvshl : SDNode<"ARMISD::VSHL", SDTARMVSH>;
def NEONvshrs : SDNode<"ARMISD::VSHRs", SDTARMVSH>;
def NEONvshru : SDNode<"ARMISD::VSHRu", SDTARMVSH>;
-def NEONvshlls : SDNode<"ARMISD::VSHLLs", SDTARMVSHX>;
-def NEONvshllu : SDNode<"ARMISD::VSHLLu", SDTARMVSHX>;
-def NEONvshlli : SDNode<"ARMISD::VSHLLi", SDTARMVSHX>;
def NEONvshrn : SDNode<"ARMISD::VSHRN", SDTARMVSHX>;
def NEONvrshrs : SDNode<"ARMISD::VRSHRs", SDTARMVSH>;
@@ -730,6 +727,8 @@ defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32">;
defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64">;
def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>;
+def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>;
+def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>;
// ...with 4 registers
class VLD1D4<bits<4> op7_4, string Dt>
@@ -769,6 +768,8 @@ defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32">;
defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64">;
def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>;
+def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>;
+def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>;
// VLD2 : Vector Load (multiple 2-element structures)
class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
@@ -1671,7 +1672,7 @@ defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32">;
defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64">;
def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>;
-def VST1d64TPseudoWB_fixed : VSTQQWBPseudo<IIC_VST1x3u>;
+def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>;
def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>;
// ...with 4 registers
@@ -1714,7 +1715,7 @@ defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32">;
defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64">;
def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>;
-def VST1d64QPseudoWB_fixed : VSTQQWBPseudo<IIC_VST1x4u>;
+def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>;
def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>;
// VST2 : Vector Store (multiple 2-element structures)
@@ -3034,22 +3035,23 @@ class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
// Long shift by immediate.
class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Operand ImmTy, SDNode OpNode>
+ ValueType ResTy, ValueType OpTy, Operand ImmTy,
+ SDPatternOperator OpNode>
: N2VImm<op24, op23, op11_8, op7, op6, op4,
(outs QPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), N2RegVShLFrm,
IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
- [(set QPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm),
- (i32 imm:$SIMM))))]>;
+ [(set QPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm), ImmTy:$SIMM)))]>;
// Narrow shift by immediate.
class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, Operand ImmTy, SDNode OpNode>
+ ValueType ResTy, ValueType OpTy, Operand ImmTy,
+ SDPatternOperator OpNode>
: N2VImm<op24, op23, op11_8, op7, op6, op4,
(outs DPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, itin,
OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
[(set DPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm),
- (i32 imm:$SIMM))))]>;
+ (i32 ImmTy:$SIMM))))]>;
// Shift right by immediate and accumulate,
// both double- and quad-register.
@@ -3937,7 +3939,8 @@ multiclass N2VShInsR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
// Neon Shift Long operations,
// element sizes of 8, 16, 32 bits:
multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
- bit op4, string OpcodeStr, string Dt, SDNode OpNode> {
+ bit op4, string OpcodeStr, string Dt,
+ SDPatternOperator OpNode> {
def v8i16 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, imm1_7, OpNode> {
let Inst{21-19} = 0b001; // imm6 = 001xxx
@@ -3956,7 +3959,7 @@ multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
// element sizes of 16, 32, 64 bits:
multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
bit op4, InstrItinClass itin, string OpcodeStr, string Dt,
- SDNode OpNode> {
+ SDPatternOperator OpNode> {
def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
OpcodeStr, !strconcat(Dt, "16"),
v8i8, v8i16, shr_imm8, OpNode> {
@@ -4423,14 +4426,14 @@ defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s",
// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge",
- "f32", v2i32, v2f32, int_arm_neon_vacged, 0>;
+ "f32", v2i32, v2f32, int_arm_neon_vacge, 0>;
def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge",
- "f32", v4i32, v4f32, int_arm_neon_vacgeq, 0>;
+ "f32", v4i32, v4f32, int_arm_neon_vacge, 0>;
// VACGT : Vector Absolute Compare Greater Than (aka VCAGT)
def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
- "f32", v2i32, v2f32, int_arm_neon_vacgtd, 0>;
+ "f32", v2i32, v2f32, int_arm_neon_vacgt, 0>;
def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
- "f32", v4i32, v4f32, int_arm_neon_vacgtq, 0>;
+ "f32", v4i32, v4f32, int_arm_neon_vacgt, 0>;
// VTST : Vector Test Bits
defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
IIC_VBINi4Q, "vtst", "", NEONvtst, 1>;
@@ -4942,28 +4945,51 @@ defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", "VSHRu",
NEONvshru>;
// VSHLL : Vector Shift Left Long
-defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", NEONvshlls>;
-defm VSHLLu : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u", NEONvshllu>;
+defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s",
+ PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (sext node:$LHS), node:$RHS)>>;
+defm VSHLLu : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u",
+ PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (zext node:$LHS), node:$RHS)>>;
// VSHLL : Vector Shift Left Long (with maximum shift count)
class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy,
- ValueType OpTy, Operand ImmTy, SDNode OpNode>
+ ValueType OpTy, Operand ImmTy>
: N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, Dt,
- ResTy, OpTy, ImmTy, OpNode> {
+ ResTy, OpTy, ImmTy, null_frag> {
let Inst{21-16} = op21_16;
let DecoderMethod = "DecodeVSHLMaxInstruction";
}
def VSHLLi8 : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll", "i8",
- v8i16, v8i8, imm8, NEONvshlli>;
+ v8i16, v8i8, imm8>;
def VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll", "i16",
- v4i32, v4i16, imm16, NEONvshlli>;
+ v4i32, v4i16, imm16>;
def VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32",
- v2i64, v2i32, imm32, NEONvshlli>;
+ v2i64, v2i32, imm32>;
+
+def : Pat<(v8i16 (NEONvshl (zext (v8i8 DPR:$Rn)), (i32 8))),
+ (VSHLLi8 DPR:$Rn, 8)>;
+def : Pat<(v4i32 (NEONvshl (zext (v4i16 DPR:$Rn)), (i32 16))),
+ (VSHLLi16 DPR:$Rn, 16)>;
+def : Pat<(v2i64 (NEONvshl (zext (v2i32 DPR:$Rn)), (i32 32))),
+ (VSHLLi32 DPR:$Rn, 32)>;
+def : Pat<(v8i16 (NEONvshl (sext (v8i8 DPR:$Rn)), (i32 8))),
+ (VSHLLi8 DPR:$Rn, 8)>;
+def : Pat<(v4i32 (NEONvshl (sext (v4i16 DPR:$Rn)), (i32 16))),
+ (VSHLLi16 DPR:$Rn, 16)>;
+def : Pat<(v2i64 (NEONvshl (sext (v2i32 DPR:$Rn)), (i32 32))),
+ (VSHLLi32 DPR:$Rn, 32)>;
// VSHRN : Vector Shift Right and Narrow
defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i",
- NEONvshrn>;
+ PatFrag<(ops node:$Rn, node:$amt),
+ (trunc (NEONvshrs node:$Rn, node:$amt))>>;
+
+def : Pat<(v8i8 (trunc (NEONvshru (v8i16 QPR:$Vn), shr_imm8:$amt))),
+ (VSHRNv8i8 QPR:$Vn, shr_imm8:$amt)>;
+def : Pat<(v4i16 (trunc (NEONvshru (v4i32 QPR:$Vn), shr_imm16:$amt))),
+ (VSHRNv4i16 QPR:$Vn, shr_imm16:$amt)>;
+def : Pat<(v2i32 (trunc (NEONvshru (v2i64 QPR:$Vn), shr_imm32:$amt))),
+ (VSHRNv2i32 QPR:$Vn, shr_imm32:$amt)>;
// VRSHL : Vector Rounding Shift
defm VRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 0, N3RegVShFrm,
@@ -5073,9 +5099,6 @@ def : Pat<(xor (v4i32 (NEONvshrs QPR:$src, (i32 31))),
(v4i32 (add QPR:$src, (NEONvshrs QPR:$src, (i32 31))))),
(VABSv4i32 QPR:$src)>;
-def : Pat<(v2f32 (int_arm_neon_vabs (v2f32 DPR:$src))), (VABSfd DPR:$src)>;
-def : Pat<(v4f32 (int_arm_neon_vabs (v4f32 QPR:$src))), (VABSfq QPR:$src)>;
-
// VQABS : Vector Saturating Absolute Value
defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs", "s",
@@ -5222,6 +5245,26 @@ def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd),
[(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>;
} // isReMaterializable
+
+// On some CPUs the two instructions "vmov.i32 dD, #0" and "vmov.i32 qD, #0"
+// require zero cycles to execute so they should be used wherever possible for
+// setting a register to zero.
+
+// Even without these pseudo-insts we would probably end up with the correct
+// instruction, but we could not mark the general ones with "isAsCheapAsAMove"
+// since they are sometimes rather expensive (in general).
+
+let AddedComplexity = 50, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
+ def VMOVD0 : ARMPseudoExpand<(outs DPR:$Vd), (ins), 4, IIC_VMOVImm,
+ [(set DPR:$Vd, (v2i32 NEONimmAllZerosV))],
+ (VMOVv2i32 DPR:$Vd, 0, (ops 14, zero_reg))>,
+ Requires<[HasZCZ]>;
+ def VMOVQ0 : ARMPseudoExpand<(outs QPR:$Vd), (ins), 4, IIC_VMOVImm,
+ [(set QPR:$Vd, (v4i32 NEONimmAllZerosV))],
+ (VMOVv4i32 QPR:$Vd, 0, (ops 14, zero_reg))>,
+ Requires<[HasZCZ]>;
+}
+
// VMOV : Vector Get Lane (move scalar to ARM core register)
def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?},
@@ -5486,10 +5529,12 @@ def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
-def VDUPfdf : PseudoNeonI<(outs DPR:$dst), (ins SPR:$src), IIC_VMOVD, "",
- [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]>;
-def VDUPfqf : PseudoNeonI<(outs QPR:$dst), (ins SPR:$src), IIC_VMOVD, "",
- [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>;
+def : Pat<(v2f32 (NEONvdup (f32 SPR:$src))),
+ (v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+ SPR:$src, ssub_0), (i32 0)))>;
+def : Pat<(v4f32 (NEONvdup (f32 SPR:$src))),
+ (v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+ SPR:$src, ssub_0), (i32 0)))>;
// VMOVN : Vector Narrowing Move
defm VMOVN : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN,
@@ -5870,7 +5915,7 @@ defm VRINTPN : VRINT_FPI<"p", 0b111, int_arm_neon_vrintp>;
// Cryptography instructions
let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
- DecoderNamespace = "v8Crypto" in {
+ DecoderNamespace = "v8Crypto", hasSideEffects = 0 in {
class AES<string op, bit op7, bit op6, SDPatternOperator Int>
: N2VQIntXnp<0b00, 0b00, 0b011, op6, op7, NoItinerary,
!strconcat("aes", op), "8", v16i8, v16i8, Int>,
@@ -5900,17 +5945,45 @@ def AESE : AES2Op<"e", 0, 0, int_arm_neon_aese>;
def AESIMC : AES<"imc", 1, 1, int_arm_neon_aesimc>;
def AESMC : AES<"mc", 1, 0, int_arm_neon_aesmc>;
-def SHA1H : N2SHA<"1h", 0b01, 0b010, 1, 1, int_arm_neon_sha1h>;
+def SHA1H : N2SHA<"1h", 0b01, 0b010, 1, 1, null_frag>;
def SHA1SU1 : N2SHA2Op<"1su1", 0b10, 0b011, 1, 0, int_arm_neon_sha1su1>;
def SHA256SU0 : N2SHA2Op<"256su0", 0b10, 0b011, 1, 1, int_arm_neon_sha256su0>;
-def SHA1C : N3SHA3Op<"1c", 0b00100, 0b00, int_arm_neon_sha1c>;
-def SHA1M : N3SHA3Op<"1m", 0b00100, 0b10, int_arm_neon_sha1m>;
-def SHA1P : N3SHA3Op<"1p", 0b00100, 0b01, int_arm_neon_sha1p>;
+def SHA1C : N3SHA3Op<"1c", 0b00100, 0b00, null_frag>;
+def SHA1M : N3SHA3Op<"1m", 0b00100, 0b10, null_frag>;
+def SHA1P : N3SHA3Op<"1p", 0b00100, 0b01, null_frag>;
def SHA1SU0 : N3SHA3Op<"1su0", 0b00100, 0b11, int_arm_neon_sha1su0>;
def SHA256H : N3SHA3Op<"256h", 0b00110, 0b00, int_arm_neon_sha256h>;
def SHA256H2 : N3SHA3Op<"256h2", 0b00110, 0b01, int_arm_neon_sha256h2>;
def SHA256SU1 : N3SHA3Op<"256su1", 0b00110, 0b10, int_arm_neon_sha256su1>;
+def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)),
+ (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG
+ (SHA1H (SUBREG_TO_REG (i64 0),
+ (f32 (COPY_TO_REGCLASS i32:$Rn, SPR)),
+ ssub_0)),
+ ssub_0)), GPR)>;
+
+def : Pat<(v4i32 (int_arm_neon_sha1c v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
+ (SHA1C v4i32:$hash_abcd,
+ (SUBREG_TO_REG (i64 0),
+ (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
+ ssub_0),
+ v4i32:$wk)>;
+
+def : Pat<(v4i32 (int_arm_neon_sha1m v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
+ (SHA1M v4i32:$hash_abcd,
+ (SUBREG_TO_REG (i64 0),
+ (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
+ ssub_0),
+ v4i32:$wk)>;
+
+def : Pat<(v4i32 (int_arm_neon_sha1p v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
+ (SHA1P v4i32:$hash_abcd,
+ (SUBREG_TO_REG (i64 0),
+ (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
+ ssub_0),
+ v4i32:$wk)>;
+
//===----------------------------------------------------------------------===//
// NEON instructions for single-precision FP math
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index af5ef53..754295f 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -300,6 +300,8 @@ def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
bits<8> val;
let Inst{7-0} = val;
}
+// default immediate for breakpoint mnemonic
+def : InstAlias<"bkpt", (tBKPT 0)>, Requires<[IsThumb]>;
def tHLT : T1I<(outs), (ins imm0_63:$val), NoItinerary, "hlt\t$val",
[]>, T1Encoding<0b101110>, Requires<[IsThumb, HasV8]> {
@@ -543,15 +545,15 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
(tBX GPR:$dst, (ops 14, zero_reg))>,
Requires<[IsThumb]>, Sched<[WriteBr]>;
}
- // tTAILJMPd: IOS version uses a Thumb2 branch (no Thumb1 tail calls
- // on IOS), so it's in ARMInstrThumb2.td.
- // Non-IOS version:
+ // tTAILJMPd: MachO version uses a Thumb2 branch (no Thumb1 tail calls
+ // on MachO), so it's in ARMInstrThumb2.td.
+ // Non-MachO version:
let Uses = [SP] in {
def tTAILJMPdND : tPseudoExpand<(outs),
(ins t_brtarget:$dst, pred:$p),
4, IIC_Br, [],
(tB t_brtarget:$dst, pred:$p)>,
- Requires<[IsThumb, IsNotIOS]>, Sched<[WriteBr]>;
+ Requires<[IsThumb, IsNotMachO]>, Sched<[WriteBr]>;
}
}
@@ -1306,10 +1308,23 @@ def : T1Pat<(addc tGPR:$lhs, imm8_255_neg:$rhs),
def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs),
(tSUBrr tGPR:$lhs, tGPR:$rhs)>;
-// ConstantPool, GlobalAddress
-def : T1Pat<(ARMWrapper tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>;
+// ConstantPool
def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>;
+// GlobalAddress
+def tLDRLIT_ga_pcrel : PseudoInst<(outs tGPR:$dst), (ins i32imm:$addr),
+ IIC_iLoadiALU,
+ [(set tGPR:$dst,
+ (ARMWrapperPIC tglobaladdr:$addr))]>,
+ Requires<[IsThumb, DontUseMovt]>;
+
+def tLDRLIT_ga_abs : PseudoInst<(outs tGPR:$dst), (ins i32imm:$src),
+ IIC_iLoad_i,
+ [(set tGPR:$dst,
+ (ARMWrapper tglobaladdr:$src))]>,
+ Requires<[IsThumb, DontUseMovt]>;
+
+
// JumpTable
def : T1Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
(tLEApcrelJT tjumptable:$dst, imm:$id)>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 48acffd..387bd60 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -3284,15 +3284,18 @@ def t2LDREXD : T2I_ldrex<0b0111, (outs rGPR:$Rt, rGPR:$Rt2),
def t2LDAEXB : T2I_ldrex<0b1100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaexb", "\t$Rt, $addr", "",
- []>, Requires<[IsThumb, HasV8]>;
+ [(set rGPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8]>;
def t2LDAEXH : T2I_ldrex<0b1101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaexh", "\t$Rt, $addr", "",
- []>, Requires<[IsThumb, HasV8]>;
+ [(set rGPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8]>;
def t2LDAEX : Thumb2I<(outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldaex", "\t$Rt, $addr", "",
- []>, Requires<[IsThumb, HasV8]> {
+ [(set rGPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8]> {
bits<4> Rt;
bits<4> addr;
let Inst{31-27} = 0b11101;
@@ -3320,21 +3323,21 @@ def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd),
(ins rGPR:$Rt, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"strexb", "\t$Rd, $Rt, $addr", "",
- [(set rGPR:$Rd, (strex_1 rGPR:$Rt,
- addr_offset_none:$addr))]>;
+ [(set rGPR:$Rd,
+ (strex_1 rGPR:$Rt, addr_offset_none:$addr))]>;
def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd),
(ins rGPR:$Rt, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"strexh", "\t$Rd, $Rt, $addr", "",
- [(set rGPR:$Rd, (strex_2 rGPR:$Rt,
- addr_offset_none:$addr))]>;
+ [(set rGPR:$Rd,
+ (strex_2 rGPR:$Rt, addr_offset_none:$addr))]>;
def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
t2addrmode_imm0_1020s4:$addr),
AddrModeNone, 4, NoItinerary,
"strex", "\t$Rd, $Rt, $addr", "",
- [(set rGPR:$Rd, (strex_4 rGPR:$Rt,
- t2addrmode_imm0_1020s4:$addr))]> {
+ [(set rGPR:$Rd,
+ (strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]> {
bits<4> Rd;
bits<4> Rt;
bits<12> addr;
@@ -3358,19 +3361,25 @@ def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd),
(ins rGPR:$Rt, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"stlexb", "\t$Rd, $Rt, $addr", "",
- []>, Requires<[IsThumb, HasV8]>;
+ [(set rGPR:$Rd,
+ (stlex_1 rGPR:$Rt, addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8]>;
def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
(ins rGPR:$Rt, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"stlexh", "\t$Rd, $Rt, $addr", "",
- []>, Requires<[IsThumb, HasV8]>;
+ [(set rGPR:$Rd,
+ (stlex_2 rGPR:$Rt, addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8]>;
def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"stlex", "\t$Rd, $Rt, $addr", "",
- []>, Requires<[IsThumb, HasV8]> {
+ [(set rGPR:$Rd,
+ (stlex_4 rGPR:$Rt, addr_offset_none:$addr))]>,
+ Requires<[IsThumb, HasV8]> {
bits<4> Rd;
bits<4> Rt;
bits<4> addr;
@@ -3412,6 +3421,15 @@ def : T2Pat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
def : T2Pat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
(t2STREXH GPR:$Rt, addr_offset_none:$addr)>;
+def : T2Pat<(and (ldaex_1 addr_offset_none:$addr), 0xff),
+ (t2LDAEXB addr_offset_none:$addr)>;
+def : T2Pat<(and (ldaex_2 addr_offset_none:$addr), 0xffff),
+ (t2LDAEXH addr_offset_none:$addr)>;
+def : T2Pat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+ (t2STLEXB GPR:$Rt, addr_offset_none:$addr)>;
+def : T2Pat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+ (t2STLEXH GPR:$Rt, addr_offset_none:$addr)>;
+
//===----------------------------------------------------------------------===//
// SJLJ Exception handling intrinsics
// eh_sjlj_setjmp() is an instruction sequence to store the return
@@ -3549,7 +3567,7 @@ def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
let AsmMatchConverter = "cvtThumbBranches";
}
-// Tail calls. The IOS version of thumb tail calls uses a t2 branch, so
+// Tail calls. The MachO version of thumb tail calls uses a t2 branch, so
// it goes here.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
// IOS version.
@@ -3558,7 +3576,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
(ins uncondbrtarget:$dst, pred:$p),
4, IIC_Br, [],
(t2B uncondbrtarget:$dst, pred:$p)>,
- Requires<[IsThumb2, IsIOS]>, Sched<[WriteBr]>;
+ Requires<[IsThumb2, IsMachO]>, Sched<[WriteBr]>;
}
// IT block
@@ -3781,7 +3799,7 @@ def t2SUBS_PC_LR : T2I <(outs), (ins imm0_255:$imm), NoItinerary,
let isReMaterializable = 1, isMoveImm = 1 in
def t2MOVi32imm : PseudoInst<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
[(set rGPR:$dst, (i32 imm:$src))]>,
- Requires<[IsThumb, HasV6T2]>;
+ Requires<[IsThumb, UseMovt]>;
// Pseudo instruction that combines movw + movt + add pc (if pic).
// It also makes it possible to rematerialize the instructions.
@@ -3793,15 +3811,9 @@ def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr),
[(set rGPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
Requires<[IsThumb2, UseMovt]>;
-def t2MOV_ga_dyn : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr),
- IIC_iMOVix2,
- [(set rGPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>,
- Requires<[IsThumb2, UseMovt]>;
}
// ConstantPool, GlobalAddress, and JumpTable
-def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>,
- Requires<[IsThumb2, DontUseMovt]>;
def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>;
def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>,
Requires<[IsThumb2, UseMovt]>;
@@ -4371,7 +4383,7 @@ def : t2InstAlias<"ldrsh${p} $Rt, $addr",
(t2LDRSHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
def : t2InstAlias<"ldr${p} $Rt, $addr",
- (t2LDRpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+ (t2LDRpci GPRnopc:$Rt, t2ldrlabel:$addr, pred:$p)>;
def : t2InstAlias<"ldrb${p} $Rt, $addr",
(t2LDRBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
def : t2InstAlias<"ldrh${p} $Rt, $addr",
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index a8cdc5c..1d7802a 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -200,13 +200,34 @@ let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
defm VLDM : vfp_ldst_mult<"vldm", 1, IIC_fpLoad_m, IIC_fpLoad_mu>;
let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
-defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpLoad_m, IIC_fpLoad_mu>;
+defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpStore_m, IIC_fpStore_mu>;
} // neverHasSideEffects
def : MnemonicAlias<"vldm", "vldmia">;
def : MnemonicAlias<"vstm", "vstmia">;
+// FLDM/FSTM - Load / Store multiple single / double precision registers for
+// pre-ARMv6 cores.
+// These instructions are deprecated!
+def : VFP2MnemonicAlias<"fldmias", "vldmia">;
+def : VFP2MnemonicAlias<"fldmdbs", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmeas", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmfds", "vldmia">;
+def : VFP2MnemonicAlias<"fldmiad", "vldmia">;
+def : VFP2MnemonicAlias<"fldmdbd", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmead", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmfdd", "vldmia">;
+
+def : VFP2MnemonicAlias<"fstmias", "vstmia">;
+def : VFP2MnemonicAlias<"fstmdbs", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmeas", "vstmia">;
+def : VFP2MnemonicAlias<"fstmfds", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmiad", "vstmia">;
+def : VFP2MnemonicAlias<"fstmdbd", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmead", "vstmia">;
+def : VFP2MnemonicAlias<"fstmfdd", "vstmdb">;
+
def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>,
Requires<[HasVFP2]>;
def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>,
@@ -247,7 +268,7 @@ multiclass vfp_ldstx_mult<string asm, bit L_bit> {
AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
IndexModeUpd, !strconcat(asm, "dbx${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
let Inst{24-23} = 0b10; // Decrement Before
- let Inst{21} = 1;
+ let Inst{21} = 1; // Writeback
let Inst{20} = L_bit;
}
}
@@ -255,6 +276,12 @@ multiclass vfp_ldstx_mult<string asm, bit L_bit> {
defm FLDM : vfp_ldstx_mult<"fldm", 1>;
defm FSTM : vfp_ldstx_mult<"fstm", 0>;
+def : VFP2MnemonicAlias<"fldmeax", "fldmdbx">;
+def : VFP2MnemonicAlias<"fldmfdx", "fldmiax">;
+
+def : VFP2MnemonicAlias<"fstmeax", "fstmiax">;
+def : VFP2MnemonicAlias<"fstmfdx", "fstmdbx">;
+
//===----------------------------------------------------------------------===//
// FP Binary Operations.
//
@@ -1639,7 +1666,7 @@ def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
//===----------------------------------------------------------------------===//
// Assembler aliases.
//
-// A few mnemnoic aliases for pre-unifixed syntax. We don't guarantee to
+// A few mnemonic aliases for pre-unified syntax. We don't guarantee to
// support them all, but supporting at least some of the basics is
// good to be friendly.
def : VFP2MnemonicAlias<"flds", "vldr">;
@@ -1735,3 +1762,14 @@ def : VFP2InstAlias<"vmov${p}.f64 $Dn, $Rt, $Rt2",
// VMOVD does.
def : VFP2InstAlias<"vmov${p} $Sd, $Sm",
(VMOVS SPR:$Sd, SPR:$Sm, pred:$p)>;
+
+// FCONSTD/FCONSTS alias for vmov.f64/vmov.f32
+// These aliases provide added functionality over vmov.f instructions by
+// allowing users to write assembly containing encoded floating point constants
+// (e.g. #0x70 vs #1.0). Without these aliases there is no way for the
+// assembler to accept encoded fp constants (but the equivalent fp-literal is
+// accepted directly by vmovf).
+def : VFP3InstAlias<"fconstd${p} $Dd, $val",
+ (FCONSTD DPR:$Dd, vfp_f64imm:$val, pred:$p)>;
+def : VFP3InstAlias<"fconsts${p} $Sd, $val",
+ (FCONSTS SPR:$Sd, vfp_f32imm:$val, pred:$p)>;
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
index 351a290..73c6eb7 100644
--- a/lib/Target/ARM/ARMJITInfo.cpp
+++ b/lib/Target/ARM/ARMJITInfo.cpp
@@ -13,10 +13,9 @@
#define DEBUG_TYPE "jit"
#include "ARMJITInfo.h"
-#include "ARM.h"
#include "ARMConstantPoolValue.h"
#include "ARMRelocations.h"
-#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
diff --git a/lib/Target/ARM/ARMJITInfo.h b/lib/Target/ARM/ARMJITInfo.h
index 23a6a9b..ee4c863 100644
--- a/lib/Target/ARM/ARMJITInfo.h
+++ b/lib/Target/ARM/ARMJITInfo.h
@@ -53,45 +53,45 @@ namespace llvm {
/// overwriting OLD with a branch to NEW. This is used for self-modifying
/// code.
///
- virtual void replaceMachineCodeForFunction(void *Old, void *New);
+ void replaceMachineCodeForFunction(void *Old, void *New) override;
/// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
/// to emit an indirect symbol which contains the address of the specified
/// ptr.
- virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
- JITCodeEmitter &JCE);
+ void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+ JITCodeEmitter &JCE) override;
// getStubLayout - Returns the size and alignment of the largest call stub
// on ARM.
- virtual StubLayout getStubLayout();
+ StubLayout getStubLayout() override;
/// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
/// small native function that simply calls the function at the specified
/// address.
- virtual void *emitFunctionStub(const Function* F, void *Fn,
- JITCodeEmitter &JCE);
+ void *emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE) override;
/// getLazyResolverFunction - Expose the lazy resolver to the JIT.
- virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+ LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
/// relocate - Before the JIT can run a block of code that has been emitted,
/// it must rewrite the code to contain the actual addresses of any
/// referenced global symbols.
- virtual void relocate(void *Function, MachineRelocation *MR,
- unsigned NumRelocs, unsigned char* GOTBase);
+ void relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase) override;
/// hasCustomConstantPool - Allows a target to specify that constant
/// pool address resolution is handled by the target.
- virtual bool hasCustomConstantPool() const { return true; }
+ bool hasCustomConstantPool() const override { return true; }
/// hasCustomJumpTables - Allows a target to specify that jumptables
/// are emitted by the target.
- virtual bool hasCustomJumpTables() const { return true; }
+ bool hasCustomJumpTables() const override { return true; }
/// allocateSeparateGVMemory - If true, globals should be placed in
/// separately allocated heap memory rather than in the same
/// code memory allocated by JITCodeEmitter.
- virtual bool allocateSeparateGVMemory() const {
+ bool allocateSeparateGVMemory() const override {
#ifdef __APPLE__
return true;
#else
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 61596d5..48e0bd7 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -17,6 +17,7 @@
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
@@ -36,7 +37,6 @@
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -69,9 +69,9 @@ namespace {
RegScavenger *RS;
bool isThumb2;
- virtual bool runOnMachineFunction(MachineFunction &Fn);
+ bool runOnMachineFunction(MachineFunction &Fn) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "ARM load / store optimization pass";
}
@@ -484,7 +484,7 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
return;
// Merge succeeded, update records.
- Merges.push_back(prior(Loc));
+ Merges.push_back(std::prev(Loc));
// In gathering loads together, we may have moved the imp-def of a register
// past one of its uses. This is OK, since we know better than the rest of
@@ -812,7 +812,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
// Try merging with the previous instruction.
MachineBasicBlock::iterator BeginMBBI = MBB.begin();
if (MBBI != BeginMBBI) {
- MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+ MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
--PrevMBBI;
if (Mode == ARM_AM::ia &&
@@ -831,7 +831,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
// Try merging with the next instruction.
MachineBasicBlock::iterator EndMBBI = MBB.end();
if (!DoMerge && MBBI != EndMBBI) {
- MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
+ MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
++NextMBBI;
if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
@@ -959,7 +959,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
// Try merging with the previous instruction.
MachineBasicBlock::iterator BeginMBBI = MBB.begin();
if (MBBI != BeginMBBI) {
- MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+ MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
--PrevMBBI;
if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
@@ -978,7 +978,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
// Try merging with the next instruction.
MachineBasicBlock::iterator EndMBBI = MBB.end();
if (!DoMerge && MBBI != EndMBBI) {
- MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
+ MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
++NextMBBI;
if (!isAM5 &&
@@ -1122,7 +1122,7 @@ void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
}
if (Loc != MBB.begin())
- RS->forward(prior(Loc));
+ RS->forward(std::prev(Loc));
}
static int getMemoryOpOffset(const MachineInstr *MI) {
@@ -1232,7 +1232,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
++NumSTRD2STM;
}
- NewBBI = llvm::prior(MBBI);
+ NewBBI = std::prev(MBBI);
} else {
// Split into two instructions.
unsigned NewOpc = (isLd)
@@ -1254,7 +1254,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
OddReg, OddDeadKill, false,
BaseReg, false, BaseUndef, false, OffUndef,
Pred, PredReg, TII, isT2);
- NewBBI = llvm::prior(MBBI);
+ NewBBI = std::prev(MBBI);
InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
EvenReg, EvenDeadKill, false,
BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
@@ -1274,7 +1274,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
EvenReg, EvenDeadKill, EvenUndef,
BaseReg, false, BaseUndef, false, OffUndef,
Pred, PredReg, TII, isT2);
- NewBBI = llvm::prior(MBBI);
+ NewBBI = std::prev(MBBI);
InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
OddReg, OddDeadKill, OddUndef,
BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
@@ -1419,7 +1419,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
// Find a scratch register.
unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass);
// Process the load / store instructions.
- RS->forward(prior(MBBI));
+ RS->forward(std::prev(MBBI));
// Merge ops.
Merges.clear();
@@ -1441,13 +1441,13 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
++NumMerges;
// RS may be pointing to an instruction that's deleted.
- RS->skipTo(prior(MBBI));
+ RS->skipTo(std::prev(MBBI));
} else if (NumMemOps == 1) {
// Try folding preceding/trailing base inc/dec into the single
// load/store.
if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
++NumMerges;
- RS->forward(prior(MBBI));
+ RS->forward(std::prev(MBBI));
}
}
@@ -1490,7 +1490,7 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
(MBBI->getOpcode() == ARM::BX_RET ||
MBBI->getOpcode() == ARM::tBX_RET ||
MBBI->getOpcode() == ARM::MOVPCLR)) {
- MachineInstr *PrevMI = prior(MBBI);
+ MachineInstr *PrevMI = std::prev(MBBI);
unsigned Opcode = PrevMI->getOpcode();
if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
@@ -1550,9 +1550,9 @@ namespace {
MachineRegisterInfo *MRI;
MachineFunction *MF;
- virtual bool runOnMachineFunction(MachineFunction &Fn);
+ bool runOnMachineFunction(MachineFunction &Fn) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "ARM pre- register allocation load / store optimization pass";
}
@@ -1724,17 +1724,6 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
return true;
}
-namespace {
- struct OffsetCompare {
- bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
- int LOffset = getMemoryOpOffset(LHS);
- int ROffset = getMemoryOpOffset(RHS);
- assert(LHS == RHS || LOffset != ROffset);
- return LOffset > ROffset;
- }
- };
-}
-
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
SmallVectorImpl<MachineInstr *> &Ops,
unsigned Base, bool isLd,
@@ -1742,7 +1731,13 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
bool RetVal = false;
// Sort by offset (in reverse order).
- std::sort(Ops.begin(), Ops.end(), OffsetCompare());
+ std::sort(Ops.begin(), Ops.end(),
+ [](const MachineInstr *LHS, const MachineInstr *RHS) {
+ int LOffset = getMemoryOpOffset(LHS);
+ int ROffset = getMemoryOpOffset(RHS);
+ assert(LHS == RHS || LOffset != ROffset);
+ return LOffset > ROffset;
+ });
// The loads / stores of the same base are in order. Scan them from first to
// last and check for the following:
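A minimal sketch of the descending-offset sort that the RescheduleOps hunk above now expresses as a std::sort lambda; the Op struct and getOffset helper are invented stand-ins for illustration, not the real MachineInstr interface or getMemoryOpOffset.

// Sketch only: sort memory ops by offset in descending order with a lambda,
// mirroring the comparator the patch inlines into std::sort.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

struct Op { int Offset; };                    // stand-in for a load/store MI
static int getOffset(const Op *O) { return O->Offset; }

int main() {
  Op A{8}, B{0}, C{4};
  std::vector<Op *> Ops{&A, &B, &C};
  std::sort(Ops.begin(), Ops.end(), [](const Op *LHS, const Op *RHS) {
    int LOffset = getOffset(LHS);
    int ROffset = getOffset(RHS);
    assert(LHS == RHS || LOffset != ROffset);  // offsets are expected unique
    return LOffset > ROffset;                  // reverse (descending) order
  });
  for (const Op *O : Ops)
    std::printf("%d\n", O->Offset);            // prints 8, 4, 0
  return 0;
}

Inlining the comparator keeps the ordering rule next to its single use and lets the file-scope OffsetCompare struct go away, as the hunk above shows.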
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index e12c9c6..48141b1 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -14,23 +14,25 @@
#include "ARM.h"
#include "ARMAsmPrinter.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMMCExpr.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Target/Mangler.h"
using namespace llvm;
MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
const MCSymbol *Symbol) {
const MCExpr *Expr;
- switch (MO.getTargetFlags()) {
+ unsigned Option = MO.getTargetFlags() & ARMII::MO_OPTION_MASK;
+ switch (Option) {
default: {
Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
OutContext);
- switch (MO.getTargetFlags()) {
+ switch (Option) {
default: llvm_unreachable("Unknown target flag on symbol operand");
case 0:
break;
@@ -49,7 +51,7 @@ MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
}
case ARMII::MO_PLT:
- Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_ARM_PLT,
+ Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_PLT,
OutContext);
break;
}
@@ -81,9 +83,11 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
MO.getMBB()->getSymbol(), OutContext));
break;
- case MachineOperand::MO_GlobalAddress:
- MCOp = GetSymbolRef(MO, getSymbol(MO.getGlobal()));
+ case MachineOperand::MO_GlobalAddress: {
+ MCOp = GetSymbolRef(MO,
+ GetARMGVSymbol(MO.getGlobal(), MO.getTargetFlags()));
break;
+ }
case MachineOperand::MO_ExternalSymbol:
MCOp = GetSymbolRef(MO,
GetExternalSymbolSymbol(MO.getSymbolName()));
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 010edf3..d7ec6eb 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -38,7 +38,7 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// StByValParamsPadding - For parameter that is split between
/// GPRs and memory; while recovering GPRs part, when
- /// StackAlignment == 8, and GPRs-part-size mod 8 != 0,
+ /// StackAlignment > 4, and GPRs-part-size mod StackAlignment != 0,
/// we need to insert gap before parameter start address. It allows to
/// "attach" GPR-part to the part that was passed via stack.
unsigned StByValParamsPadding;
@@ -114,6 +114,10 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// relocation models.
unsigned GlobalBaseReg;
+ /// ArgumentStackSize - amount of bytes on stack consumed by the arguments
+ /// being passed on the stack
+ unsigned ArgumentStackSize;
+
public:
ARMFunctionInfo() :
isThumb(false),
@@ -182,6 +186,9 @@ public:
void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
void setDPRCalleeSavedAreaSize(unsigned s) { DPRCSSize = s; }
+ unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+ void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
unsigned createJumpTableUId() {
return JumpTableUId++;
}
diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
new file mode 100644
index 0000000..20619fa
--- /dev/null
+++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -0,0 +1,101 @@
+//===-- ARMOptimizeBarriersPass - remove the second of two adjacent DMBs
+// when no memory access occurs between them ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===------------------------------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "double barriers"
+
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+STATISTIC(NumDMBsRemoved, "Number of DMBs removed");
+
+namespace {
+class ARMOptimizeBarriersPass : public MachineFunctionPass {
+public:
+ static char ID;
+ ARMOptimizeBarriersPass() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual const char *getPassName() const {
+ return "optimise barriers pass";
+ }
+
+private:
+};
+char ARMOptimizeBarriersPass::ID = 0;
+}
+
+// Returns whether the instruction can safely move past a DMB instruction
+// The current implementation allows this iff MI does not have any possible
+// memory access
+static bool CanMovePastDMB(const MachineInstr *MI) {
+ return !(MI->mayLoad() ||
+ MI->mayStore() ||
+ MI->hasUnmodeledSideEffects() ||
+ MI->isCall() ||
+ MI->isReturn());
+}
+
+bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) {
+ // Vector to store the DMBs we will remove after the first iteration
+ std::vector<MachineInstr *> ToRemove;
+ // DMBType is the Imm value of the first operand. It determines whether it's a
+ // DMB ish, dmb sy, dmb osh, etc
+ int64_t DMBType = -1;
+
+ // Find a dmb. If we can move it until the next dmb, tag the second one for
+ // removal
+ for (auto &MBB : MF) {
+ // Will be true when we have seen a DMB, and not seen any instruction since
+ // that cannot move past a DMB
+ bool IsRemovableNextDMB = false;
+ for (auto &MI : MBB) {
+ if (MI.getOpcode() == ARM::DMB) {
+ if (IsRemovableNextDMB) {
+ // If the Imm of this DMB is the same as that of the last DMB, we can
+ // tag this second DMB for removal
+ if (MI.getOperand(0).getImm() == DMBType) {
+ ToRemove.push_back(&MI);
+ } else {
+ // If it has a different DMBType, we cannot remove it, but will scan
+ // for the next DMB, recording this DMB's type as last seen DMB type
+ DMBType = MI.getOperand(0).getImm();
+ }
+ } else {
+ // After we see a DMB, a next one is removable
+ IsRemovableNextDMB = true;
+ DMBType = MI.getOperand(0).getImm();
+ }
+ } else if (!CanMovePastDMB(&MI)) {
+        // If we find an instruction that cannot move past a DMB, the next DMB
+        // is not removable
+ IsRemovableNextDMB = false;
+ }
+ }
+ }
+  // Remove the tagged DMBs
+ for (auto MI : ToRemove) {
+ MI->eraseFromParent();
+ ++NumDMBsRemoved;
+ }
+
+ return NumDMBsRemoved > 0;
+}
+
+/// createARMOptimizeBarriersPass - Returns an instance of the remove
+/// double barriers pass.
+///
+FunctionPass *llvm::createARMOptimizeBarriersPass() {
+ return new ARMOptimizeBarriersPass();
+}
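A rough sketch of the per-block scan the new pass performs, written against a simplified instruction list; Insn, DMBType, and TouchesMemory are illustrative stand-ins, and the real pass additionally refuses to move calls, returns, and instructions with unmodeled side effects past a DMB (see CanMovePastDMB above).

// Sketch only: tag the second of two same-type barriers when nothing in
// between may access memory, mirroring the per-block scan in the new pass.
#include <cstddef>
#include <cstdio>
#include <vector>

struct Insn {
  bool IsDMB;
  int DMBType;          // operand of the dmb (ish, sy, osh, ...)
  bool TouchesMemory;
};

static std::vector<size_t> findRemovableDMBs(const std::vector<Insn> &Block) {
  std::vector<size_t> ToRemove;
  bool RemovableNextDMB = false;
  int LastType = -1;
  for (size_t I = 0; I != Block.size(); ++I) {
    const Insn &MI = Block[I];
    if (MI.IsDMB) {
      if (RemovableNextDMB && MI.DMBType == LastType) {
        ToRemove.push_back(I);     // same barrier repeated: redundant
      } else {
        RemovableNextDMB = true;   // start (or restart) tracking this type
        LastType = MI.DMBType;
      }
    } else if (MI.TouchesMemory) {
      RemovableNextDMB = false;    // a memory access keeps the next DMB
    }
  }
  return ToRemove;
}

int main() {
  std::vector<Insn> Block = {
      {true, 0, false}, {false, -1, false}, {true, 0, false},   // redundant
      {false, -1, true}, {true, 0, false}};                     // kept
  for (size_t Idx : findRemovableDMBs(Block))
    std::printf("remove instruction %zu\n", Idx);               // prints 2
  return 0;
}

As in the pass, the redundant barriers are only collected during the scan and erased afterwards, so iteration over the block never invalidates itself.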
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
index a788036..80b4b48 100644
--- a/lib/Target/ARM/ARMRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -12,8 +12,6 @@
//===----------------------------------------------------------------------===//
#include "ARMRegisterInfo.h"
-#include "ARM.h"
-#include "ARMBaseInstrInfo.h"
using namespace llvm;
void ARMRegisterInfo::anchor() { }
diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h
index fb1537c..3e6af3f 100644
--- a/lib/Target/ARM/ARMRegisterInfo.h
+++ b/lib/Target/ARM/ARMRegisterInfo.h
@@ -14,9 +14,7 @@
#ifndef ARMREGISTERINFO_H
#define ARMREGISTERINFO_H
-#include "ARM.h"
#include "ARMBaseRegisterInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
namespace llvm {
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index d045761..7f0fe05 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -214,7 +214,7 @@ def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
}
// GPRs without the PC but with APSR. Some instructions allow accessing the
-// APSR, while actually encoding PC in the register field. This is usefull
+// APSR, while actually encoding PC in the register field. This is useful
// for assembly and disassembly only.
def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), APSR_NZCV)> {
let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
diff --git a/lib/Target/ARM/ARMRelocations.h b/lib/Target/ARM/ARMRelocations.h
index 9c32b15..21877fd 100644
--- a/lib/Target/ARM/ARMRelocations.h
+++ b/lib/Target/ARM/ARMRelocations.h
@@ -35,10 +35,6 @@ namespace llvm {
// should be divided by 4.
reloc_arm_vfp_cp_entry,
- // reloc_arm_so_imm - Same as reloc_arm_cp_entry except contant should be
- // encoded as so_imm value.
- reloc_arm_so_imm_cp_entry,
-
// reloc_arm_machine_cp_entry - Relocation of a ARM machine constantpool
// entry.
reloc_arm_machine_cp_entry,
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 603e775..9a1d222 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1894,16 +1894,26 @@ def CortexA9Model : SchedMachineModel {
let MispredictPenalty = 8; // Based on estimate of pipeline depth.
let Itineraries = CortexA9Itineraries;
+
+ // FIXME: Many vector operations were never given an itinerary. We
+ // haven't mapped these to the new model either.
+ let CompleteModel = 0;
}
//===----------------------------------------------------------------------===//
// Define each kind of processor resource and number available.
+//
+// The AGU unit has BufferSize=1 so that the latency between operations
+// that use it is considered to stall other operations.
+//
+// The FP unit has BufferSize=0 so that it is a hard dispatch
+// hazard. No instruction may be dispatched while the unit is reserved.
let SchedModel = CortexA9Model in {
def A9UnitALU : ProcResource<2>;
def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
-def A9UnitAGU : ProcResource<1>;
+def A9UnitAGU : ProcResource<1> { let BufferSize = 1; }
def A9UnitLS : ProcResource<1>;
def A9UnitFP : ProcResource<1> { let BufferSize = 0; }
def A9UnitB : ProcResource<1>;
@@ -2217,7 +2227,7 @@ def A9WriteLMfp : SchedWriteVariant<[
SchedVar<A9PostRA, [A9WriteLMfpPostRA]>]>;
//===----------------------------------------------------------------------===//
-// Resources for other (non LDM/VLDM) Variants.
+// Resources for other (non-LDM/VLDM) Variants.
// These mov immediate writers are unconditionally expanded with
// additive latency.
@@ -2397,6 +2407,7 @@ def :ItinRW<[A9WriteV3, A9Read2], [IIC_VSUBiD, IIC_VSUBiQ, IIC_VCNTiD]>;
// ...
// VHADD/VRHADD/VQADD/VTST/VADH/VRADH
def :ItinRW<[A9WriteV4, A9Read2, A9Read2], [IIC_VBINi4D, IIC_VBINi4Q]>;
+
// VSBH/VRSBH/VHSUB/VQSUB/VABD/VCEQ/VCGE/VCGT/VMAX/VMIN/VPMAX/VPMIN/VABDL
def :ItinRW<[A9WriteV4, A9Read2], [IIC_VSUBi4D, IIC_VSUBi4Q]>;
// VQNEG/VQABS
@@ -2431,7 +2442,7 @@ def :ItinRW<[A9WriteV3], [IIC_VSHLiD, IIC_VSHLiQ]>;
def :ItinRW<[A9WriteV4], [IIC_VSHLi4D, IIC_VSHLi4Q]>;
// NEON permute
-def :ItinRW<[A9WriteV2], [IIC_VPERMD, IIC_VPERMQ, IIC_VEXTD]>;
+def :ItinRW<[A9WriteV2, A9WriteV2], [IIC_VPERMD, IIC_VPERMQ, IIC_VEXTD]>;
def :ItinRW<[A9WriteV3, A9WriteV4, ReadDefault, A9Read2],
[IIC_VPERMQ3, IIC_VEXTQ]>;
def :ItinRW<[A9WriteV3, A9Read2], [IIC_VTB1]>;
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index 8d7dbc2..b03d5ff 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -1721,7 +1721,7 @@ let SchedModel = SwiftModel in {
SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
SwiftWriteLM13CyNo, SwiftWriteP01OneCycle,
SwiftVLDMPerm3]>,
- // Load of a Q register (not neccessarily true). We should not be mapping to
+ // Load of a Q register (not necessarily true). We should not be mapping to
// 4 S registers, either.
SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo,
SwiftWriteLM4CyNo, SwiftWriteLM4CyNo]>,
@@ -1858,7 +1858,7 @@ let SchedModel = SwiftModel in {
// Assume 5 D registers.
SchedVar<SwiftLMAddr10Pred, [SwiftWriteSTM6]>,
SchedVar<SwiftLMAddr11Pred, [SwiftWriteSTM12]>,
- // Asume three Q registers.
+ // Assume three Q registers.
SchedVar<SwiftLMAddr12Pred, [SwiftWriteSTM4]>,
SchedVar<SwiftLMAddr13Pred, [SwiftWriteSTM14]>,
// Assume 7 D registers.
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 93add6e..ba3cf4d 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -145,8 +145,8 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Src, SDValue Size,
unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
- // Use default for non AAPCS (or Darwin) subtargets
- if (!Subtarget->isAAPCS_ABI() || Subtarget->isTargetDarwin())
+ // Use default for non-AAPCS (or MachO) subtargets
+ if (!Subtarget->isAAPCS_ABI() || Subtarget->isTargetMachO())
return SDValue();
const ARMTargetLowering &TLI =
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index 56c9375..8c2397b 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -44,23 +44,21 @@ public:
explicit ARMSelectionDAGInfo(const TargetMachine &TM);
~ARMSelectionDAGInfo();
- virtual
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const;
+ MachinePointerInfo SrcPtrInfo) const override;
// Adjust parameters for memset, see RTABI section 4.3.4
- virtual
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Op1, SDValue Op2,
SDValue Op3, unsigned Align,
bool isVolatile,
- MachinePointerInfo DstPtrInfo) const;
+ MachinePointerInfo DstPtrInfo) const override;
};
}
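Several headers in this change (ARMJITInfo.h, ARMSelectionDAGInfo.h, the load/store optimizer) replace redundant virtual keywords with the C++11 override specifier. A small sketch of why, using invented class names rather than the LLVM ones:

// Sketch only: 'override' turns a silent signature mismatch into a compile error.
struct SelectionDAGInfoBase {
  virtual ~SelectionDAGInfoBase() = default;
  virtual int emitMemset(int Align) const { return Align; }
};

struct TargetSelectionDAGInfo : SelectionDAGInfoBase {
  // Overrides the base method; 'virtual' is implied and may be omitted.
  int emitMemset(int Align) const override { return Align * 2; }

  // If the base signature changed (say, to 'unsigned Align'), this line
  // would fail to compile instead of quietly declaring a new function:
  // int emitMemset(unsigned Align) const override;  // error: does not override
};

int main() {
  TargetSelectionDAGInfo Info;
  const SelectionDAGInfoBase &Base = Info;
  return Base.emitMemset(4) == 8 ? 0 : 1;  // dispatches to the derived override
}

With override in place, any later change to a base-class signature surfaces at compile time, which is the point of the mechanical conversion in these hunks.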
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 2d749af..73e2018 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -15,8 +15,8 @@
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOptions.h"
@@ -27,7 +27,7 @@
using namespace llvm;
-cl::opt<bool>
+static cl::opt<bool>
ReserveR9("arm-reserve-r9", cl::Hidden,
cl::desc("Reserve R9, making it unavailable as GPR"));
@@ -75,15 +75,17 @@ IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT),
clEnumValEnd));
ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, const TargetOptions &Options)
+ const std::string &FS, bool IsLittle,
+ const TargetOptions &Options)
: ARMGenSubtargetInfo(TT, CPU, FS)
, ARMProcFamily(Others)
, ARMProcClass(None)
, stackAlignment(4)
, CPUString(CPU)
+ , IsLittle(IsLittle)
, TargetTriple(TT)
, Options(Options)
- , TargetABI(ARM_ABI_APCS) {
+ , TargetABI(ARM_ABI_UNKNOWN) {
initializeEnvironment();
resetSubtargetFeatures(CPU, FS);
}
@@ -102,6 +104,7 @@ void ARMSubtarget::initializeEnvironment() {
HasVFPv4 = false;
HasFPARMv8 = false;
HasNEON = false;
+ MinSize = false;
UseNEONForSinglePrecisionFP = false;
UseMulOps = UseFusedMulOps;
SlowFPVMLx = false;
@@ -131,6 +134,7 @@ void ARMSubtarget::initializeEnvironment() {
HasTrustZone = false;
HasCrypto = false;
HasCRC = false;
+ HasZeroCycleZeroing = false;
AllowsUnalignedMem = false;
Thumb2DSP = false;
UseNaClTrap = false;
@@ -152,6 +156,9 @@ void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) {
initializeEnvironment();
resetSubtargetFeatures(CPU, FS);
}
+
+ MinSize =
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
}
void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
@@ -176,10 +183,9 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
}
ParseSubtargetFeatures(CPUString, ArchFS);
- // Thumb2 implies at least V6T2. FIXME: Fix tests to explicitly specify a
- // ARM version or CPU and then remove this.
- if (!HasV6T2Ops && hasThumb2())
- HasV4TOps = HasV5TOps = HasV5TEOps = HasV6Ops = HasV6MOps = HasV6T2Ops = true;
+ // FIXME: This used to enable V6T2 support implicitly for Thumb2 mode.
+ // Assert this for now to make the change obvious.
+ assert(hasV6T2Ops() || !hasThumb2());
// Keep a pointer to static instruction cost data for the specified CPU.
SchedModel = getSchedModelForCPU(CPUString);
@@ -187,22 +193,45 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
// Initialize scheduling itinerary for the specified CPU.
InstrItins = getInstrItineraryForCPU(CPUString);
- if ((TargetTriple.getTriple().find("eabi") != std::string::npos) ||
- (isTargetIOS() && isMClass()))
- // FIXME: We might want to separate AAPCS and EABI. Some systems, e.g.
- // Darwin-EABI conforms to AACPS but not the rest of EABI.
+ if (TargetABI == ARM_ABI_UNKNOWN) {
+ switch (TargetTriple.getEnvironment()) {
+ case Triple::Android:
+ case Triple::EABI:
+ case Triple::EABIHF:
+ case Triple::GNUEABI:
+ case Triple::GNUEABIHF:
+ TargetABI = ARM_ABI_AAPCS;
+ break;
+ default:
+ if ((isTargetIOS() && isMClass()) ||
+ (TargetTriple.isOSBinFormatMachO() &&
+ TargetTriple.getOS() == Triple::UnknownOS))
+ TargetABI = ARM_ABI_AAPCS;
+ else
+ TargetABI = ARM_ABI_APCS;
+ break;
+ }
+ }
+
+ // FIXME: this is invalid for WindowsCE
+ if (isTargetWindows()) {
TargetABI = ARM_ABI_AAPCS;
+ NoARM = true;
+ }
if (isAAPCS_ABI())
stackAlignment = 8;
+ if (isTargetNaCl())
+ stackAlignment = 16;
UseMovt = hasV6T2Ops() && ArmUseMOVT;
- if (!isTargetIOS()) {
- IsR9Reserved = ReserveR9;
- } else {
+ if (isTargetMachO()) {
IsR9Reserved = ReserveR9 | !HasV6Ops;
- SupportsTailCall = !getTargetTriple().isOSVersionLT(5, 0);
+ SupportsTailCall = !isTargetIOS() || !getTargetTriple().isOSVersionLT(5, 0);
+ } else {
+ IsR9Reserved = ReserveR9;
+ SupportsTailCall = !isThumb1Only();
}
if (!isThumb() || hasThumb2())
@@ -214,7 +243,7 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
//
// ARMv6 may or may not support unaligned accesses depending on the
// SCTLR.U bit, which is architecture-specific. We assume ARMv6
- // Darwin targets support unaligned accesses, and others don't.
+ // Darwin and NetBSD targets support unaligned accesses, and others don't.
//
// ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit
// which raises an alignment fault on unaligned accesses. Linux
@@ -223,9 +252,15 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
// Linux targets support unaligned accesses. The same goes for NaCl.
//
// The above behavior is consistent with GCC.
- AllowsUnalignedMem = (
- (hasV7Ops() && (isTargetLinux() || isTargetNaCl())) ||
- (hasV6Ops() && isTargetDarwin()));
+ AllowsUnalignedMem =
+ (hasV7Ops() && (isTargetLinux() || isTargetNaCl() ||
+ isTargetNetBSD())) ||
+ (hasV6Ops() && (isTargetMachO() || isTargetNetBSD()));
+ // The one exception is cortex-m0, which, despite being v6, does not
+ // support unaligned accesses. Rather than make the above boolean
+ // expression even more obtuse, just override the value here.
+ if (isThumb1Only() && isMClass())
+ AllowsUnalignedMem = false;
break;
case StrictAlign:
AllowsUnalignedMem = false;
@@ -267,7 +302,7 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
if (GV->isDeclaration() && !GV->isMaterializable())
isDecl = true;
- if (!isTargetDarwin()) {
+ if (!isTargetMachO()) {
// Extra load is needed for all externally visible.
if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
return false;
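
The ABI is now resolved lazily from the triple instead of being hard-wired to APCS at construction time. A small sketch of the environment-to-ABI mapping introduced above, using the llvm::Triple API (illustration only; the real decision sits in resetSubtargetFeatures and also covers the iOS M-class and bare MachO cases):

    #include "llvm/ADT/Triple.h"
    using namespace llvm;

    // EABI-flavoured environments select AAPCS; everything else falls back
    // to the per-OS default (APCS unless a MachO special case applies).
    static bool selectsAAPCS(const Triple &T) {
      switch (T.getEnvironment()) {
      case Triple::Android:
      case Triple::EABI:
      case Triple::EABIHF:
      case Triple::GNUEABI:
      case Triple::GNUEABIHF:
        return true;   // e.g. armv7-linux-gnueabihf, armv7-none-eabi
      default:
        return false;  // e.g. armv5-linux-gnu stays on the per-OS default
      }
    }
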
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 567463c..3855419 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -31,7 +31,8 @@ class TargetOptions;
class ARMSubtarget : public ARMGenSubtargetInfo {
protected:
enum ARMProcFamilyEnum {
- Others, CortexA5, CortexA8, CortexA9, CortexA15, CortexR5, Swift, CortexA53, CortexA57
+ Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
+ CortexR5, Swift, CortexA53, CortexA57, Krait
};
enum ARMProcClassEnum {
None, AClass, RClass, MClass
@@ -63,6 +64,10 @@ protected:
bool HasFPARMv8;
bool HasNEON;
+ /// MinSize - True if the function being compiled has the "minsize" attribute
+ /// and should be optimised for size at the expense of speed.
+ bool MinSize;
+
/// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
/// specified. Use the method useNEONForSinglePrecisionFP() to
/// determine if NEON should actually be used.
@@ -172,6 +177,10 @@ protected:
/// HasCRC - if true, processor supports CRC instructions
bool HasCRC;
+ /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are
+ /// particularly effective at zeroing a VFP register.
+ bool HasZeroCycleZeroing;
+
/// AllowsUnalignedMem - If true, the subtarget allows unaligned memory
/// accesses for some types. For details, see
/// ARMTargetLowering::allowsUnalignedMemoryAccesses().
@@ -188,12 +197,12 @@ protected:
/// NaCl TRAP instruction is generated instead of the regular TRAP.
bool UseNaClTrap;
- /// Target machine allowed unsafe FP math (such as use of NEON fp)
- bool UnsafeFPMath;
-
/// Force long to be a 64-bit type (RenderScript-specific)
bool UseLong64;
+ /// Target machine allowed unsafe FP math (such as use of NEON fp)
+ bool UnsafeFPMath;
+
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
unsigned stackAlignment;
@@ -201,6 +210,9 @@ protected:
/// CPUString - String name of used CPU.
std::string CPUString;
+ /// IsLittle - The target is Little Endian
+ bool IsLittle;
+
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
@@ -215,6 +227,7 @@ protected:
public:
enum {
+ ARM_ABI_UNKNOWN,
ARM_ABI_APCS,
ARM_ABI_AAPCS // ARM EABI
} TargetABI;
@@ -223,7 +236,8 @@ protected:
/// of the specified triple.
///
ARMSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, const TargetOptions &Options);
+ const std::string &FS, bool IsLittle,
+ const TargetOptions &Options);
/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
@@ -237,7 +251,7 @@ protected:
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
/// \brief Reset the features for the ARM target.
- virtual void resetSubtargetFeatures(const MachineFunction *MF);
+ void resetSubtargetFeatures(const MachineFunction *MF) override;
private:
void initializeEnvironment();
void resetSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -254,13 +268,15 @@ public:
bool hasV8Ops() const { return HasV8Ops; }
bool isCortexA5() const { return ARMProcFamily == CortexA5; }
+ bool isCortexA7() const { return ARMProcFamily == CortexA7; }
bool isCortexA8() const { return ARMProcFamily == CortexA8; }
bool isCortexA9() const { return ARMProcFamily == CortexA9; }
bool isCortexA15() const { return ARMProcFamily == CortexA15; }
bool isSwift() const { return ARMProcFamily == Swift; }
bool isCortexM3() const { return CPUString == "cortex-m3"; }
- bool isLikeA9() const { return isCortexA9() || isCortexA15(); }
+ bool isLikeA9() const { return isCortexA9() || isCortexA15() || isKrait(); }
bool isCortexR5() const { return ARMProcFamily == CortexR5; }
+ bool isKrait() const { return ARMProcFamily == Krait; }
bool hasARMOps() const { return !NoARM; }
@@ -272,6 +288,7 @@ public:
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
bool hasVirtualization() const { return HasVirtualization; }
+ bool isMinSize() const { return MinSize; }
bool useNEONForSinglePrecisionFP() const {
return hasNEON() && UseNEONForSinglePrecisionFP; }
@@ -289,6 +306,7 @@ public:
bool isFPOnlySP() const { return FPOnlySP; }
bool hasPerfMon() const { return HasPerfMon; }
bool hasTrustZone() const { return HasTrustZone; }
+ bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
bool prefers32BitThumb() const { return Pref32BitThumb; }
bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
@@ -302,22 +320,59 @@ public:
const Triple &getTargetTriple() const { return TargetTriple; }
- bool isTargetIOS() const { return TargetTriple.isiOS(); }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
- bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+ bool isTargetIOS() const { return TargetTriple.isiOS(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
- bool isTargetELF() const { return !isTargetDarwin(); }
+ bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+ bool isTargetNetBSD() const { return TargetTriple.getOS() == Triple::NetBSD; }
+ bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
+
+ bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+ bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
// ARM EABI is the bare-metal EABI described in ARM ABI documents and
// can be accessed via -target arm-none-eabi. This is NOT GNUEABI.
// FIXME: Add a flag for bare-metal for that target and set Triple::EABI
// even for GNUEABI, so we can make a distinction here and still conform to
// the EABI on GNU (and Android) mode. This requires change in Clang, too.
+ // FIXME: The Darwin exception is temporary, while we move users to
+ // "*-*-*-macho" triples as quickly as possible.
bool isTargetAEABI() const {
- return TargetTriple.getEnvironment() == Triple::EABI;
+ return (TargetTriple.getEnvironment() == Triple::EABI ||
+ TargetTriple.getEnvironment() == Triple::EABIHF) &&
+ !isTargetDarwin() && !isTargetWindows();
}
- bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
- bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; }
+ // ARM Targets that support EHABI exception handling standard
+ // Darwin uses SjLj. Other targets might need more checks.
+ bool isTargetEHABICompatible() const {
+ return (TargetTriple.getEnvironment() == Triple::EABI ||
+ TargetTriple.getEnvironment() == Triple::GNUEABI ||
+ TargetTriple.getEnvironment() == Triple::EABIHF ||
+ TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::Android) &&
+ !isTargetDarwin() && !isTargetWindows();
+ }
+
+ bool isTargetHardFloat() const {
+ // FIXME: this is invalid for WindowsCE
+ return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::EABIHF ||
+ isTargetWindows();
+ }
+ bool isTargetAndroid() const {
+ return TargetTriple.getEnvironment() == Triple::Android;
+ }
+
+ bool isAPCS_ABI() const {
+ assert(TargetABI != ARM_ABI_UNKNOWN);
+ return TargetABI == ARM_ABI_APCS;
+ }
+ bool isAAPCS_ABI() const {
+ assert(TargetABI != ARM_ABI_UNKNOWN);
+ return TargetABI == ARM_ABI_AAPCS;
+ }
bool isThumb() const { return InThumbMode; }
bool isThumb1Only() const { return InThumbMode && !HasThumb2; }
@@ -329,7 +384,7 @@ public:
bool isR9Reserved() const { return IsR9Reserved; }
- bool useMovt() const { return UseMovt && hasV6T2Ops(); }
+ bool useMovt() const { return UseMovt && !isMinSize(); }
bool supportsTailCall() const { return SupportsTailCall; }
bool allowsUnalignedMem() const { return AllowsUnalignedMem; }
@@ -338,6 +393,8 @@ public:
const std::string & getCPUString() const { return CPUString; }
+ bool isLittle() const { return IsLittle; }
+
unsigned getMispredictionPenalty() const;
/// This function returns true if the target has sincos() routine in its
@@ -347,7 +404,7 @@ public:
/// enablePostRAScheduler - True at 'More' optimization.
bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
- RegClassVector& CriticalPathRCs) const;
+ RegClassVector& CriticalPathRCs) const override;
/// getInstrItins - Return the instruction itineraies based on subtarget
/// selection.
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index be84bf6..4ae539a 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#include "ARMTargetMachine.h"
#include "ARM.h"
+#include "ARMTargetMachine.h"
#include "ARMFrameLowering.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -23,11 +23,6 @@
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
-cl::opt<bool>
-EnableGlobalMerge("global-merge", cl::Hidden,
- cl::desc("Enable global merge pass"),
- cl::init(true));
-
static cl::opt<bool>
DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden,
cl::desc("Inhibit optimization of S->D register accesses on A15"),
@@ -35,8 +30,10 @@ DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden,
extern "C" void LLVMInitializeARMTarget() {
// Register the target.
- RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget);
- RegisterTargetMachine<ThumbTargetMachine> Y(TheThumbTarget);
+ RegisterTargetMachine<ARMLETargetMachine> X(TheARMLETarget);
+ RegisterTargetMachine<ARMBETargetMachine> Y(TheARMBETarget);
+ RegisterTargetMachine<ThumbLETargetMachine> A(TheThumbLETarget);
+ RegisterTargetMachine<ThumbBETargetMachine> B(TheThumbBETarget);
}
@@ -46,14 +43,17 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
+ CodeGenOpt::Level OL,
+ bool isLittle)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, Options),
+ Subtarget(TT, CPU, FS, isLittle, Options),
JITInfo(),
InstrItins(Subtarget.getInstrItineraryData()) {
- // Default to soft float ABI
+
+ // Default to triple-appropriate float ABI
if (Options.FloatABIType == FloatABI::Default)
- this->Options.FloatABIType = FloatABI::Soft;
+ this->Options.FloatABIType =
+ Subtarget.isTargetHardFloat() ? FloatABI::Hard : FloatABI::Soft;
}
void ARMBaseTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
@@ -67,21 +67,71 @@ void ARMBaseTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
void ARMTargetMachine::anchor() { }
+static std::string computeDataLayout(ARMSubtarget &ST) {
+ std::string Ret = "";
+
+ if (ST.isLittle())
+ // Little endian.
+ Ret += "e";
+ else
+ // Big endian.
+ Ret += "E";
+
+ Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
+
+ // Pointers are 32 bits and aligned to 32 bits.
+ Ret += "-p:32:32";
+
+ // On thumb, i16, i8 and i1 have natural alignment requirements, but we try to
+ // align to 32.
+ if (ST.isThumb())
+ Ret += "-i1:8:32-i8:8:32-i16:16:32";
+
+ // ABIs other than APCS have 64 bit integers with natural alignment.
+ if (!ST.isAPCS_ABI())
+ Ret += "-i64:64";
+
+ // We have 64 bit floats. The APCS ABI requires them to be aligned to 32
+ // bits, others to 64 bits. We always try to align to 64 bits.
+ if (ST.isAPCS_ABI())
+ Ret += "-f64:32:64";
+
+ // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others
+ // to 64. We always try to give them natural alignment.
+ if (ST.isAPCS_ABI())
+ Ret += "-v64:32:64-v128:32:128";
+ else
+ Ret += "-v128:64:128";
+
+ // On thumb and APCS, only try to align aggregates to 32 bits (the default is
+ // 64 bits).
+ if (ST.isThumb() || ST.isAPCS_ABI())
+ Ret += "-a:0:32";
+
+ // Integer registers are 32 bits.
+ Ret += "-n32";
+
+ // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit
+ // aligned everywhere else.
+ if (ST.isTargetNaCl())
+ Ret += "-S128";
+ else if (ST.isAAPCS_ABI())
+ Ret += "-S64";
+ else
+ Ret += "-S32";
+
+ return Ret;
+}
+
ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ CodeGenOpt::Level OL,
+ bool isLittle)
+ : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle),
InstrInfo(Subtarget),
- DL(Subtarget.isAPCS_ABI() ?
- std::string("e-p:32:32-f64:32:64-i64:32:64-"
- "v128:32:128-v64:32:64-n32-S32") :
- Subtarget.isAAPCS_ABI() ?
- std::string("e-p:32:32-f64:64:64-i64:64:64-"
- "v128:64:128-v64:64:64-n32-S64") :
- std::string("e-p:32:32-f64:64:64-i64:64:64-"
- "v128:64:128-v64:64:64-n32-S32")),
+ DL(computeDataLayout(Subtarget)),
TLInfo(*this),
TSInfo(*this),
FrameLowering(Subtarget) {
@@ -91,28 +141,37 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT,
"support ARM mode execution!");
}
+void ARMLETargetMachine::anchor() { }
+
+ARMLETargetMachine::
+ARMLETargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void ARMBETargetMachine::anchor() { }
+
+ARMBETargetMachine::
+ARMBETargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
void ThumbTargetMachine::anchor() { }
ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ CodeGenOpt::Level OL,
+ bool isLittle)
+ : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle),
InstrInfo(Subtarget.hasThumb2()
? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget))
: ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))),
- DL(Subtarget.isAPCS_ABI() ?
- std::string("e-p:32:32-f64:32:64-i64:32:64-"
- "i16:16:32-i8:8:32-i1:8:32-"
- "v128:32:128-v64:32:64-a:0:32-n32-S32") :
- Subtarget.isAAPCS_ABI() ?
- std::string("e-p:32:32-f64:64:64-i64:64:64-"
- "i16:16:32-i8:8:32-i1:8:32-"
- "v128:64:128-v64:64:64-a:0:32-n32-S64") :
- std::string("e-p:32:32-f64:64:64-i64:64:64-"
- "i16:16:32-i8:8:32-i1:8:32-"
- "v128:64:128-v64:64:64-a:0:32-n32-S32")),
+ DL(computeDataLayout(Subtarget)),
TLInfo(*this),
TSInfo(*this),
FrameLowering(Subtarget.hasThumb2()
@@ -121,6 +180,24 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT,
initAsmInfo();
}
+void ThumbLETargetMachine::anchor() { }
+
+ThumbLETargetMachine::
+ThumbLETargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void ThumbBETargetMachine::anchor() { }
+
+ThumbBETargetMachine::
+ThumbBETargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
namespace {
/// ARM Code Generator Pass Configuration Options.
class ARMPassConfig : public TargetPassConfig {
@@ -136,11 +213,11 @@ public:
return *getARMTargetMachine().getSubtargetImpl();
}
- virtual bool addPreISel();
- virtual bool addInstSelector();
- virtual bool addPreRegAlloc();
- virtual bool addPreSched2();
- virtual bool addPreEmitPass();
+ bool addPreISel() override;
+ bool addInstSelector() override;
+ bool addPreRegAlloc() override;
+ bool addPreSched2() override;
+ bool addPreEmitPass() override;
};
} // namespace
@@ -149,7 +226,11 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
}
bool ARMPassConfig::addPreISel() {
- if (TM->getOptLevel() != CodeGenOpt::None && EnableGlobalMerge)
+ const ARMSubtarget *Subtarget = &getARMSubtarget();
+ if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only())
+ addPass(createARMAtomicExpandPass(TM));
+
+ if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createGlobalMergePass(TM));
return false;
@@ -219,6 +300,7 @@ bool ARMPassConfig::addPreEmitPass() {
addPass(&UnpackMachineBundlesID);
}
+ addPass(createARMOptimizeBarriersPass());
addPass(createARMConstantIslandPass());
return true;
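
computeDataLayout assembles the same information the old hard-coded strings carried, but derives it from the subtarget. Tracing it for a hypothetical little-endian, AAPCS, ARM-mode, ELF subtarget (e.g. armv7-linux-gnueabihf) gives roughly the following, assuming getManglingComponent returns "-m:e" for ELF:

    // "e"            little endian
    // "-m:e"         ELF name mangling
    // "-p:32:32"     32-bit pointers
    // "-i64:64"      natural i64 alignment (non-APCS)
    // "-v128:64:128" 64-bit-aligned 128-bit vectors (non-APCS)
    // "-n32"         32-bit integer registers
    // "-S64"         64-bit stack alignment (AAPCS)
    static const char *ExpectedAAPCSLayout =
        "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64";
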
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index d4caf5c..0c80a95 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -42,25 +42,26 @@ public:
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
+ CodeGenOpt::Level OL,
+ bool isLittle);
- virtual ARMJITInfo *getJITInfo() { return &JITInfo; }
- virtual const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; }
- virtual const ARMTargetLowering *getTargetLowering() const {
+ ARMJITInfo *getJITInfo() override { return &JITInfo; }
+ const ARMSubtarget *getSubtargetImpl() const override { return &Subtarget; }
+ const ARMTargetLowering *getTargetLowering() const override {
// Implemented by derived classes
llvm_unreachable("getTargetLowering not implemented");
}
- virtual const InstrItineraryData *getInstrItineraryData() const {
+ const InstrItineraryData *getInstrItineraryData() const override {
return &InstrItins;
}
/// \brief Register ARM analysis passes with a pass manager.
- virtual void addAnalysisPasses(PassManagerBase &PM);
+ void addAnalysisPasses(PassManagerBase &PM) override;
// Pass Pipeline Configuration
- virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &MCE);
+ bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &MCE) override;
};
/// ARMTargetMachine - ARM target machine.
@@ -77,24 +78,47 @@ class ARMTargetMachine : public ARMBaseTargetMachine {
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
+ CodeGenOpt::Level OL,
+ bool isLittle);
- virtual const ARMRegisterInfo *getRegisterInfo() const {
+ const ARMRegisterInfo *getRegisterInfo() const override {
return &InstrInfo.getRegisterInfo();
}
- virtual const ARMTargetLowering *getTargetLowering() const {
+ const ARMTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- virtual const ARMSelectionDAGInfo* getSelectionDAGInfo() const {
+ const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
- virtual const ARMFrameLowering *getFrameLowering() const {
+ const ARMFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
- virtual const ARMInstrInfo *getInstrInfo() const { return &InstrInfo; }
- virtual const DataLayout *getDataLayout() const { return &DL; }
+ const ARMInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const DataLayout *getDataLayout() const override { return &DL; }
+};
+
+/// ARMLETargetMachine - ARM little endian target machine.
+///
+class ARMLETargetMachine : public ARMTargetMachine {
+ virtual void anchor();
+public:
+ ARMLETargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+/// ARMBETargetMachine - ARM big endian target machine.
+///
+class ARMBETargetMachine : public ARMTargetMachine {
+ virtual void anchor();
+public:
+ ARMBETargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
};
/// ThumbTargetMachine - Thumb target machine.
@@ -115,30 +139,53 @@ public:
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
+ CodeGenOpt::Level OL,
+ bool isLittle);
/// returns either Thumb1RegisterInfo or Thumb2RegisterInfo
- virtual const ARMBaseRegisterInfo *getRegisterInfo() const {
+ const ARMBaseRegisterInfo *getRegisterInfo() const override {
return &InstrInfo->getRegisterInfo();
}
- virtual const ARMTargetLowering *getTargetLowering() const {
+ const ARMTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- virtual const ARMSelectionDAGInfo *getSelectionDAGInfo() const {
+ const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
/// returns either Thumb1InstrInfo or Thumb2InstrInfo
- virtual const ARMBaseInstrInfo *getInstrInfo() const {
+ const ARMBaseInstrInfo *getInstrInfo() const override {
return InstrInfo.get();
}
/// returns either Thumb1FrameLowering or ARMFrameLowering
- virtual const ARMFrameLowering *getFrameLowering() const {
+ const ARMFrameLowering *getFrameLowering() const override {
return FrameLowering.get();
}
- virtual const DataLayout *getDataLayout() const { return &DL; }
+ const DataLayout *getDataLayout() const override { return &DL; }
+};
+
+/// ThumbLETargetMachine - Thumb little endian target machine.
+///
+class ThumbLETargetMachine : public ThumbTargetMachine {
+ virtual void anchor();
+public:
+ ThumbLETargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+/// ThumbBETargetMachine - Thumb big endian target machine.
+///
+class ThumbBETargetMachine : public ThumbTargetMachine {
+ virtual void anchor();
+public:
+ ThumbBETargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
};
} // end namespace llvm
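
The four concrete subclasses only differ in the isLittle flag they pass down; selection between them happens through the target registry. A sketch of resolving a big-endian triple, assuming the usual armeb/thumbeb spellings are registered in TargetInfo (not shown in this hunk):

    #include "llvm/Support/TargetRegistry.h"
    #include "llvm/Support/TargetSelect.h"
    #include <string>
    using namespace llvm;

    static const Target *lookupBigEndianARM() {
      InitializeAllTargetInfos();
      std::string Err;
      // With this patch, createTargetMachine() on the result hands back an
      // ARMBETargetMachine ("armeb-...") or ThumbBETargetMachine ("thumbeb-...").
      return TargetRegistry::lookupTarget("armeb-none-eabi", Err);
    }
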
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index 7ec71b2..3379f85 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -10,13 +10,13 @@
#include "ARMTargetObjectFile.h"
#include "ARMSubtarget.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ELF.h"
-#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetLowering.h"
using namespace llvm;
using namespace dwarf;
@@ -41,13 +41,18 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
SectionKind::getMetadata());
}
-const MCExpr *ARMElfTargetObjectFile::
-getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
- MachineModuleInfo *MMI, unsigned Encoding,
- MCStreamer &Streamer) const {
+const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+ const TargetMachine &TM, MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const {
assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only");
- return MCSymbolRefExpr::Create(getSymbol(*Mang, GV),
- MCSymbolRefExpr::VK_ARM_TARGET2,
+ return MCSymbolRefExpr::Create(TM.getSymbol(GV, Mang),
+ MCSymbolRefExpr::VK_ARM_TARGET2, getContext());
+}
+
+const MCExpr *ARMElfTargetObjectFile::
+getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
+ return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_ARM_TLSLDO,
getContext());
}
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
index 7f60727..5f8d612 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -26,16 +26,16 @@ public:
AttributesSection(NULL)
{}
- virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
const MCExpr *
- getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
- MachineModuleInfo *MMI, unsigned Encoding,
- MCStreamer &Streamer) const;
-
- virtual const MCSection *getAttributesSection() const {
- return AttributesSection;
- }
+ getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding,
+ Mangler &Mang, const TargetMachine &TM,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+
+ /// \brief Describe a TLS variable address within debug info.
+ const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 6bbb38f..d3b43cd 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -19,8 +19,8 @@
#include "ARMTargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
using namespace llvm;
// Declare the pass initialization routine locally as target-specific passes
@@ -32,7 +32,7 @@ void initializeARMTTIPass(PassRegistry &);
namespace {
-class ARMTTI : public ImmutablePass, public TargetTransformInfo {
+class ARMTTI final : public ImmutablePass, public TargetTransformInfo {
const ARMBaseTargetMachine *TM;
const ARMSubtarget *ST;
const ARMTargetLowering *TLI;
@@ -52,15 +52,11 @@ public:
initializeARMTTIPass(*PassRegistry::getPassRegistry());
}
- virtual void initializePass() {
+ void initializePass() override {
pushTTIStack(this);
}
- virtual void finalizePass() {
- popTTIStack();
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
TargetTransformInfo::getAnalysisUsage(AU);
}
@@ -68,7 +64,7 @@ public:
static char ID;
/// Provide necessary pointer adjustments for the two base classes.
- virtual void *getAdjustedAnalysisPointer(const void *ID) {
+ void *getAdjustedAnalysisPointer(const void *ID) override {
if (ID == &TargetTransformInfo::ID)
return (TargetTransformInfo*)this;
return this;
@@ -76,8 +72,8 @@ public:
/// \name Scalar TTI Implementations
/// @{
-
- virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) const;
+ using TargetTransformInfo::getIntImmCost;
+ unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
/// @}
@@ -85,7 +81,7 @@ public:
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector) const {
+ unsigned getNumberOfRegisters(bool Vector) const override {
if (Vector) {
if (ST->hasNEON())
return 16;
@@ -94,10 +90,10 @@ public:
if (ST->isThumb1Only())
return 8;
- return 16;
+ return 13;
}
- unsigned getRegisterBitWidth(bool Vector) const {
+ unsigned getRegisterBitWidth(bool Vector) const override {
if (Vector) {
if (ST->hasNEON())
return 128;
@@ -107,7 +103,7 @@ public:
return 32;
}
- unsigned getMaximumUnrollFactor() const {
+ unsigned getMaximumUnrollFactor() const override {
// These are out of order CPUs:
if (ST->isCortexA15() || ST->isSwift())
return 2;
@@ -115,23 +111,27 @@ public:
}
unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
- int Index, Type *SubTp) const;
+ int Index, Type *SubTp) const override;
unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) const;
+ Type *Src) const override;
- unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const;
+ unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const override;
- unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const;
+ unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const override;
- unsigned getAddressComputationCost(Type *Val, bool IsComplex) const;
+ unsigned getAddressComputationCost(Type *Val,
+ bool IsComplex) const override;
- unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- OperandValueKind Op1Info = OK_AnyValue,
- OperandValueKind Op2Info = OK_AnyValue) const;
+ unsigned
+ getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ OperandValueKind Op1Info = OK_AnyValue,
+ OperandValueKind Op2Info = OK_AnyValue) const override;
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace) const;
+ unsigned AddressSpace) const override;
/// @}
};
@@ -162,25 +162,25 @@ unsigned ARMTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
(ARM_AM::getSOImmVal(~ZImmVal) != -1))
return 1;
return ST->hasV6T2Ops() ? 2 : 3;
- } else if (ST->isThumb2()) {
+ }
+ if (ST->isThumb2()) {
if ((SImmVal >= 0 && SImmVal < 65536) ||
(ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
(ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
return 1;
return ST->hasV6T2Ops() ? 2 : 3;
- } else /*Thumb1*/ {
- if (SImmVal >= 0 && SImmVal < 256)
- return 1;
- if ((~ZImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
- return 2;
- // Load from constantpool.
- return 3;
}
- return 2;
+ // Thumb1.
+ if (SImmVal >= 0 && SImmVal < 256)
+ return 1;
+ if ((~ZImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
+ return 2;
+ // Load from constantpool.
+ return 3;
}
unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) const {
+ Type *Src) const {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -469,7 +469,8 @@ unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
return LT.first * NEONShuffleTbl[Idx].Cost;
}
-unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
+unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ OperandValueKind Op1Info,
OperandValueKind Op2Info) const {
int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
@@ -533,7 +534,7 @@ unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueK
// creates a sequence of shift, and, or instructions to construct values.
// These sequences are recognized by the ISel and have zero-cost. Not so for
// the vectorized code. Because we have support for v2i64 but not i64 those
- // sequences look particularily beneficial to vectorize.
+ // sequences look particularly beneficial to vectorize.
// To work around this we increase the cost of v2i64 operations to make them
// seem less beneficial.
if (LT.second == MVT::v2i64 &&
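
The getIntImmCost restructuring is control-flow only; the returned costs are unchanged. A few worked Thumb1 cases, assuming the ARM_AM helpers behave as their names suggest:

    // getIntImmCost(APInt(32, 200),        i32) == 1  // fits an 8-bit mov
    // getIntImmCost(APInt(32, 0xFFFFFF00), i32) == 2  // mvn of an 8-bit value
    // getIntImmCost(APInt(32, 0x12345678), i32) == 3  // constant-pool load
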
diff --git a/lib/Target/ARM/Android.mk b/lib/Target/ARM/Android.mk
index 86649b2..4be95aa 100644
--- a/lib/Target/ARM/Android.mk
+++ b/lib/Target/ARM/Android.mk
@@ -17,6 +17,7 @@ arm_codegen_TBLGEN_TABLES := \
arm_codegen_SRC_FILES := \
A15SDOptimizer.cpp \
ARMAsmPrinter.cpp \
+ ARMAtomicExpandPass.cpp \
ARMBaseInstrInfo.cpp \
ARMBaseRegisterInfo.cpp \
ARMCodeEmitter.cpp \
@@ -33,6 +34,7 @@ arm_codegen_SRC_FILES := \
ARMLoadStoreOptimizer.cpp \
ARMMCInstLower.cpp \
ARMMachineFunctionInfo.cpp \
+ ARMOptimizeBarriersPass.cpp \
ARMRegisterInfo.cpp \
ARMSelectionDAGInfo.cpp \
ARMSubtarget.cpp \
@@ -67,6 +69,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device only
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -81,3 +84,4 @@ include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(LLVM_GEN_INTRINSICS_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index e3f9e0d..9c57a24 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -7,33 +7,43 @@
//
//===----------------------------------------------------------------------===//
-#include "ARMBuildAttrs.h"
#include "ARMFPUName.h"
#include "ARMFeatures.h"
-#include "llvm/MC/MCTargetAsmParser.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMArchName.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMMCExpr.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCELF.h"
#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SourceMgr.h"
@@ -48,31 +58,90 @@ class ARMOperand;
enum VectorLaneTy { NoLanes, AllLanes, IndexedLane };
+class UnwindContext {
+ MCAsmParser &Parser;
+
+ typedef SmallVector<SMLoc, 4> Locs;
+
+ Locs FnStartLocs;
+ Locs CantUnwindLocs;
+ Locs PersonalityLocs;
+ Locs PersonalityIndexLocs;
+ Locs HandlerDataLocs;
+ int FPReg;
+
+public:
+ UnwindContext(MCAsmParser &P) : Parser(P), FPReg(ARM::SP) {}
+
+ bool hasFnStart() const { return !FnStartLocs.empty(); }
+ bool cantUnwind() const { return !CantUnwindLocs.empty(); }
+ bool hasHandlerData() const { return !HandlerDataLocs.empty(); }
+ bool hasPersonality() const {
+ return !(PersonalityLocs.empty() && PersonalityIndexLocs.empty());
+ }
+
+ void recordFnStart(SMLoc L) { FnStartLocs.push_back(L); }
+ void recordCantUnwind(SMLoc L) { CantUnwindLocs.push_back(L); }
+ void recordPersonality(SMLoc L) { PersonalityLocs.push_back(L); }
+ void recordHandlerData(SMLoc L) { HandlerDataLocs.push_back(L); }
+ void recordPersonalityIndex(SMLoc L) { PersonalityIndexLocs.push_back(L); }
+
+ void saveFPReg(int Reg) { FPReg = Reg; }
+ int getFPReg() const { return FPReg; }
+
+ void emitFnStartLocNotes() const {
+ for (Locs::const_iterator FI = FnStartLocs.begin(), FE = FnStartLocs.end();
+ FI != FE; ++FI)
+ Parser.Note(*FI, ".fnstart was specified here");
+ }
+ void emitCantUnwindLocNotes() const {
+ for (Locs::const_iterator UI = CantUnwindLocs.begin(),
+ UE = CantUnwindLocs.end(); UI != UE; ++UI)
+ Parser.Note(*UI, ".cantunwind was specified here");
+ }
+ void emitHandlerDataLocNotes() const {
+ for (Locs::const_iterator HI = HandlerDataLocs.begin(),
+ HE = HandlerDataLocs.end(); HI != HE; ++HI)
+ Parser.Note(*HI, ".handlerdata was specified here");
+ }
+ void emitPersonalityLocNotes() const {
+ for (Locs::const_iterator PI = PersonalityLocs.begin(),
+ PE = PersonalityLocs.end(),
+ PII = PersonalityIndexLocs.begin(),
+ PIE = PersonalityIndexLocs.end();
+ PI != PE || PII != PIE;) {
+ if (PI != PE && (PII == PIE || PI->getPointer() < PII->getPointer()))
+ Parser.Note(*PI++, ".personality was specified here");
+ else if (PII != PIE && (PI == PE || PII->getPointer() < PI->getPointer()))
+ Parser.Note(*PII++, ".personalityindex was specified here");
+ else
+ llvm_unreachable(".personality and .personalityindex cannot be "
+ "at the same location");
+ }
+ }
+
+ void reset() {
+ FnStartLocs = Locs();
+ CantUnwindLocs = Locs();
+ PersonalityLocs = Locs();
+ HandlerDataLocs = Locs();
+ PersonalityIndexLocs = Locs();
+ FPReg = ARM::SP;
+ }
+};
+
class ARMAsmParser : public MCTargetAsmParser {
MCSubtargetInfo &STI;
MCAsmParser &Parser;
const MCInstrInfo &MII;
const MCRegisterInfo *MRI;
+ UnwindContext UC;
ARMTargetStreamer &getTargetStreamer() {
- MCTargetStreamer &TS = getParser().getStreamer().getTargetStreamer();
+ MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
return static_cast<ARMTargetStreamer &>(TS);
}
- // Unwind directives state
- SMLoc FnStartLoc;
- SMLoc CantUnwindLoc;
- SMLoc PersonalityLoc;
- SMLoc HandlerDataLoc;
- int FPReg;
- void resetUnwindDirectiveParserState() {
- FnStartLoc = SMLoc();
- CantUnwindLoc = SMLoc();
- PersonalityLoc = SMLoc();
- HandlerDataLoc = SMLoc();
- FPReg = -1;
- }
-
// Map of register aliases registers via the .req directive.
StringMap<unsigned> RegisterReqs;
@@ -111,6 +180,9 @@ class ARMAsmParser : public MCTargetAsmParser {
MCAsmParser &getParser() const { return Parser; }
MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+ void Note(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges = None) {
+ return Parser.Note(L, Msg, Ranges);
+ }
bool Warning(SMLoc L, const Twine &Msg,
ArrayRef<SMRange> Ranges = None) {
return Parser.Warning(L, Msg, Ranges);
@@ -129,7 +201,7 @@ class ARMAsmParser : public MCTargetAsmParser {
bool parsePrefix(ARMMCExpr::VariantKind &RefKind);
bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType,
unsigned &ShiftAmount);
- bool parseDirectiveWord(unsigned Size, SMLoc L);
+ bool parseLiteralValues(unsigned Size, SMLoc L);
bool parseDirectiveThumb(SMLoc L);
bool parseDirectiveARM(SMLoc L);
bool parseDirectiveThumbFunc(SMLoc L);
@@ -149,6 +221,17 @@ class ARMAsmParser : public MCTargetAsmParser {
bool parseDirectiveSetFP(SMLoc L);
bool parseDirectivePad(SMLoc L);
bool parseDirectiveRegSave(SMLoc L, bool IsVector);
+ bool parseDirectiveInst(SMLoc L, char Suffix = '\0');
+ bool parseDirectiveLtorg(SMLoc L);
+ bool parseDirectiveEven(SMLoc L);
+ bool parseDirectivePersonalityIndex(SMLoc L);
+ bool parseDirectiveUnwindRaw(SMLoc L);
+ bool parseDirectiveTLSDescSeq(SMLoc L);
+ bool parseDirectiveMovSP(SMLoc L);
+ bool parseDirectiveObjectArch(SMLoc L);
+ bool parseDirectiveArchExtension(SMLoc L);
+ bool parseDirectiveAlign(SMLoc L);
+ bool parseDirectiveThumbSet(SMLoc L);
StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode,
bool &CarrySetting, unsigned &ProcessorIMod,
@@ -241,7 +324,7 @@ class ARMAsmParser : public MCTargetAsmParser {
const SmallVectorImpl<MCParsedAsmOperand*> &);
void cvtThumbBranches(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
-
+
bool validateInstruction(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
bool processInstruction(MCInst &Inst,
@@ -263,7 +346,7 @@ public:
ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
const MCInstrInfo &MII)
- : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(MII), FPReg(-1) {
+ : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(MII), UC(_Parser) {
MCAsmParserExtension::Initialize(_Parser);
// Cache the MCRegisterInfo.
@@ -279,21 +362,22 @@ public:
}
// Implementation of the MCTargetAsmParser interface:
- bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
- bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
- SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands);
- bool ParseDirective(AsmToken DirectiveID);
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ bool
+ ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands) override;
+ bool ParseDirective(AsmToken DirectiveID) override;
- unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, unsigned Kind);
- unsigned checkTargetMatchPredicate(MCInst &Inst);
+ unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
+ unsigned Kind) override;
+ unsigned checkTargetMatchPredicate(MCInst &Inst) override;
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm);
- void onLabelParsed(MCSymbol *Symbol);
-
+ bool MatchingInlineAsm) override;
+ void onLabelParsed(MCSymbol *Symbol) override;
};
} // end anonymous namespace
@@ -542,9 +626,9 @@ public:
}
/// getStartLoc - Get the location of the first token of this operand.
- SMLoc getStartLoc() const { return StartLoc; }
+ SMLoc getStartLoc() const override { return StartLoc; }
/// getEndLoc - Get the location of the last token of this operand.
- SMLoc getEndLoc() const { return EndLoc; }
+ SMLoc getEndLoc() const override { return EndLoc; }
/// getLocRange - Get the range between the first and last token of this
/// operand.
SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
@@ -564,7 +648,7 @@ public:
return StringRef(Tok.Data, Tok.Length);
}
- unsigned getReg() const {
+ unsigned getReg() const override {
assert((Kind == k_Register || Kind == k_CCOut) && "Invalid access!");
return Reg.RegNum;
}
@@ -612,7 +696,7 @@ public:
bool isCCOut() const { return Kind == k_CCOut; }
bool isITMask() const { return Kind == k_ITCondMask; }
bool isITCondCode() const { return Kind == k_CondCode; }
- bool isImm() const { return Kind == k_Immediate; }
+ bool isImm() const override { return Kind == k_Immediate; }
// checks whether this operand is an unsigned offset which fits is a field
// of specified width and scaled by a specific number of bits
template<unsigned width, unsigned scale>
@@ -988,14 +1072,14 @@ public:
int64_t Value = CE->getValue();
return Value == 1 || Value == 0;
}
- bool isReg() const { return Kind == k_Register; }
+ bool isReg() const override { return Kind == k_Register; }
bool isRegList() const { return Kind == k_RegisterList; }
bool isDPRRegList() const { return Kind == k_DPRRegisterList; }
bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
- bool isToken() const { return Kind == k_Token; }
+ bool isToken() const override { return Kind == k_Token; }
bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
bool isInstSyncBarrierOpt() const { return Kind == k_InstSyncBarrierOpt; }
- bool isMem() const { return Kind == k_Memory; }
+ bool isMem() const override { return Kind == k_Memory; }
bool isShifterImm() const { return Kind == k_ShifterImmediate; }
bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; }
@@ -1282,6 +1366,7 @@ public:
}
bool isVecListDPairSpaced() const {
+ if (Kind != k_VectorList) return false;
if (isSingleSpacedVectorList()) return false;
return (ARMMCRegisterClasses[ARM::DPairSpcRegClassID]
.contains(VectorList.RegNum));
@@ -1580,7 +1665,7 @@ public:
void addRegShiftedRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 3 && "Invalid number of operands!");
assert(isRegShiftedReg() &&
- "addRegShiftedRegOperands() on non RegShiftedReg!");
+ "addRegShiftedRegOperands() on non-RegShiftedReg!");
Inst.addOperand(MCOperand::CreateReg(RegShiftedReg.SrcReg));
Inst.addOperand(MCOperand::CreateReg(RegShiftedReg.ShiftReg));
Inst.addOperand(MCOperand::CreateImm(
@@ -1590,7 +1675,7 @@ public:
void addRegShiftedImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
assert(isRegShiftedImm() &&
- "addRegShiftedImmOperands() on non RegShiftedImm!");
+ "addRegShiftedImmOperands() on non-RegShiftedImm!");
Inst.addOperand(MCOperand::CreateReg(RegShiftedImm.SrcReg));
// Shift of #32 is encoded as 0 where permitted
unsigned Imm = (RegShiftedImm.ShiftImm == 32 ? 0 : RegShiftedImm.ShiftImm);
@@ -2230,7 +2315,7 @@ public:
Inst.addOperand(MCOperand::CreateImm(Imm | 0x1e00));
}
- virtual void print(raw_ostream &OS) const;
+ void print(raw_ostream &OS) const override;
static ARMOperand *CreateITMask(unsigned Mask, SMLoc S) {
ARMOperand *Op = new ARMOperand(k_ITCondMask);
@@ -2700,7 +2785,8 @@ int ARMAsmParser::tryParseShiftRegister(
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
- assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+ if (Tok.isNot(AsmToken::Identifier))
+ return -1;
std::string lowerCase = Tok.getString().lower();
ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase)
@@ -3518,7 +3604,7 @@ parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
Error(Loc, "illegal expression");
return MatchOperand_ParseFail;
}
-
+
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(MemBarrierID);
if (!CE) {
Error(Loc, "constant expression expected");
@@ -4117,7 +4203,7 @@ parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
isAdd = false;
haveEaten = true;
}
-
+
Tok = Parser.getTok();
int Reg = tryParseRegister();
if (Reg == -1) {
@@ -4190,7 +4276,7 @@ cvtThumbBranches(MCInst &Inst,
break;
}
}
-
+
// now decide on encoding size based on branch target range
switch(Inst.getOpcode()) {
// classify tB as either t2B or t1B based on range of immediate operand
@@ -4491,8 +4577,12 @@ parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// for these:
// vmov.i{8|16|32|64} <dreg|qreg>, #imm
ARMOperand *TyOp = static_cast<ARMOperand*>(Operands[2]);
- if (!TyOp->isToken() || (TyOp->getToken() != ".f32" &&
- TyOp->getToken() != ".f64"))
+ bool isVmovf = TyOp->isToken() && (TyOp->getToken() == ".f32" ||
+ TyOp->getToken() == ".f64");
+ ARMOperand *Mnemonic = static_cast<ARMOperand*>(Operands[0]);
+ bool isFconst = Mnemonic->isToken() && (Mnemonic->getToken() == "fconstd" ||
+ Mnemonic->getToken() == "fconsts");
+ if (!(isVmovf || isFconst))
return MatchOperand_NoMatch;
Parser.Lex(); // Eat '#' or '$'.
@@ -4505,7 +4595,7 @@ parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
}
const AsmToken &Tok = Parser.getTok();
SMLoc Loc = Tok.getLoc();
- if (Tok.is(AsmToken::Real)) {
+ if (Tok.is(AsmToken::Real) && isVmovf) {
APFloat RealVal(APFloat::IEEEsingle, Tok.getString());
uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
// If we had a '-' in front, toggle the sign bit.
@@ -4518,15 +4608,16 @@ parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
}
// Also handle plain integers. Instructions which allow floating point
// immediates also allow a raw encoded 8-bit value.
- if (Tok.is(AsmToken::Integer)) {
+ if (Tok.is(AsmToken::Integer) && isFconst) {
int64_t Val = Tok.getIntVal();
Parser.Lex(); // Eat the token.
if (Val > 255 || Val < 0) {
Error(Loc, "encoded floating point value out of range");
return MatchOperand_ParseFail;
}
- double RealVal = ARM_AM::getFPImmFloat(Val);
- Val = APFloat(APFloat::IEEEdouble, RealVal).bitcastToAPInt().getZExtValue();
+ float RealVal = ARM_AM::getFPImmFloat(Val);
+ Val = APFloat(RealVal).bitcastToAPInt().getZExtValue();
+
Operands.push_back(ARMOperand::CreateImm(
MCConstantExpr::Create(Val, getContext()), S,
Parser.getTok().getLoc()));
@@ -4623,7 +4714,7 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E));
// There can be a trailing '!' on operands that we want as a separate
- // '!' Token operand. Handle that here. For example, the compatibilty
+ // '!' Token operand. Handle that here. For example, the compatibility
// alias for 'srsdb sp!, #imm' is 'srsdb #imm!'.
if (Parser.getTok().is(AsmToken::Exclaim)) {
Operands.push_back(ARMOperand::CreateToken(Parser.getTok().getString(),
@@ -4653,6 +4744,20 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E));
return false;
}
+ case AsmToken::Equal: {
+ if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
+ return Error(Parser.getTok().getLoc(), "unexpected token in operand");
+
+ Parser.Lex(); // Eat '='
+ const MCExpr *SubExprVal;
+ if (getParser().parseExpression(SubExprVal))
+ return true;
+ E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ const MCExpr *CPLoc = getTargetStreamer().addConstantPoolEntry(SubExprVal);
+ Operands.push_back(ARMOperand::CreateImm(CPLoc, S, E));
+ return false;
+ }
}
}
@@ -4661,6 +4766,10 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
RefKind = ARMMCExpr::VK_ARM_None;
+ // consume an optional '#' (GNU compatibility)
+ if (getLexer().is(AsmToken::Hash))
+ Parser.Lex();
+
// :lower16: and :upper16: modifiers
assert(getLexer().is(AsmToken::Colon) && "expected a :");
Parser.Lex(); // Eat ':'
@@ -4763,7 +4872,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" ||
Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
- Mnemonic == "vfms" || Mnemonic == "vfnms" ||
+ Mnemonic == "vfms" || Mnemonic == "vfnms" || Mnemonic == "fconsts" ||
(Mnemonic == "movs" && isThumb()))) {
Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
CarrySetting = true;
@@ -5009,12 +5118,40 @@ static bool doesIgnoreDataTypeSuffix(StringRef Mnemonic, StringRef DT) {
}
static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features,
unsigned VariantID);
+
+static bool RequiresVFPRegListValidation(StringRef Inst,
+ bool &AcceptSinglePrecisionOnly,
+ bool &AcceptDoublePrecisionOnly) {
+ if (Inst.size() < 7)
+ return false;
+
+ if (Inst.startswith("fldm") || Inst.startswith("fstm")) {
+ StringRef AddressingMode = Inst.substr(4, 2);
+ if (AddressingMode == "ia" || AddressingMode == "db" ||
+ AddressingMode == "ea" || AddressingMode == "fd") {
+ AcceptSinglePrecisionOnly = Inst[6] == 's';
+ AcceptDoublePrecisionOnly = Inst[6] == 'd' || Inst[6] == 'x';
+ return true;
+ }
+ }
+
+ return false;
+}
+
/// Parse an arm instruction mnemonic followed by its operands.
bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // FIXME: Can this be done via tablegen in some fashion?
+ bool RequireVFPRegisterListCheck;
+ bool AcceptSinglePrecisionOnly;
+ bool AcceptDoublePrecisionOnly;
+ RequireVFPRegisterListCheck =
+ RequiresVFPRegListValidation(Name, AcceptSinglePrecisionOnly,
+ AcceptDoublePrecisionOnly);
+
// Apply mnemonic aliases before doing anything else, as the destination
- // mnemnonic may include suffices and we want to handle them normally.
+ // mnemonic may include suffices and we want to handle them normally.
// The generic tblgen'erated code does this later, at the start of
// MatchInstructionImpl(), but that's too late for aliases that include
// any sort of suffix.
@@ -5141,6 +5278,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// For for ARM mode generate an error if the .n qualifier is used.
if (ExtraToken == ".n" && !isThumb()) {
SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+ Parser.eatToEndOfStatement();
return Error(Loc, "instruction with .n (narrow) qualifier not allowed in "
"arm mode");
}
@@ -5181,6 +5319,16 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Parser.Lex(); // Consume the EndOfStatement
+ if (RequireVFPRegisterListCheck) {
+ ARMOperand *Op = static_cast<ARMOperand*>(Operands.back());
+ if (AcceptSinglePrecisionOnly && !Op->isSPRRegList())
+ return Error(Op->getStartLoc(),
+ "VFP/Neon single precision register expected");
+ if (AcceptDoublePrecisionOnly && !Op->isDPRRegList())
+ return Error(Op->getStartLoc(),
+ "VFP/Neon double precision register expected");
+ }
+
// Some instructions, mostly Thumb, have forms for the same mnemonic that
// do and don't have a cc_out optional-def operand. With some spot-checks
// of the operand list, we can figure out which variant we're trying to
@@ -5255,6 +5403,24 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
}
+ // GNU Assembler extension (compatibility)
+ if ((Mnemonic == "ldrd" || Mnemonic == "strd") && !isThumb() &&
+ Operands.size() == 4) {
+ ARMOperand *Op = static_cast<ARMOperand *>(Operands[2]);
+ assert(Op->isReg() && "expected register argument");
+
+ unsigned SuperReg = MRI->getMatchingSuperReg(
+ Op->getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID));
+
+ assert(SuperReg && "expected register pair");
+
+ unsigned PairedReg = MRI->getSubReg(SuperReg, ARM::gsub_1);
+
+ Operands.insert(Operands.begin() + 3,
+ ARMOperand::CreateReg(PairedReg, Op->getStartLoc(),
+ Op->getEndLoc()));
+ }
+
// FIXME: As said above, this is all a pretty gross hack. This instruction
// does not fit with other "subs" and tblgen.
// Adjust operands of B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction
@@ -5496,7 +5662,6 @@ validateInstruction(MCInst &Inst,
case ARM::sysSTMIB_UPD:
return Error(Operands[2]->getStartLoc(),
"system STM cannot have writeback register");
- break;
case ARM::tMUL: {
// The second source operand must be the same register as the destination
// operand.
@@ -5789,7 +5954,7 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
case ARM::VLD4LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD4LNd8_UPD;
case ARM::VLD4LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4LNd16_UPD;
case ARM::VLD4LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4LNd32_UPD;
- case ARM::VLD4LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4LNq16_UPD;
+ case ARM::VLD4LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD4LNq16_UPD;
case ARM::VLD4LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4LNq32_UPD;
case ARM::VLD4LNdWB_register_Asm_8: Spacing = 1; return ARM::VLD4LNd8_UPD;
case ARM::VLD4LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD4LNd16_UPD;
@@ -5848,6 +6013,42 @@ bool ARMAsmParser::
processInstruction(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
switch (Inst.getOpcode()) {
+ // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
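+ // For example (illustrative): "ldrt r0, [r1]" with no offset written is
+ // rewritten here into the post-indexed immediate form with a zero offset.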
+ case ARM::LDRT_POST:
+ case ARM::LDRBT_POST: {
+ const unsigned Opcode =
+ (Inst.getOpcode() == ARM::LDRT_POST) ? ARM::LDRT_POST_IMM
+ : ARM::LDRBT_POST_IMM;
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(MCOperand::CreateReg(0));
+ TmpInst.addOperand(MCOperand::CreateImm(0));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ return true;
+ }
+ // Alias for alternate form of 'str{,b}t Rt, [Rn], #imm' instruction.
+ case ARM::STRT_POST:
+ case ARM::STRBT_POST: {
+ const unsigned Opcode =
+ (Inst.getOpcode() == ARM::STRT_POST) ? ARM::STRT_POST_IMM
+ : ARM::STRBT_POST_IMM;
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode);
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(MCOperand::CreateReg(0));
+ TmpInst.addOperand(MCOperand::CreateImm(0));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ return true;
+ }
// Alias for alternate form of 'ADR Rd, #imm' instruction.
case ARM::ADDri: {
if (Inst.getOperand(1).getReg() != ARM::PC ||
@@ -7659,6 +7860,10 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_Success;
}
+template<> inline bool IsCPSRDead<MCInst>(MCInst* Instr) {
+ return true; // In an assembly source, no need to second-guess
+}
+
static const char *getSubtargetFeatureName(unsigned Val);
bool ARMAsmParser::
MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -7694,7 +7899,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// Only after the instruction is fully processed can we validate it
if (wasInITBlock && hasV8Ops() && isThumb() &&
- !isV8EligibleForIT(&Inst, 2)) {
+ !isV8EligibleForIT(&Inst)) {
Warning(IDLoc, "deprecated instruction in IT block");
}
}
@@ -7710,7 +7915,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return false;
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst);
+ Out.EmitInstruction(Inst, STI);
return false;
case Match_MissingFeature: {
assert(ErrorInfo && "Unknown missing feature!");
@@ -7769,7 +7974,9 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getIdentifier();
if (IDVal == ".word")
- return parseDirectiveWord(4, DirectiveID.getLoc());
+ return parseLiteralValues(4, DirectiveID.getLoc());
+ else if (IDVal == ".short" || IDVal == ".hword")
+ return parseLiteralValues(2, DirectiveID.getLoc());
else if (IDVal == ".thumb")
return parseDirectiveThumb(DirectiveID.getLoc());
else if (IDVal == ".arm")
@@ -7808,17 +8015,47 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
return parseDirectiveRegSave(DirectiveID.getLoc(), false);
else if (IDVal == ".vsave")
return parseDirectiveRegSave(DirectiveID.getLoc(), true);
+ else if (IDVal == ".inst")
+ return parseDirectiveInst(DirectiveID.getLoc());
+ else if (IDVal == ".inst.n")
+ return parseDirectiveInst(DirectiveID.getLoc(), 'n');
+ else if (IDVal == ".inst.w")
+ return parseDirectiveInst(DirectiveID.getLoc(), 'w');
+ else if (IDVal == ".ltorg" || IDVal == ".pool")
+ return parseDirectiveLtorg(DirectiveID.getLoc());
+ else if (IDVal == ".even")
+ return parseDirectiveEven(DirectiveID.getLoc());
+ else if (IDVal == ".personalityindex")
+ return parseDirectivePersonalityIndex(DirectiveID.getLoc());
+ else if (IDVal == ".unwind_raw")
+ return parseDirectiveUnwindRaw(DirectiveID.getLoc());
+ else if (IDVal == ".tlsdescseq")
+ return parseDirectiveTLSDescSeq(DirectiveID.getLoc());
+ else if (IDVal == ".movsp")
+ return parseDirectiveMovSP(DirectiveID.getLoc());
+ else if (IDVal == ".object_arch")
+ return parseDirectiveObjectArch(DirectiveID.getLoc());
+ else if (IDVal == ".arch_extension")
+ return parseDirectiveArchExtension(DirectiveID.getLoc());
+ else if (IDVal == ".align")
+ return parseDirectiveAlign(DirectiveID.getLoc());
+ else if (IDVal == ".thumb_set")
+ return parseDirectiveThumbSet(DirectiveID.getLoc());
return true;
}
-/// parseDirectiveWord
-/// ::= .word [ expression (, expression)* ]
-bool ARMAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
+/// parseLiteralValues
+/// ::= .hword expression [, expression]*
+/// ::= .short expression [, expression]*
+/// ::= .word expression [, expression]*
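+/// e.g. ".short 0x1234, 0x5678" emits two 2-byte values and ".word 0xdeadbeef"
+/// emits a single 4-byte value (illustrative).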
+bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
const MCExpr *Value;
- if (getParser().parseExpression(Value))
- return true;
+ if (getParser().parseExpression(Value)) {
+ Parser.eatToEndOfStatement();
+ return false;
+ }
getParser().getStreamer().EmitValue(Value, Size);
@@ -7826,8 +8063,10 @@ bool ARMAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
break;
// FIXME: Improve diagnostic.
- if (getLexer().isNot(AsmToken::Comma))
- return Error(L, "unexpected token in directive");
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(L, "unexpected token in directive");
+ return false;
+ }
Parser.Lex();
}
}
@@ -7839,15 +8078,20 @@ bool ARMAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
/// parseDirectiveThumb
/// ::= .thumb
bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return Error(L, "unexpected token in directive");
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ Error(L, "unexpected token in directive");
+ return false;
+ }
Parser.Lex();
- if (!hasThumb())
- return Error(L, "target does not support Thumb mode");
+ if (!hasThumb()) {
+ Error(L, "target does not support Thumb mode");
+ return false;
+ }
if (!isThumb())
SwitchMode();
+
getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
return false;
}
@@ -7855,15 +8099,20 @@ bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
/// parseDirectiveARM
/// ::= .arm
bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return Error(L, "unexpected token in directive");
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ Error(L, "unexpected token in directive");
+ return false;
+ }
Parser.Lex();
- if (!hasARM())
- return Error(L, "target does not support ARM mode");
+ if (!hasARM()) {
+ Error(L, "target does not support ARM mode");
+ return false;
+ }
if (isThumb())
SwitchMode();
+
getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
return false;
}
@@ -7872,6 +8121,32 @@ void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
if (NextSymbolIsThumb) {
getParser().getStreamer().EmitThumbFunc(Symbol);
NextSymbolIsThumb = false;
+ return;
+ }
+
+ if (!isThumb())
+ return;
+
+ const MCObjectFileInfo::Environment Format =
+ getContext().getObjectFileInfo()->getObjectFileType();
+ switch (Format) {
+ case MCObjectFileInfo::IsCOFF: {
+ const MCSymbolData &SD =
+ getParser().getStreamer().getOrCreateSymbolData(Symbol);
+ char Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT;
+ if (SD.getFlags() & (Type << COFF::SF_TypeShift))
+ getParser().getStreamer().EmitThumbFunc(Symbol);
+ break;
+ }
+ case MCObjectFileInfo::IsELF: {
+ const MCSymbolData &SD =
+ getParser().getStreamer().getOrCreateSymbolData(Symbol);
+ if (MCELF::GetType(SD) & (ELF::STT_FUNC << ELF_STT_Shift))
+ getParser().getStreamer().EmitThumbFunc(Symbol);
+ break;
+ }
+ case MCObjectFileInfo::IsMachO:
+ break;
}
}
@@ -7886,8 +8161,11 @@ bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
if (isMachO) {
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::EndOfStatement)) {
- if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
- return Error(L, "unexpected token in .thumb_func directive");
+ if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String)) {
+ Error(L, "unexpected token in .thumb_func directive");
+ return false;
+ }
+
MCSymbol *Func =
getParser().getContext().GetOrCreateSymbol(Tok.getIdentifier());
getParser().getStreamer().EmitThumbFunc(Func);
@@ -7896,11 +8174,12 @@ bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
}
}
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return Error(L, "unexpected token in directive");
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ Error(L, "unexpected token in directive");
+ return false;
+ }
NextSymbolIsThumb = true;
-
return false;
}
@@ -7908,18 +8187,26 @@ bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
/// ::= .syntax unified | divided
bool ARMAsmParser::parseDirectiveSyntax(SMLoc L) {
const AsmToken &Tok = Parser.getTok();
- if (Tok.isNot(AsmToken::Identifier))
- return Error(L, "unexpected token in .syntax directive");
+ if (Tok.isNot(AsmToken::Identifier)) {
+ Error(L, "unexpected token in .syntax directive");
+ return false;
+ }
+
StringRef Mode = Tok.getString();
- if (Mode == "unified" || Mode == "UNIFIED")
+ if (Mode == "unified" || Mode == "UNIFIED") {
Parser.Lex();
- else if (Mode == "divided" || Mode == "DIVIDED")
- return Error(L, "'.syntax divided' arm asssembly not supported");
- else
- return Error(L, "unrecognized syntax mode in .syntax directive");
+ } else if (Mode == "divided" || Mode == "DIVIDED") {
+ Error(L, "'.syntax divided' arm assembly not supported");
+ return false;
+ } else {
+ Error(L, "unrecognized syntax mode in .syntax directive");
+ return false;
+ }
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return Error(Parser.getTok().getLoc(), "unexpected token in directive");
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ Error(Parser.getTok().getLoc(), "unexpected token in directive");
+ return false;
+ }
Parser.Lex();
// TODO tell the MC streamer the mode
@@ -7931,30 +8218,37 @@ bool ARMAsmParser::parseDirectiveSyntax(SMLoc L) {
/// ::= .code 16 | 32
bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
const AsmToken &Tok = Parser.getTok();
- if (Tok.isNot(AsmToken::Integer))
- return Error(L, "unexpected token in .code directive");
+ if (Tok.isNot(AsmToken::Integer)) {
+ Error(L, "unexpected token in .code directive");
+ return false;
+ }
int64_t Val = Parser.getTok().getIntVal();
- if (Val == 16)
- Parser.Lex();
- else if (Val == 32)
- Parser.Lex();
- else
- return Error(L, "invalid operand to .code directive");
+ if (Val != 16 && Val != 32) {
+ Error(L, "invalid operand to .code directive");
+ return false;
+ }
+ Parser.Lex();
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return Error(Parser.getTok().getLoc(), "unexpected token in directive");
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ Error(Parser.getTok().getLoc(), "unexpected token in directive");
+ return false;
+ }
Parser.Lex();
if (Val == 16) {
- if (!hasThumb())
- return Error(L, "target does not support Thumb mode");
+ if (!hasThumb()) {
+ Error(L, "target does not support Thumb mode");
+ return false;
+ }
if (!isThumb())
SwitchMode();
getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
} else {
- if (!hasARM())
- return Error(L, "target does not support ARM mode");
+ if (!hasARM()) {
+ Error(L, "target does not support ARM mode");
+ return false;
+ }
if (isThumb())
SwitchMode();
@@ -7972,21 +8266,23 @@ bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
SMLoc SRegLoc, ERegLoc;
if (ParseRegister(Reg, SRegLoc, ERegLoc)) {
Parser.eatToEndOfStatement();
- return Error(SRegLoc, "register name expected");
+ Error(SRegLoc, "register name expected");
+ return false;
}
// Shouldn't be anything else.
if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
Parser.eatToEndOfStatement();
- return Error(Parser.getTok().getLoc(),
- "unexpected input in .req directive.");
+ Error(Parser.getTok().getLoc(), "unexpected input in .req directive.");
+ return false;
}
Parser.Lex(); // Consume the EndOfStatement
- if (RegisterReqs.GetOrCreateValue(Name, Reg).getValue() != Reg)
- return Error(SRegLoc, "redefinition of '" + Name +
- "' does not match original.");
+ if (RegisterReqs.GetOrCreateValue(Name, Reg).getValue() != Reg) {
+ Error(SRegLoc, "redefinition of '" + Name + "' does not match original.");
+ return false;
+ }
return false;
}
@@ -7996,9 +8292,10 @@ bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) {
if (Parser.getTok().isNot(AsmToken::Identifier)) {
Parser.eatToEndOfStatement();
- return Error(L, "unexpected input in .unreq directive.");
+ Error(L, "unexpected input in .unreq directive.");
+ return false;
}
- RegisterReqs.erase(Parser.getTok().getIdentifier());
+ RegisterReqs.erase(Parser.getTok().getIdentifier().lower());
Parser.Lex(); // Eat the identifier.
return false;
}
@@ -8006,34 +8303,158 @@ bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) {
/// parseDirectiveArch
/// ::= .arch token
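+/// e.g. ".arch armv7-a" (illustrative; accepted names come from
+/// MCTargetDesc/ARMArchName.def).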
bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
- return true;
+ const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
+ bool isMachO = MAI->hasSubsectionsViaSymbols();
+ if (isMachO) {
+ Error(L, ".arch directive not valid for Mach-O");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ StringRef Arch = getParser().parseStringToEndOfStatement().trim();
+
+ unsigned ID = StringSwitch<unsigned>(Arch)
+#define ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH) \
+ .Case(NAME, ARM::ID)
+#define ARM_ARCH_ALIAS(NAME, ID) \
+ .Case(NAME, ARM::ID)
+#include "MCTargetDesc/ARMArchName.def"
+ .Default(ARM::INVALID_ARCH);
+
+ if (ID == ARM::INVALID_ARCH) {
+ Error(L, "Unknown arch name");
+ return false;
+ }
+
+ getTargetStreamer().emitArch(ID);
+ return false;
}
/// parseDirectiveEabiAttr
-/// ::= .eabi_attribute int, int
+/// ::= .eabi_attribute int, int [, "str"]
+/// ::= .eabi_attribute Tag_name, int [, "str"]
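+/// e.g. (illustrative):   .eabi_attribute Tag_CPU_name, "cortex-a9"
+///                        .eabi_attribute 20, 1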
bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
- if (Parser.getTok().isNot(AsmToken::Integer))
- return Error(L, "integer expected");
- int64_t Tag = Parser.getTok().getIntVal();
- Parser.Lex(); // eat tag integer
+ const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
+ bool isMachO = MAI->hasSubsectionsViaSymbols();
+ if (isMachO) {
+ Error(L, ".eabi_attribute directive not valid for Mach-O");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
- if (Parser.getTok().isNot(AsmToken::Comma))
- return Error(L, "comma expected");
+ int64_t Tag;
+ SMLoc TagLoc;
+ TagLoc = Parser.getTok().getLoc();
+ if (Parser.getTok().is(AsmToken::Identifier)) {
+ StringRef Name = Parser.getTok().getIdentifier();
+ Tag = ARMBuildAttrs::AttrTypeFromString(Name);
+ if (Tag == -1) {
+ Error(TagLoc, "attribute name not recognised: " + Name);
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+ Parser.Lex();
+ } else {
+ const MCExpr *AttrExpr;
+
+ TagLoc = Parser.getTok().getLoc();
+ if (Parser.parseExpression(AttrExpr)) {
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(AttrExpr);
+ if (!CE) {
+ Error(TagLoc, "expected numeric constant");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ Tag = CE->getValue();
+ }
+
+ if (Parser.getTok().isNot(AsmToken::Comma)) {
+ Error(Parser.getTok().getLoc(), "comma expected");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
Parser.Lex(); // skip comma
- L = Parser.getTok().getLoc();
- if (Parser.getTok().isNot(AsmToken::Integer))
- return Error(L, "integer expected");
- int64_t Value = Parser.getTok().getIntVal();
- Parser.Lex(); // eat value integer
+ StringRef StringValue = "";
+ bool IsStringValue = false;
+
+ int64_t IntegerValue = 0;
+ bool IsIntegerValue = false;
+
+ if (Tag == ARMBuildAttrs::CPU_raw_name || Tag == ARMBuildAttrs::CPU_name)
+ IsStringValue = true;
+ else if (Tag == ARMBuildAttrs::compatibility) {
+ IsStringValue = true;
+ IsIntegerValue = true;
+ } else if (Tag < 32 || Tag % 2 == 0)
+ IsIntegerValue = true;
+ else if (Tag % 2 == 1)
+ IsStringValue = true;
+ else
+ llvm_unreachable("invalid tag type");
- getTargetStreamer().emitAttribute(Tag, Value);
+ if (IsIntegerValue) {
+ const MCExpr *ValueExpr;
+ SMLoc ValueExprLoc = Parser.getTok().getLoc();
+ if (Parser.parseExpression(ValueExpr)) {
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ValueExpr);
+ if (!CE) {
+ Error(ValueExprLoc, "expected numeric constant");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ IntegerValue = CE->getValue();
+ }
+
+ if (Tag == ARMBuildAttrs::compatibility) {
+ if (Parser.getTok().isNot(AsmToken::Comma))
+ IsStringValue = false;
+ else
+ Parser.Lex();
+ }
+
+ if (IsStringValue) {
+ if (Parser.getTok().isNot(AsmToken::String)) {
+ Error(Parser.getTok().getLoc(), "bad string constant");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ StringValue = Parser.getTok().getStringContents();
+ Parser.Lex();
+ }
+
+ if (IsIntegerValue && IsStringValue) {
+ assert(Tag == ARMBuildAttrs::compatibility);
+ getTargetStreamer().emitIntTextAttribute(Tag, IntegerValue, StringValue);
+ } else if (IsIntegerValue)
+ getTargetStreamer().emitAttribute(Tag, IntegerValue);
+ else if (IsStringValue)
+ getTargetStreamer().emitTextAttribute(Tag, StringValue);
return false;
}
/// parseDirectiveCPU
/// ::= .cpu str
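+/// e.g. ".cpu cortex-a8" (illustrative); the name is emitted as the
+/// Tag_CPU_name build attribute.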
bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
+ const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
+ bool isMachO = MAI->hasSubsectionsViaSymbols();
+ if (isMachO) {
+ Error(L, ".cpu directive not valid for Mach-O");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
StringRef CPU = getParser().parseStringToEndOfStatement().trim();
getTargetStreamer().emitTextAttribute(ARMBuildAttrs::CPU_name, CPU);
return false;
@@ -8042,6 +8463,14 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
/// parseDirectiveFPU
/// ::= .fpu str
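+/// e.g. ".fpu vfpv3" or ".fpu neon" (illustrative; accepted names come from
+/// ARMFPUName.def).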
bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
+ const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
+ bool isMachO = MAI->hasSubsectionsViaSymbols();
+ if (isMachO) {
+ Error(L, ".fpu directive not valid for Mach-O");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
StringRef FPU = getParser().parseStringToEndOfStatement().trim();
unsigned ID = StringSwitch<unsigned>(FPU)
@@ -8049,8 +8478,10 @@ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
#include "ARMFPUName.def"
.Default(ARM::INVALID_FPU);
- if (ID == ARM::INVALID_FPU)
- return Error(L, "Unknown FPU name");
+ if (ID == ARM::INVALID_FPU) {
+ Error(L, "Unknown FPU name");
+ return false;
+ }
getTargetStreamer().emitFPU(ID);
return false;
@@ -8059,14 +8490,26 @@ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
/// parseDirectiveFnStart
/// ::= .fnstart
bool ARMAsmParser::parseDirectiveFnStart(SMLoc L) {
- if (FnStartLoc.isValid()) {
+ const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
+ bool isMachO = MAI->hasSubsectionsViaSymbols();
+ if (isMachO) {
+ Error(L, ".fnstart directive not valid for Mach-O");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ if (UC.hasFnStart()) {
Error(L, ".fnstart starts before the end of previous one");
- Error(FnStartLoc, "previous .fnstart starts here");
- return true;
+ UC.emitFnStartLocNotes();
+ return false;
}
- FnStartLoc = L;
+ // Reset the unwind directives parser state
+ UC.reset();
+
getTargetStreamer().emitFnStart();
+
+ UC.recordFnStart(L);
return false;
}
@@ -8074,31 +8517,37 @@ bool ARMAsmParser::parseDirectiveFnStart(SMLoc L) {
/// ::= .fnend
bool ARMAsmParser::parseDirectiveFnEnd(SMLoc L) {
// Check the ordering of unwind directives
- if (!FnStartLoc.isValid())
- return Error(L, ".fnstart must precede .fnend directive");
+ if (!UC.hasFnStart()) {
+ Error(L, ".fnstart must precede .fnend directive");
+ return false;
+ }
// Reset the unwind directives parser state
- resetUnwindDirectiveParserState();
getTargetStreamer().emitFnEnd();
+
+ UC.reset();
return false;
}
/// parseDirectiveCantUnwind
/// ::= .cantunwind
bool ARMAsmParser::parseDirectiveCantUnwind(SMLoc L) {
+ UC.recordCantUnwind(L);
+
// Check the ordering of unwind directives
- CantUnwindLoc = L;
- if (!FnStartLoc.isValid())
- return Error(L, ".fnstart must precede .cantunwind directive");
- if (HandlerDataLoc.isValid()) {
+ if (!UC.hasFnStart()) {
+ Error(L, ".fnstart must precede .cantunwind directive");
+ return false;
+ }
+ if (UC.hasHandlerData()) {
Error(L, ".cantunwind can't be used with .handlerdata directive");
- Error(HandlerDataLoc, ".handlerdata was specified here");
- return true;
+ UC.emitHandlerDataLocNotes();
+ return false;
}
- if (PersonalityLoc.isValid()) {
+ if (UC.hasPersonality()) {
Error(L, ".cantunwind can't be used with .personality directive");
- Error(PersonalityLoc, ".personality was specified here");
- return true;
+ UC.emitPersonalityLocNotes();
+ return false;
}
getTargetStreamer().emitCantUnwind();
@@ -8108,25 +8557,37 @@ bool ARMAsmParser::parseDirectiveCantUnwind(SMLoc L) {
/// parseDirectivePersonality
/// ::= .personality name
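+/// e.g. ".personality __gxx_personality_v0" (illustrative).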
bool ARMAsmParser::parseDirectivePersonality(SMLoc L) {
+ bool HasExistingPersonality = UC.hasPersonality();
+
+ UC.recordPersonality(L);
+
// Check the ordering of unwind directives
- PersonalityLoc = L;
- if (!FnStartLoc.isValid())
- return Error(L, ".fnstart must precede .personality directive");
- if (CantUnwindLoc.isValid()) {
+ if (!UC.hasFnStart()) {
+ Error(L, ".fnstart must precede .personality directive");
+ return false;
+ }
+ if (UC.cantUnwind()) {
Error(L, ".personality can't be used with .cantunwind directive");
- Error(CantUnwindLoc, ".cantunwind was specified here");
- return true;
+ UC.emitCantUnwindLocNotes();
+ return false;
}
- if (HandlerDataLoc.isValid()) {
+ if (UC.hasHandlerData()) {
Error(L, ".personality must precede .handlerdata directive");
- Error(HandlerDataLoc, ".handlerdata was specified here");
- return true;
+ UC.emitHandlerDataLocNotes();
+ return false;
+ }
+ if (HasExistingPersonality) {
+ Parser.eatToEndOfStatement();
+ Error(L, "multiple personality directives");
+ UC.emitPersonalityLocNotes();
+ return false;
}
// Parse the name of the personality routine
if (Parser.getTok().isNot(AsmToken::Identifier)) {
Parser.eatToEndOfStatement();
- return Error(L, "unexpected input in .personality directive.");
+ Error(L, "unexpected input in .personality directive.");
+ return false;
}
StringRef Name(Parser.getTok().getIdentifier());
Parser.Lex();
@@ -8139,14 +8600,17 @@ bool ARMAsmParser::parseDirectivePersonality(SMLoc L) {
/// parseDirectiveHandlerData
/// ::= .handlerdata
bool ARMAsmParser::parseDirectiveHandlerData(SMLoc L) {
+ UC.recordHandlerData(L);
+
// Check the ordering of unwind directives
- HandlerDataLoc = L;
- if (!FnStartLoc.isValid())
- return Error(L, ".fnstart must precede .personality directive");
- if (CantUnwindLoc.isValid()) {
+ if (!UC.hasFnStart()) {
+ Error(L, ".fnstart must precede .personality directive");
+ return false;
+ }
+ if (UC.cantUnwind()) {
Error(L, ".handlerdata can't be used with .cantunwind directive");
- Error(CantUnwindLoc, ".cantunwind was specified here");
- return true;
+ UC.emitCantUnwindLocNotes();
+ return false;
}
getTargetStreamer().emitHandlerData();
@@ -8157,34 +8621,45 @@ bool ARMAsmParser::parseDirectiveHandlerData(SMLoc L) {
/// ::= .setfp fpreg, spreg [, offset]
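+/// e.g. ".setfp fp, sp, #16" records that the frame pointer is sp + 16 at
+/// this point (illustrative).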
bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) {
// Check the ordering of unwind directives
- if (!FnStartLoc.isValid())
- return Error(L, ".fnstart must precede .setfp directive");
- if (HandlerDataLoc.isValid())
- return Error(L, ".setfp must precede .handlerdata directive");
+ if (!UC.hasFnStart()) {
+ Error(L, ".fnstart must precede .setfp directive");
+ return false;
+ }
+ if (UC.hasHandlerData()) {
+ Error(L, ".setfp must precede .handlerdata directive");
+ return false;
+ }
// Parse fpreg
- SMLoc NewFPRegLoc = Parser.getTok().getLoc();
- int NewFPReg = tryParseRegister();
- if (NewFPReg == -1)
- return Error(NewFPRegLoc, "frame pointer register expected");
+ SMLoc FPRegLoc = Parser.getTok().getLoc();
+ int FPReg = tryParseRegister();
+ if (FPReg == -1) {
+ Error(FPRegLoc, "frame pointer register expected");
+ return false;
+ }
// Consume comma
- if (!Parser.getTok().is(AsmToken::Comma))
- return Error(Parser.getTok().getLoc(), "comma expected");
+ if (Parser.getTok().isNot(AsmToken::Comma)) {
+ Error(Parser.getTok().getLoc(), "comma expected");
+ return false;
+ }
Parser.Lex(); // skip comma
// Parse spreg
- SMLoc NewSPRegLoc = Parser.getTok().getLoc();
- int NewSPReg = tryParseRegister();
- if (NewSPReg == -1)
- return Error(NewSPRegLoc, "stack pointer register expected");
+ SMLoc SPRegLoc = Parser.getTok().getLoc();
+ int SPReg = tryParseRegister();
+ if (SPReg == -1) {
+ Error(SPRegLoc, "stack pointer register expected");
+ return false;
+ }
- if (NewSPReg != ARM::SP && NewSPReg != FPReg)
- return Error(NewSPRegLoc,
- "register should be either $sp or the latest fp register");
+ if (SPReg != ARM::SP && SPReg != UC.getFPReg()) {
+ Error(SPRegLoc, "register should be either $sp or the latest fp register");
+ return false;
+ }
// Update the frame pointer register
- FPReg = NewFPReg;
+ UC.saveFPReg(FPReg);
// Parse offset
int64_t Offset = 0;
@@ -8193,24 +8668,29 @@ bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) {
if (Parser.getTok().isNot(AsmToken::Hash) &&
Parser.getTok().isNot(AsmToken::Dollar)) {
- return Error(Parser.getTok().getLoc(), "'#' expected");
+ Error(Parser.getTok().getLoc(), "'#' expected");
+ return false;
}
Parser.Lex(); // skip hash token.
const MCExpr *OffsetExpr;
SMLoc ExLoc = Parser.getTok().getLoc();
SMLoc EndLoc;
- if (getParser().parseExpression(OffsetExpr, EndLoc))
- return Error(ExLoc, "malformed setfp offset");
+ if (getParser().parseExpression(OffsetExpr, EndLoc)) {
+ Error(ExLoc, "malformed setfp offset");
+ return false;
+ }
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
- if (!CE)
- return Error(ExLoc, "setfp offset must be an immediate");
+ if (!CE) {
+ Error(ExLoc, "setfp offset must be an immediate");
+ return false;
+ }
Offset = CE->getValue();
}
- getTargetStreamer().emitSetFP(static_cast<unsigned>(NewFPReg),
- static_cast<unsigned>(NewSPReg), Offset);
+ getTargetStreamer().emitSetFP(static_cast<unsigned>(FPReg),
+ static_cast<unsigned>(SPReg), Offset);
return false;
}
@@ -8218,26 +8698,35 @@ bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) {
/// ::= .pad offset
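+/// e.g. ".pad #16" records a 16-byte stack adjustment for the unwinder
+/// (illustrative).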
bool ARMAsmParser::parseDirectivePad(SMLoc L) {
// Check the ordering of unwind directives
- if (!FnStartLoc.isValid())
- return Error(L, ".fnstart must precede .pad directive");
- if (HandlerDataLoc.isValid())
- return Error(L, ".pad must precede .handlerdata directive");
+ if (!UC.hasFnStart()) {
+ Error(L, ".fnstart must precede .pad directive");
+ return false;
+ }
+ if (UC.hasHandlerData()) {
+ Error(L, ".pad must precede .handlerdata directive");
+ return false;
+ }
// Parse the offset
if (Parser.getTok().isNot(AsmToken::Hash) &&
Parser.getTok().isNot(AsmToken::Dollar)) {
- return Error(Parser.getTok().getLoc(), "'#' expected");
+ Error(Parser.getTok().getLoc(), "'#' expected");
+ return false;
}
Parser.Lex(); // skip hash token.
const MCExpr *OffsetExpr;
SMLoc ExLoc = Parser.getTok().getLoc();
SMLoc EndLoc;
- if (getParser().parseExpression(OffsetExpr, EndLoc))
- return Error(ExLoc, "malformed pad offset");
+ if (getParser().parseExpression(OffsetExpr, EndLoc)) {
+ Error(ExLoc, "malformed pad offset");
+ return false;
+ }
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
- if (!CE)
- return Error(ExLoc, "pad offset must be an immediate");
+ if (!CE) {
+ Error(ExLoc, "pad offset must be an immediate");
+ return false;
+ }
getTargetStreamer().emitPad(CE->getValue());
return false;
@@ -8248,10 +8737,14 @@ bool ARMAsmParser::parseDirectivePad(SMLoc L) {
/// ::= .vsave { registers }
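+/// e.g. ".save {r4-r7, lr}" and ".vsave {d8-d15}" (illustrative).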
bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) {
// Check the ordering of unwind directives
- if (!FnStartLoc.isValid())
- return Error(L, ".fnstart must precede .save or .vsave directives");
- if (HandlerDataLoc.isValid())
- return Error(L, ".save or .vsave must precede .handlerdata directive");
+ if (!UC.hasFnStart()) {
+ Error(L, ".fnstart must precede .save or .vsave directives");
+ return false;
+ }
+ if (UC.hasHandlerData()) {
+ Error(L, ".save or .vsave must precede .handlerdata directive");
+ return false;
+ }
// RAII object to make sure parsed operands are deleted.
struct CleanupObject {
@@ -8264,21 +8757,509 @@ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) {
// Parse the register list
if (parseRegisterList(CO.Operands))
- return true;
+ return false;
ARMOperand *Op = (ARMOperand*)CO.Operands[0];
- if (!IsVector && !Op->isRegList())
- return Error(L, ".save expects GPR registers");
- if (IsVector && !Op->isDPRRegList())
- return Error(L, ".vsave expects DPR registers");
+ if (!IsVector && !Op->isRegList()) {
+ Error(L, ".save expects GPR registers");
+ return false;
+ }
+ if (IsVector && !Op->isDPRRegList()) {
+ Error(L, ".vsave expects DPR registers");
+ return false;
+ }
getTargetStreamer().emitRegSave(Op->getRegList(), IsVector);
return false;
}
+/// parseDirectiveInst
+/// ::= .inst opcode [, ...]
+/// ::= .inst.n opcode [, ...]
+/// ::= .inst.w opcode [, ...]
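+/// e.g. ".inst 0xe1a00000" in ARM mode, or ".inst.n 0xbf00" and
+/// ".inst.w 0xf3af8000" in Thumb mode (illustrative encodings).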
+bool ARMAsmParser::parseDirectiveInst(SMLoc Loc, char Suffix) {
+ const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
+ bool isMachO = MAI->hasSubsectionsViaSymbols();
+ if (isMachO) {
+ Error(Loc, ".inst directive not valid for Mach-O");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ int Width;
+
+ if (isThumb()) {
+ switch (Suffix) {
+ case 'n':
+ Width = 2;
+ break;
+ case 'w':
+ Width = 4;
+ break;
+ default:
+ Parser.eatToEndOfStatement();
+ Error(Loc, "cannot determine Thumb instruction size, "
+ "use inst.n/inst.w instead");
+ return false;
+ }
+ } else {
+ if (Suffix) {
+ Parser.eatToEndOfStatement();
+ Error(Loc, "width suffixes are invalid in ARM mode");
+ return false;
+ }
+ Width = 4;
+ }
+
+ if (getLexer().is(AsmToken::EndOfStatement)) {
+ Parser.eatToEndOfStatement();
+ Error(Loc, "expected expression following directive");
+ return false;
+ }
+
+ for (;;) {
+ const MCExpr *Expr;
+
+ if (getParser().parseExpression(Expr)) {
+ Error(Loc, "expected expression");
+ return false;
+ }
+
+ const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr);
+ if (!Value) {
+ Error(Loc, "expected constant expression");
+ return false;
+ }
+
+ switch (Width) {
+ case 2:
+ if (Value->getValue() > 0xffff) {
+ Error(Loc, "inst.n operand is too big, use inst.w instead");
+ return false;
+ }
+ break;
+ case 4:
+ if (Value->getValue() > 0xffffffff) {
+ Error(Loc,
+ StringRef(Suffix ? "inst.w" : "inst") + " operand is too big");
+ return false;
+ }
+ break;
+ default:
+ llvm_unreachable("only supported widths are 2 and 4");
+ }
+
+ getTargetStreamer().emitInst(Value->getValue(), Suffix);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(Loc, "unexpected token in directive");
+ return false;
+ }
+
+ Parser.Lex();
+ }
+
+ Parser.Lex();
+ return false;
+}
+
+/// parseDirectiveLtorg
+/// ::= .ltorg | .pool
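+/// e.g. a ".ltorg" placed after "ldr r0, =0x12345678" flushes the pending
+/// literal pool at that point (illustrative).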
+bool ARMAsmParser::parseDirectiveLtorg(SMLoc L) {
+ getTargetStreamer().emitCurrentConstantPool();
+ return false;
+}
+
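+/// parseDirectiveEven
+/// ::= .even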
+bool ARMAsmParser::parseDirectiveEven(SMLoc L) {
+ const MCSection *Section = getStreamer().getCurrentSection().first;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ TokError("unexpected token in directive");
+ return false;
+ }
+
+ if (!Section) {
+ getStreamer().InitSections();
+ Section = getStreamer().getCurrentSection().first;
+ }
+
+ assert(Section && "must have section to emit alignment");
+ if (Section->UseCodeAlign())
+ getStreamer().EmitCodeAlignment(2);
+ else
+ getStreamer().EmitValueToAlignment(2);
+
+ return false;
+}
+
+/// parseDirectivePersonalityIndex
+/// ::= .personalityindex index
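+/// e.g. ".personalityindex 0" selects the __aeabi_unwind_cpp_pr0 routine
+/// (illustrative).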
+bool ARMAsmParser::parseDirectivePersonalityIndex(SMLoc L) {
+ bool HasExistingPersonality = UC.hasPersonality();
+
+ UC.recordPersonalityIndex(L);
+
+ if (!UC.hasFnStart()) {
+ Parser.eatToEndOfStatement();
+ Error(L, ".fnstart must precede .personalityindex directive");
+ return false;
+ }
+ if (UC.cantUnwind()) {
+ Parser.eatToEndOfStatement();
+ Error(L, ".personalityindex cannot be used with .cantunwind");
+ UC.emitCantUnwindLocNotes();
+ return false;
+ }
+ if (UC.hasHandlerData()) {
+ Parser.eatToEndOfStatement();
+ Error(L, ".personalityindex must precede .handlerdata directive");
+ UC.emitHandlerDataLocNotes();
+ return false;
+ }
+ if (HasExistingPersonality) {
+ Parser.eatToEndOfStatement();
+ Error(L, "multiple personality directives");
+ UC.emitPersonalityLocNotes();
+ return false;
+ }
+
+ const MCExpr *IndexExpression;
+ SMLoc IndexLoc = Parser.getTok().getLoc();
+ if (Parser.parseExpression(IndexExpression)) {
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(IndexExpression);
+ if (!CE) {
+ Parser.eatToEndOfStatement();
+ Error(IndexLoc, "index must be a constant number");
+ return false;
+ }
+ if (CE->getValue() < 0 ||
+ CE->getValue() >= ARM::EHABI::NUM_PERSONALITY_INDEX) {
+ Parser.eatToEndOfStatement();
+ Error(IndexLoc, "personality routine index should be in range [0-3]");
+ return false;
+ }
+
+ getTargetStreamer().emitPersonalityIndex(CE->getValue());
+ return false;
+}
+
+/// parseDirectiveUnwindRaw
+/// ::= .unwind_raw offset, opcode [, opcode...]
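+/// e.g. ".unwind_raw 4, 0xb0" (illustrative; each opcode must fit in one byte).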
+bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) {
+ if (!UC.hasFnStart()) {
+ Parser.eatToEndOfStatement();
+ Error(L, ".fnstart must precede .unwind_raw directives");
+ return false;
+ }
+
+ int64_t StackOffset;
+
+ const MCExpr *OffsetExpr;
+ SMLoc OffsetLoc = getLexer().getLoc();
+ if (getLexer().is(AsmToken::EndOfStatement) ||
+ getParser().parseExpression(OffsetExpr)) {
+ Error(OffsetLoc, "expected expression");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+ if (!CE) {
+ Error(OffsetLoc, "offset must be a constant");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ StackOffset = CE->getValue();
+
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(getLexer().getLoc(), "expected comma");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+ Parser.Lex();
+
+ SmallVector<uint8_t, 16> Opcodes;
+ for (;;) {
+ const MCExpr *OE;
+
+ SMLoc OpcodeLoc = getLexer().getLoc();
+ if (getLexer().is(AsmToken::EndOfStatement) || Parser.parseExpression(OE)) {
+ Error(OpcodeLoc, "expected opcode expression");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ const MCConstantExpr *OC = dyn_cast<MCConstantExpr>(OE);
+ if (!OC) {
+ Error(OpcodeLoc, "opcode value must be a constant");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ const int64_t Opcode = OC->getValue();
+ if (Opcode & ~0xff) {
+ Error(OpcodeLoc, "invalid opcode");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ Opcodes.push_back(uint8_t(Opcode));
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(getLexer().getLoc(), "unexpected token in directive");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ Parser.Lex();
+ }
+
+ getTargetStreamer().emitUnwindRaw(StackOffset, Opcodes);
+
+ Parser.Lex();
+ return false;
+}
+
+/// parseDirectiveTLSDescSeq
+/// ::= .tlsdescseq tls-variable
+bool ARMAsmParser::parseDirectiveTLSDescSeq(SMLoc L) {
+ const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
+ bool isMachO = MAI->hasSubsectionsViaSymbols();
+ if (isMachO) {
+ Error(L, ".tlsdescseq directive not valid for Mach-O");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::Identifier)) {
+ TokError("expected variable after '.tlsdescseq' directive");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ const MCSymbolRefExpr *SRE =
+ MCSymbolRefExpr::Create(Parser.getTok().getIdentifier(),
+ MCSymbolRefExpr::VK_ARM_TLSDESCSEQ, getContext());
+ Lex();
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ Error(Parser.getTok().getLoc(), "unexpected token");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ getTargetStreamer().AnnotateTLSDescriptorSequence(SRE);
+ return false;
+}
+
+/// parseDirectiveMovSP
+/// ::= .movsp reg [, #offset]
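+/// e.g. ".movsp r7" tells the unwinder that the stack pointer is now kept in
+/// r7 (illustrative).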
+bool ARMAsmParser::parseDirectiveMovSP(SMLoc L) {
+ if (!UC.hasFnStart()) {
+ Parser.eatToEndOfStatement();
+ Error(L, ".fnstart must precede .movsp directives");
+ return false;
+ }
+ if (UC.getFPReg() != ARM::SP) {
+ Parser.eatToEndOfStatement();
+ Error(L, "unexpected .movsp directive");
+ return false;
+ }
+
+ SMLoc SPRegLoc = Parser.getTok().getLoc();
+ int SPReg = tryParseRegister();
+ if (SPReg == -1) {
+ Parser.eatToEndOfStatement();
+ Error(SPRegLoc, "register expected");
+ return false;
+ }
+
+ if (SPReg == ARM::SP || SPReg == ARM::PC) {
+ Parser.eatToEndOfStatement();
+ Error(SPRegLoc, "sp and pc are not permitted in .movsp directive");
+ return false;
+ }
+
+ int64_t Offset = 0;
+ if (Parser.getTok().is(AsmToken::Comma)) {
+ Parser.Lex();
+
+ if (Parser.getTok().isNot(AsmToken::Hash)) {
+ Error(Parser.getTok().getLoc(), "expected #constant");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+ Parser.Lex();
+
+ const MCExpr *OffsetExpr;
+ SMLoc OffsetLoc = Parser.getTok().getLoc();
+ if (Parser.parseExpression(OffsetExpr)) {
+ Parser.eatToEndOfStatement();
+ Error(OffsetLoc, "malformed offset expression");
+ return false;
+ }
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+ if (!CE) {
+ Parser.eatToEndOfStatement();
+ Error(OffsetLoc, "offset must be an immediate constant");
+ return false;
+ }
+
+ Offset = CE->getValue();
+ }
+
+ getTargetStreamer().emitMovSP(SPReg, Offset);
+ UC.saveFPReg(SPReg);
+
+ return false;
+}
+
+/// parseDirectiveObjectArch
+/// ::= .object_arch name
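+/// e.g. ".object_arch armv4" overrides the architecture recorded in the
+/// object's build attributes (illustrative).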
+bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) {
+ const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
+ bool isMachO = MAI->hasSubsectionsViaSymbols();
+ if (isMachO) {
+ Error(L, ".object_arch directive not valid for Mach-O");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::Identifier)) {
+ Error(getLexer().getLoc(), "unexpected token");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ StringRef Arch = Parser.getTok().getString();
+ SMLoc ArchLoc = Parser.getTok().getLoc();
+ getLexer().Lex();
+
+ unsigned ID = StringSwitch<unsigned>(Arch)
+#define ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH) \
+ .Case(NAME, ARM::ID)
+#define ARM_ARCH_ALIAS(NAME, ID) \
+ .Case(NAME, ARM::ID)
+#include "MCTargetDesc/ARMArchName.def"
+#undef ARM_ARCH_NAME
+#undef ARM_ARCH_ALIAS
+ .Default(ARM::INVALID_ARCH);
+
+ if (ID == ARM::INVALID_ARCH) {
+ Error(ArchLoc, "unknown architecture '" + Arch + "'");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ getTargetStreamer().emitObjectArch(ID);
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ Error(getLexer().getLoc(), "unexpected token");
+ Parser.eatToEndOfStatement();
+ }
+
+ return false;
+}
+
+/// parseDirectiveAlign
+/// ::= .align
+bool ARMAsmParser::parseDirectiveAlign(SMLoc L) {
+ // NOTE: if this is not the end of the statement, fall back to the
+ // target-agnostic handling for this directive, which handles it correctly.
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return true;
+
+ // '.align' is handled here in a target-specific way to mean 2**2-byte alignment.
+ if (getStreamer().getCurrentSection().first->UseCodeAlign())
+ getStreamer().EmitCodeAlignment(4, 0);
+ else
+ getStreamer().EmitValueToAlignment(4, 0, 1, 0);
+
+ return false;
+}
+
+/// parseDirectiveThumbSet
+/// ::= .thumb_set name, value
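+/// e.g. ".thumb_set alias_fn, impl_fn" defines alias_fn as impl_fn and marks
+/// it as a Thumb function (illustrative; alias_fn and impl_fn are placeholders).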
+bool ARMAsmParser::parseDirectiveThumbSet(SMLoc L) {
+ StringRef Name;
+ if (Parser.parseIdentifier(Name)) {
+ TokError("expected identifier after '.thumb_set'");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::Comma)) {
+ TokError("expected comma after name '" + Name + "'");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+ Lex();
+
+ const MCExpr *Value;
+ if (Parser.parseExpression(Value)) {
+ TokError("missing expression");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ TokError("unexpected token");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+ Lex();
+
+ MCSymbol *Alias = getContext().GetOrCreateSymbol(Name);
+ if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Value)) {
+ MCSymbol *Sym = getContext().LookupSymbol(SRE->getSymbol().getName());
+ if (!Sym->isDefined()) {
+ getStreamer().EmitSymbolAttribute(Sym, MCSA_Global);
+ getStreamer().EmitAssignment(Alias, Value);
+ return false;
+ }
+
+ const MCObjectFileInfo::Environment Format =
+ getContext().getObjectFileInfo()->getObjectFileType();
+ switch (Format) {
+ case MCObjectFileInfo::IsCOFF: {
+ char Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT;
+ getStreamer().EmitCOFFSymbolType(Type);
+ // .set values are always local in COFF
+ getStreamer().EmitSymbolAttribute(Alias, MCSA_Local);
+ break;
+ }
+ case MCObjectFileInfo::IsELF:
+ getStreamer().EmitSymbolAttribute(Alias, MCSA_ELF_TypeFunction);
+ break;
+ case MCObjectFileInfo::IsMachO:
+ break;
+ }
+ }
+
+ // FIXME: set the function as being a thumb function via the assembler
+ getStreamer().EmitThumbFunc(Alias);
+ getStreamer().EmitAssignment(Alias, Value);
+
+ return false;
+}
+
/// Force static initialization.
extern "C" void LLVMInitializeARMAsmParser() {
- RegisterMCAsmParser<ARMAsmParser> X(TheARMTarget);
- RegisterMCAsmParser<ARMAsmParser> Y(TheThumbTarget);
+ RegisterMCAsmParser<ARMAsmParser> X(TheARMLETarget);
+ RegisterMCAsmParser<ARMAsmParser> Y(TheARMBETarget);
+ RegisterMCAsmParser<ARMAsmParser> A(TheThumbLETarget);
+ RegisterMCAsmParser<ARMAsmParser> B(TheThumbBETarget);
}
#define GET_REGISTER_MATCHER
@@ -8286,6 +9267,82 @@ extern "C" void LLVMInitializeARMAsmParser() {
#define GET_MATCHER_IMPLEMENTATION
#include "ARMGenAsmMatcher.inc"
+static const struct ExtMapEntry {
+ const char *Extension;
+ const unsigned ArchCheck;
+ const uint64_t Features;
+} Extensions[] = {
+ { "crc", Feature_HasV8, ARM::FeatureCRC },
+ { "crypto", Feature_HasV8,
+ ARM::FeatureCrypto | ARM::FeatureNEON | ARM::FeatureFPARMv8 },
+ { "fp", Feature_HasV8, ARM::FeatureFPARMv8 },
+ { "idiv", Feature_HasV7 | Feature_IsNotMClass,
+ ARM::FeatureHWDiv | ARM::FeatureHWDivARM },
+ // FIXME: iWMMXT not supported
+ { "iwmmxt", Feature_None, 0 },
+ // FIXME: iWMMXT2 not supported
+ { "iwmmxt2", Feature_None, 0 },
+ // FIXME: Maverick not supported
+ { "maverick", Feature_None, 0 },
+ { "mp", Feature_HasV7 | Feature_IsNotMClass, ARM::FeatureMP },
+ // FIXME: ARMv6-m OS Extensions feature not checked
+ { "os", Feature_None, 0 },
+ // FIXME: Also available in ARMv6-K
+ { "sec", Feature_HasV7, ARM::FeatureTrustZone },
+ { "simd", Feature_HasV8, ARM::FeatureNEON | ARM::FeatureFPARMv8 },
+ // FIXME: Only available in A-class, isel not predicated
+ { "virt", Feature_HasV7, ARM::FeatureVirtualization },
+ // FIXME: xscale not supported
+ { "xscale", Feature_None, 0 },
+};
+
+/// parseDirectiveArchExtension
+/// ::= .arch_extension [no]feature
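+/// e.g. ".arch_extension crc" or ".arch_extension nocrc" (illustrative; see
+/// the Extensions table above for accepted names).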
+bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
+ if (getLexer().isNot(AsmToken::Identifier)) {
+ Error(getLexer().getLoc(), "unexpected token");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ StringRef Extension = Parser.getTok().getString();
+ SMLoc ExtLoc = Parser.getTok().getLoc();
+ getLexer().Lex();
+
+ bool EnableFeature = true;
+ if (Extension.startswith_lower("no")) {
+ EnableFeature = false;
+ Extension = Extension.substr(2);
+ }
+
+ for (unsigned EI = 0, EE = array_lengthof(Extensions); EI != EE; ++EI) {
+ if (Extensions[EI].Extension != Extension)
+ continue;
+
+ unsigned FB = getAvailableFeatures();
+ if ((FB & Extensions[EI].ArchCheck) != Extensions[EI].ArchCheck) {
+ Error(ExtLoc, "architectural extension '" + Extension + "' is not "
+ "allowed for the current base architecture");
+ return false;
+ }
+
+ if (!Extensions[EI].Features)
+ report_fatal_error("unsupported architectural extension: " + Extension);
+
+ if (EnableFeature)
+ FB |= ComputeAvailableFeatures(Extensions[EI].Features);
+ else
+ FB &= ~ComputeAvailableFeatures(Extensions[EI].Features);
+
+ setAvailableFeatures(FB);
+ return false;
+ }
+
+ Error(ExtLoc, "unknown architectural extension: " + Extension);
+ Parser.eatToEndOfStatement();
+ return false;
+}
+
// Define this matcher function after the auto-generated include so we
// have the match class enum definitions.
unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
@@ -8294,12 +9351,29 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
// If the kind is a token for a literal immediate, check if our asm
// operand matches. This is for InstAliases which have a fixed-value
// immediate in the syntax.
- if (Kind == MCK__35_0 && Op->isImm()) {
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
- if (!CE)
- return Match_InvalidOperand;
- if (CE->getValue() == 0)
+ switch (Kind) {
+ default: break;
+ case MCK__35_0:
+ if (Op->isImm())
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()))
+ if (CE->getValue() == 0)
+ return Match_Success;
+ break;
+ case MCK_ARMSOImm:
+ if (Op->isImm()) {
+ const MCExpr *SOExpr = Op->getImm();
+ int64_t Value;
+ if (!SOExpr->EvaluateAsAbsolute(Value))
+ return Match_Success;
+ assert((Value >= INT32_MIN && Value <= INT32_MAX) &&
+ "expression value must be representable in 32 bits");
+ }
+ break;
+ case MCK_GPRPair:
+ if (Op->isReg() &&
+ MRI->getRegClass(ARM::GPRRegClassID).contains(Op->getReg()))
return Match_Success;
+ break;
}
return Match_InvalidOperand;
}
diff --git a/lib/Target/ARM/AsmParser/Android.mk b/lib/Target/ARM/AsmParser/Android.mk
index e17a274..5f88cc3 100644
--- a/lib/Target/ARM/AsmParser/Android.mk
+++ b/lib/Target/ARM/AsmParser/Android.mk
@@ -46,6 +46,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
#===---------------------------------------------------------------===
# libARMAsmParser (target)
#===---------------------------------------------------------------===
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -59,3 +60,4 @@ TBLGEN_TD_DIR := $(arm_asm_parser_TBLGEN_TD_DIR)
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/ARM/AsmParser/CMakeLists.txt b/lib/Target/ARM/AsmParser/CMakeLists.txt
index d2012c3..66ed1df 100644
--- a/lib/Target/ARM/AsmParser/CMakeLists.txt
+++ b/lib/Target/ARM/AsmParser/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMARMAsmParser
ARMAsmParser.cpp
)
-
-add_dependencies(LLVMARMAsmParser ARMCommonTableGen)
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index f271a93..8e14883 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -17,6 +17,7 @@ add_public_tablegen_target(ARMCommonTableGen)
add_llvm_target(ARMCodeGen
A15SDOptimizer.cpp
ARMAsmPrinter.cpp
+ ARMAtomicExpandPass.cpp
ARMBaseInstrInfo.cpp
ARMBaseRegisterInfo.cpp
ARMCodeEmitter.cpp
@@ -34,6 +35,7 @@ add_llvm_target(ARMCodeGen
ARMMCInstLower.cpp
ARMMachineFunctionInfo.cpp
ARMRegisterInfo.cpp
+ ARMOptimizeBarriersPass.cpp
ARMSelectionDAGInfo.cpp
ARMSubtarget.cpp
ARMTargetMachine.cpp
@@ -49,16 +51,6 @@ add_llvm_target(ARMCodeGen
Thumb2SizeReduction.cpp
)
-add_dependencies(LLVMARMCodeGen ARMCommonTableGen intrinsics_gen)
-
-# workaround for hanging compilation on MSVC9, 10
-if( MSVC_VERSION EQUAL 1600 OR MSVC_VERSION EQUAL 1500 )
-set_property(
- SOURCE ARMISelLowering.cpp
- PROPERTY COMPILE_FLAGS "/Od"
- )
-endif()
-
add_subdirectory(TargetInfo)
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 9c7988f..9e40381 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -98,12 +98,10 @@ public:
}
/// getInstruction - See MCDisassembler.
- DecodeStatus getInstruction(MCInst &instr,
- uint64_t &size,
- const MemoryObject &region,
- uint64_t address,
+ DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+ const MemoryObject &region, uint64_t address,
raw_ostream &vStream,
- raw_ostream &cStream) const;
+ raw_ostream &cStream) const override;
};
/// ThumbDisassembler - Thumb disassembler for all Thumb platforms.
@@ -119,12 +117,10 @@ public:
}
/// getInstruction - See MCDisassembler.
- DecodeStatus getInstruction(MCInst &instr,
- uint64_t &size,
- const MemoryObject &region,
- uint64_t address,
+ DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+ const MemoryObject &region, uint64_t address,
raw_ostream &vStream,
- raw_ostream &cStream) const;
+ raw_ostream &cStream) const override;
private:
mutable ITStatus ITBlock;
@@ -860,9 +856,13 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
extern "C" void LLVMInitializeARMDisassembler() {
- TargetRegistry::RegisterMCDisassembler(TheARMTarget,
+ TargetRegistry::RegisterMCDisassembler(TheARMLETarget,
createARMDisassembler);
- TargetRegistry::RegisterMCDisassembler(TheThumbTarget,
+ TargetRegistry::RegisterMCDisassembler(TheARMBETarget,
+ createARMDisassembler);
+ TargetRegistry::RegisterMCDisassembler(TheThumbLETarget,
+ createThumbDisassembler);
+ TargetRegistry::RegisterMCDisassembler(TheThumbBETarget,
createThumbDisassembler);
}
diff --git a/lib/Target/ARM/Disassembler/Android.mk b/lib/Target/ARM/Disassembler/Android.mk
index 45a5407..3db61a5 100644
--- a/lib/Target/ARM/Disassembler/Android.mk
+++ b/lib/Target/ARM/Disassembler/Android.mk
@@ -11,6 +11,7 @@ arm_disassembler_SRC_FILES := \
# For the device
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -29,6 +30,7 @@ LOCAL_MODULE_TAGS := optional
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
# For the host
# =====================================================
diff --git a/lib/Target/ARM/Disassembler/CMakeLists.txt b/lib/Target/ARM/Disassembler/CMakeLists.txt
index 9de6e5c..2d9d534 100644
--- a/lib/Target/ARM/Disassembler/CMakeLists.txt
+++ b/lib/Target/ARM/Disassembler/CMakeLists.txt
@@ -1,13 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMARMDisassembler
ARMDisassembler.cpp
)
-# workaround for hanging compilation on MSVC8, 9 and 10
-if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
-set_property(
- SOURCE ARMDisassembler.cpp
- PROPERTY COMPILE_FLAGS "/Od"
- )
-endif()
-add_dependencies(LLVMARMDisassembler ARMCommonTableGen)
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index f897028..da3fe01 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -307,17 +307,30 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
<< markup(">");
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
- // If a symbolic branch target was added as a constant expression then print
- // that address in hex. And only print 32 unsigned bits for the address.
- const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
- int64_t Address;
- if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
- O << "0x";
- O.write_hex((uint32_t)Address);
+ const MCExpr *Expr = Op.getExpr();
+ switch (Expr->getKind()) {
+ case MCExpr::Binary:
+ O << '#' << *Expr;
+ break;
+ case MCExpr::Constant: {
+ // If a symbolic branch target was added as a constant expression then
+ // print that address in hex. And only print 32 unsigned bits for the
+ // address.
+ const MCConstantExpr *Constant = cast<MCConstantExpr>(Expr);
+ int64_t TargetAddress;
+ if (!Constant->EvaluateAsAbsolute(TargetAddress)) {
+ O << '#' << *Expr;
+ } else {
+ O << "0x";
+ O.write_hex(static_cast<uint32_t>(TargetAddress));
+ }
+ break;
}
- else {
- // Otherwise, just print the expression.
- O << *Op.getExpr();
+ default:
+ // FIXME: Should we always treat this as if it is a constant literal and
+ // prefix it with '#'?
+ O << *Expr;
+ break;
}
}
}
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 15ae8d1..f671fe4 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -26,8 +26,8 @@ public:
ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
- virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
- virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, raw_ostream &O);
diff --git a/lib/Target/ARM/InstPrinter/Android.mk b/lib/Target/ARM/InstPrinter/Android.mk
index a047f6f..cefed76 100644
--- a/lib/Target/ARM/InstPrinter/Android.mk
+++ b/lib/Target/ARM/InstPrinter/Android.mk
@@ -33,6 +33,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device only
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -52,4 +53,5 @@ LOCAL_MODULE_TAGS := optional
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/ARM/InstPrinter/CMakeLists.txt b/lib/Target/ARM/InstPrinter/CMakeLists.txt
index e2d4819..e59ec4b 100644
--- a/lib/Target/ARM/InstPrinter/CMakeLists.txt
+++ b/lib/Target/ARM/InstPrinter/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMARMAsmPrinter
ARMInstPrinter.cpp
)
-
-add_dependencies(LLVMARMAsmPrinter ARMCommonTableGen)
diff --git a/lib/Target/ARM/LLVMBuild.txt b/lib/Target/ARM/LLVMBuild.txt
index fd4b3a3..9ed51df 100644
--- a/lib/Target/ARM/LLVMBuild.txt
+++ b/lib/Target/ARM/LLVMBuild.txt
@@ -31,5 +31,5 @@ has_jit = 1
type = Library
name = ARMCodeGen
parent = ARM
-required_libraries = ARMAsmPrinter ARMDesc ARMInfo Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target
+required_libraries = ARMAsmPrinter ARMDesc ARMInfo Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
add_to_library_groups = ARM
diff --git a/lib/Target/ARM/MCTargetDesc/ARMArchName.def b/lib/Target/ARM/MCTargetDesc/ARMArchName.def
new file mode 100644
index 0000000..9f007a0
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMArchName.def
@@ -0,0 +1,50 @@
+//===-- ARMArchName.def - List of the ARM arch names ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the list of the supported ARM architecture names,
+// i.e. the supported values for the -march= option.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+#ifndef ARM_ARCH_NAME
+#error "You must define ARM_ARCH_NAME before including ARMArchName.def"
+#endif
+
+// ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH)
+ARM_ARCH_NAME("armv2", ARMV2, "2", v4)
+ARM_ARCH_NAME("armv2a", ARMV2A, "2A", v4)
+ARM_ARCH_NAME("armv3", ARMV3, "3", v4)
+ARM_ARCH_NAME("armv3m", ARMV3M, "3M", v4)
+ARM_ARCH_NAME("armv4", ARMV4, "4", v4)
+ARM_ARCH_NAME("armv4t", ARMV4T, "4T", v4T)
+ARM_ARCH_NAME("armv5", ARMV5, "5", v5T)
+ARM_ARCH_NAME("armv5t", ARMV5T, "5T", v5T)
+ARM_ARCH_NAME("armv5te", ARMV5TE, "5TE", v5TE)
+ARM_ARCH_NAME("armv6", ARMV6, "6", v6)
+ARM_ARCH_NAME("armv6j", ARMV6J, "6J", v6)
+ARM_ARCH_NAME("armv6t2", ARMV6T2, "6T2", v6T2)
+ARM_ARCH_NAME("armv6z", ARMV6Z, "6Z", v6KZ)
+ARM_ARCH_NAME("armv6zk", ARMV6ZK, "6ZK", v6KZ)
+ARM_ARCH_NAME("armv6-m", ARMV6M, "6-M", v6_M)
+ARM_ARCH_NAME("armv7", ARMV7, "7", v7)
+ARM_ARCH_NAME("armv7-a", ARMV7A, "7-A", v7)
+ARM_ARCH_ALIAS("armv7a", ARMV7A)
+ARM_ARCH_NAME("armv7-r", ARMV7R, "7-R", v7)
+ARM_ARCH_ALIAS("armv7r", ARMV7R)
+ARM_ARCH_NAME("armv7-m", ARMV7M, "7-M", v7)
+ARM_ARCH_ALIAS("armv7m", ARMV7M)
+ARM_ARCH_NAME("armv8-a", ARMV8A, "8-A", v8)
+ARM_ARCH_ALIAS("armv8a", ARMV8A)
+ARM_ARCH_NAME("iwmmxt", IWMMXT, "iwmmxt", v5TE)
+ARM_ARCH_NAME("iwmmxt2", IWMMXT2, "iwmmxt2", v5TE)
+
+#undef ARM_ARCH_NAME
+#undef ARM_ARCH_ALIAS
diff --git a/lib/Target/ARM/MCTargetDesc/ARMArchName.h b/lib/Target/ARM/MCTargetDesc/ARMArchName.h
new file mode 100644
index 0000000..34b9fc1
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMArchName.h
@@ -0,0 +1,27 @@
+//===-- ARMArchName.h - List of the ARM arch names --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMARCHNAME_H
+#define ARMARCHNAME_H
+
+namespace llvm {
+namespace ARM {
+
+enum ArchKind {
+ INVALID_ARCH = 0
+
+#define ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH) , ID
+#define ARM_ARCH_ALIAS(NAME, ID) /* empty */
+#include "ARMArchName.def"
+};
+
+} // namespace ARM
+} // namespace llvm
+
+#endif // ARMARCHNAME_H
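
The new .def/.h pair above uses the classic X-macro pattern: a single list of ARM_ARCH_NAME/ARM_ARCH_ALIAS entries is expanded once into the ArchKind enum here and reused elsewhere (for example, for name lookup when handling the .arch directive). A tiny self-contained sketch of the same pattern, with an inline list macro standing in for the #include of the .def file and only two illustrative entries:

#include <cstring>
#include <iostream>

// An inline list macro stands in for '#include "ARMArchName.def"'.
#define ARCH_LIST(ENTRY, ALIAS)       \
  ENTRY("armv7-a", ARMV7A)            \
  ALIAS("armv7a", ARMV7A)             \
  ENTRY("armv8-a", ARMV8A)

enum ArchKind {
  INVALID_ARCH = 0
#define ENTRY(NAME, ID) , ID
#define ALIAS(NAME, ID) /* aliases add no enumerator */
  ARCH_LIST(ENTRY, ALIAS)
#undef ENTRY
#undef ALIAS
};

// The same list expands a second time into a string -> ArchKind lookup.
static ArchKind parseArch(const char *Name) {
#define ENTRY(NAME, ID) if (std::strcmp(Name, NAME) == 0) return ID;
#define ALIAS(NAME, ID) if (std::strcmp(Name, NAME) == 0) return ID;
  ARCH_LIST(ENTRY, ALIAS)
#undef ENTRY
#undef ALIAS
  return INVALID_ARCH;
}

int main() {
  std::cout << parseArch("armv7a") << ' ' << parseArch("armv9") << '\n'; // 1 0
}
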
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 5615b80..1db517f 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -41,24 +41,27 @@ public:
class ARMAsmBackend : public MCAsmBackend {
const MCSubtargetInfo* STI;
- bool isThumbMode; // Currently emitting Thumb code.
+ bool isThumbMode; // Currently emitting Thumb code.
+ bool IsLittleEndian; // Big or little endian.
public:
- ARMAsmBackend(const Target &T, const StringRef TT)
+ ARMAsmBackend(const Target &T, const StringRef TT, bool IsLittle)
: MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")),
- isThumbMode(TT.startswith("thumb")) {}
+ isThumbMode(TT.startswith("thumb")), IsLittleEndian(IsLittle) {}
~ARMAsmBackend() {
delete STI;
}
- unsigned getNumFixupKinds() const { return ARM::NumTargetFixupKinds; }
+ unsigned getNumFixupKinds() const override {
+ return ARM::NumTargetFixupKinds;
+ }
bool hasNOP() const {
return (STI->getFeatureBits() & ARM::HasV6T2Ops) != 0;
}
- const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
- const static MCFixupKindInfo Infos[ARM::NumTargetFixupKinds] = {
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+ const static MCFixupKindInfo InfosLE[ARM::NumTargetFixupKinds] = {
// This table *must* be in the order that the fixup_* kinds are defined in
// ARMFixupKinds.h.
//
@@ -94,10 +97,43 @@ public:
{ "fixup_arm_movw_lo16", 0, 20, 0 },
{ "fixup_t2_movt_hi16", 0, 20, 0 },
{ "fixup_t2_movw_lo16", 0, 20, 0 },
-{ "fixup_arm_movt_hi16_pcrel", 0, 20, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_movw_lo16_pcrel", 0, 20, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_movt_hi16_pcrel", 0, 20, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_movw_lo16_pcrel", 0, 20, MCFixupKindInfo::FKF_IsPCRel },
+ };
+ const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = {
+// This table *must* be in the order that the fixup_* kinds are defined in
+// ARMFixupKinds.h.
+//
+// Name Offset (bits) Size (bits) Flags
+{ "fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_pcrel_10_unscaled", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_thumb_adr_pcrel_10",8, 8, MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_condbranch", 8, 24, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_uncondbranch", 8, 24, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_uncondbl", 8, 24, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_condbl", 8, 24, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_blx", 8, 24, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cp", 8, 8, MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_thumb_bcc", 8, 8, MCFixupKindInfo::FKF_IsPCRel },
+// movw / movt: 16-bit immediate but scattered into two chunks 0 - 12, 16 - 19.
+{ "fixup_arm_movt_hi16", 12, 20, 0 },
+{ "fixup_arm_movw_lo16", 12, 20, 0 },
+{ "fixup_t2_movt_hi16", 12, 20, 0 },
+{ "fixup_t2_movw_lo16", 12, 20, 0 },
};
if (Kind < FirstTargetFixupKind)
@@ -105,32 +141,31 @@ public:
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
- return Infos[Kind - FirstTargetFixupKind];
+ return (IsLittleEndian ? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
}
/// processFixupValue - Target hook to process the literal value of a fixup
/// if necessary.
void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFixup &Fixup, const MCFragment *DF,
- MCValue &Target, uint64_t &Value,
- bool &IsResolved);
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override;
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value) const;
+ uint64_t Value, bool IsPCRel) const override;
- bool mayNeedRelaxation(const MCInst &Inst) const;
+ bool mayNeedRelaxation(const MCInst &Inst) const override;
- bool fixupNeedsRelaxation(const MCFixup &Fixup,
- uint64_t Value,
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const;
+ const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const;
+ void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
- void handleAssemblerFlag(MCAssemblerFlag Flag) {
+ void handleAssemblerFlag(MCAssemblerFlag Flag) override {
switch (Flag) {
default: break;
case MCAF_Code16:
@@ -145,6 +180,7 @@ public:
unsigned getPointerSize() const { return 4; }
bool isThumb() const { return isThumbMode; }
void setIsThumb(bool it) { isThumbMode = it; }
+ bool isLittle() const { return IsLittleEndian; }
};
} // end anonymous namespace
@@ -155,6 +191,8 @@ static unsigned getRelaxedOpcode(unsigned Op) {
case ARM::tLDRpci: return ARM::t2LDRpci;
case ARM::tADR: return ARM::t2ADR;
case ARM::tB: return ARM::t2B;
+ case ARM::tCBZ: return ARM::tHINT;
+ case ARM::tCBNZ: return ARM::tHINT;
}
}
@@ -196,6 +234,12 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
int64_t Offset = int64_t(Value) - 4;
return Offset > 1020 || Offset < 0 || Offset & 3;
}
+ case ARM::fixup_arm_thumb_cb:
+ // If we have a Thumb CBZ or CBNZ instruction and its target is the next
+ // instruction, it is actually out of range for the instruction.
+ // It will be changed to a NOP.
+ int64_t Offset = (Value & ~1);
+ return Offset == 2;
}
llvm_unreachable("Unexpected fixup kind in fixupNeedsRelaxation()!");
}
@@ -212,7 +256,18 @@ void ARMAsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
report_fatal_error("unexpected instruction to relax: " + OS.str());
}
- // The instructions we're relaxing have (so far) the same operands.
+ // If we are changing Thumb CBZ or CBNZ instruction to a NOP, aka tHINT, we
+ // have to change the operands too.
+ if ((Inst.getOpcode() == ARM::tCBZ || Inst.getOpcode() == ARM::tCBNZ) &&
+ RelaxedOp == ARM::tHINT) {
+ Res.setOpcode(RelaxedOp);
+ Res.addOperand(MCOperand::CreateImm(0));
+ Res.addOperand(MCOperand::CreateImm(14));
+ Res.addOperand(MCOperand::CreateReg(0));
+ return;
+ }
+
+ // The rest of the instructions we're relaxing have the same operands.
// We just need to update to the proper opcode.
Res = Inst;
Res.setOpcode(RelaxedOp);
@@ -252,7 +307,7 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
}
static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
- MCContext *Ctx = NULL) {
+ bool IsPCRel, MCContext *Ctx) {
unsigned Kind = Fixup.getKind();
switch (Kind) {
default:
@@ -262,11 +317,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case FK_Data_4:
return Value;
case ARM::fixup_arm_movt_hi16:
- Value >>= 16;
+ if (!IsPCRel)
+ Value >>= 16;
// Fallthrough
- case ARM::fixup_arm_movw_lo16:
- case ARM::fixup_arm_movt_hi16_pcrel:
- case ARM::fixup_arm_movw_lo16_pcrel: {
+ case ARM::fixup_arm_movw_lo16: {
unsigned Hi4 = (Value & 0xF000) >> 12;
unsigned Lo12 = Value & 0x0FFF;
// inst{19-16} = Hi4;
@@ -275,12 +329,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return Value;
}
case ARM::fixup_t2_movt_hi16:
- Value >>= 16;
+ if (!IsPCRel)
+ Value >>= 16;
// Fallthrough
- case ARM::fixup_t2_movw_lo16:
- case ARM::fixup_t2_movt_hi16_pcrel: //FIXME: Shouldn't this be shifted like
- // the other hi16 fixup?
- case ARM::fixup_t2_movw_lo16_pcrel: {
+ case ARM::fixup_t2_movw_lo16: {
unsigned Hi4 = (Value & 0xF000) >> 12;
unsigned i = (Value & 0x800) >> 11;
unsigned Mid3 = (Value & 0x700) >> 8;
@@ -361,6 +413,9 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case ARM::fixup_arm_blx:
// These values don't encode the low two bits since they're always zero.
// Offset by 8 just as above.
+ if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
+ if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_TLSCALL)
+ return 0;
return 0xffffff & ((Value - 8) >> 2);
case ARM::fixup_t2_uncondbranch: {
Value = Value - 4;
@@ -399,65 +454,67 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return swapped;
}
case ARM::fixup_arm_thumb_bl: {
- // The value doesn't encode the low bit (always zero) and is offset by
- // four. The 32-bit immediate value is encoded as
- // imm32 = SignExtend(S:I1:I2:imm10:imm11:0)
- // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
- // The value is encoded into disjoint bit positions in the destination
- // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
- // J = either J1 or J2 bit
- //
- // BL: xxxxxSIIIIIIIIII xxJxJIIIIIIIIIII
- //
- // Note that the halfwords are stored high first, low second; so we need
- // to transpose the fixup value here to map properly.
- uint32_t offset = (Value - 4) >> 1;
- uint32_t signBit = (offset & 0x800000) >> 23;
- uint32_t I1Bit = (offset & 0x400000) >> 22;
- uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
- uint32_t I2Bit = (offset & 0x200000) >> 21;
- uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
- uint32_t imm10Bits = (offset & 0x1FF800) >> 11;
- uint32_t imm11Bits = (offset & 0x000007FF);
-
- uint32_t Binary = 0;
- uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
- uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
- (uint16_t)imm11Bits);
- Binary |= secondHalf << 16;
- Binary |= firstHalf;
- return Binary;
-
+ // The value doesn't encode the low bit (always zero) and is offset by
+ // four. The 32-bit immediate value is encoded as
+ // imm32 = SignExtend(S:I1:I2:imm10:imm11:0)
+ // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
+ // The value is encoded into disjoint bit positions in the destination
+ // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
+ // J = either J1 or J2 bit
+ //
+ // BL: xxxxxSIIIIIIIIII xxJxJIIIIIIIIIII
+ //
+ // Note that the halfwords are stored high first, low second; so we need
+ // to transpose the fixup value here to map properly.
+ uint32_t offset = (Value - 4) >> 1;
+ uint32_t signBit = (offset & 0x800000) >> 23;
+ uint32_t I1Bit = (offset & 0x400000) >> 22;
+ uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
+ uint32_t I2Bit = (offset & 0x200000) >> 21;
+ uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
+ uint32_t imm10Bits = (offset & 0x1FF800) >> 11;
+ uint32_t imm11Bits = (offset & 0x000007FF);
+
+ uint32_t Binary = 0;
+ uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
+ uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+ (uint16_t)imm11Bits);
+ Binary |= secondHalf << 16;
+ Binary |= firstHalf;
+ return Binary;
}
case ARM::fixup_arm_thumb_blx: {
- // The value doesn't encode the low two bits (always zero) and is offset by
- // four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as
- // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00)
- // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
- // The value is encoded into disjoint bit positions in the destination
- // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
- // J = either J1 or J2 bit, 0 = zero.
- //
- // BLX: xxxxxSIIIIIIIIII xxJxJIIIIIIIIII0
- //
- // Note that the halfwords are stored high first, low second; so we need
- // to transpose the fixup value here to map properly.
- uint32_t offset = (Value - 2) >> 2;
- uint32_t signBit = (offset & 0x400000) >> 22;
- uint32_t I1Bit = (offset & 0x200000) >> 21;
- uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
- uint32_t I2Bit = (offset & 0x100000) >> 20;
- uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
- uint32_t imm10HBits = (offset & 0xFFC00) >> 10;
- uint32_t imm10LBits = (offset & 0x3FF);
-
- uint32_t Binary = 0;
- uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
- uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
- ((uint16_t)imm10LBits) << 1);
- Binary |= secondHalf << 16;
- Binary |= firstHalf;
- return Binary;
+ // The value doesn't encode the low two bits (always zero) and is offset by
+ // four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as
+ // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00)
+ // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
+ // The value is encoded into disjoint bit positions in the destination
+ // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
+ // J = either J1 or J2 bit, 0 = zero.
+ //
+ // BLX: xxxxxSIIIIIIIIII xxJxJIIIIIIIIII0
+ //
+ // Note that the halfwords are stored high first, low second; so we need
+ // to transpose the fixup value here to map properly.
+ uint32_t offset = (Value - 2) >> 2;
+ if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
+ if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_TLSCALL)
+ offset = 0;
+ uint32_t signBit = (offset & 0x400000) >> 22;
+ uint32_t I1Bit = (offset & 0x200000) >> 21;
+ uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
+ uint32_t I2Bit = (offset & 0x100000) >> 20;
+ uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
+ uint32_t imm10HBits = (offset & 0xFFC00) >> 10;
+ uint32_t imm10LBits = (offset & 0x3FF);
+
+ uint32_t Binary = 0;
+ uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
+ uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+ ((uint16_t)imm10LBits) << 1);
+ Binary |= secondHalf << 16;
+ Binary |= firstHalf;
+ return Binary;
}
case ARM::fixup_arm_thumb_cp:
// Offset by 4, and don't encode the low two bits. Two bytes of that
@@ -524,7 +581,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFixup &Fixup,
const MCFragment *DF,
- MCValue &Target, uint64_t &Value,
+ const MCValue &Target, uint64_t &Value,
bool &IsResolved) {
const MCSymbolRefExpr *A = Target.getSymA();
// Some fixups to thumb function symbols need the low bit (thumb bit)
@@ -541,11 +598,18 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
Value |= 1;
}
}
+ // A Thumb1 BL instruction may be a long jump between basic blocks of the
+ // same function. Thus, we only resolve the offset when the destination
+ // lies within the same MCFragment.
+ if (A && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) {
+ const MCSymbol &Sym = A->getSymbol().AliasedSymbol();
+ MCSymbolData &SymData = Asm.getSymbolData(Sym);
+ IsResolved = (SymData.getFragment() == DF);
+ }
// We must always generate a relocation for BL/BLX instructions if we have
// a symbol to reference, as the linker relies on knowing the destination
// symbol's thumb-ness to get interworking right.
if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx ||
- (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl ||
(unsigned)Fixup.getKind() == ARM::fixup_arm_blx ||
(unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl ||
(unsigned)Fixup.getKind() == ARM::fixup_arm_condbl))
@@ -554,7 +618,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
// Try to get the encoded value for the fixup as-if we're mapping it into
// the instruction. This allows adjustFixupValue() to issue a diagnostic
// if the value is invalid.
- (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
+ (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext());
}
/// getFixupKindNumBytes - The number of bytes the fixup may change.
@@ -595,30 +659,81 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case ARM::fixup_arm_thumb_blx:
case ARM::fixup_arm_movt_hi16:
case ARM::fixup_arm_movw_lo16:
- case ARM::fixup_arm_movt_hi16_pcrel:
- case ARM::fixup_arm_movw_lo16_pcrel:
case ARM::fixup_t2_movt_hi16:
case ARM::fixup_t2_movw_lo16:
- case ARM::fixup_t2_movt_hi16_pcrel:
- case ARM::fixup_t2_movw_lo16_pcrel:
+ return 4;
+ }
+}
+
+/// getFixupKindContainerSizeBytes - The number of bytes of the
+/// container involved in big endian.
+static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+
+ case FK_Data_1:
+ return 1;
+ case FK_Data_2:
+ return 2;
+ case FK_Data_4:
+ return 4;
+
+ case ARM::fixup_arm_thumb_bcc:
+ case ARM::fixup_arm_thumb_cp:
+ case ARM::fixup_thumb_adr_pcrel_10:
+ case ARM::fixup_arm_thumb_br:
+ case ARM::fixup_arm_thumb_cb:
+ // Instruction size is 2 bytes.
+ return 2;
+
+ case ARM::fixup_arm_pcrel_10_unscaled:
+ case ARM::fixup_arm_ldst_pcrel_12:
+ case ARM::fixup_arm_pcrel_10:
+ case ARM::fixup_arm_adr_pcrel_12:
+ case ARM::fixup_arm_uncondbl:
+ case ARM::fixup_arm_condbl:
+ case ARM::fixup_arm_blx:
+ case ARM::fixup_arm_condbranch:
+ case ARM::fixup_arm_uncondbranch:
+ case ARM::fixup_t2_ldst_pcrel_12:
+ case ARM::fixup_t2_condbranch:
+ case ARM::fixup_t2_uncondbranch:
+ case ARM::fixup_t2_pcrel_10:
+ case ARM::fixup_t2_adr_pcrel_12:
+ case ARM::fixup_arm_thumb_bl:
+ case ARM::fixup_arm_thumb_blx:
+ case ARM::fixup_arm_movt_hi16:
+ case ARM::fixup_arm_movw_lo16:
+ case ARM::fixup_t2_movt_hi16:
+ case ARM::fixup_t2_movw_lo16:
+ // Instruction size is 4 bytes.
return 4;
}
}
void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value) const {
+ unsigned DataSize, uint64_t Value,
+ bool IsPCRel) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- Value = adjustFixupValue(Fixup, Value);
+ Value = adjustFixupValue(Fixup, Value, IsPCRel, nullptr);
if (!Value) return; // Doesn't change encoding.
unsigned Offset = Fixup.getOffset();
assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ // Used to point to big endian bytes.
+ unsigned FullSizeBytes;
+ if (!IsLittleEndian)
+ FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind());
+
// For each byte of the fragment that the fixup touches, mask in the bits from
// the fixup value. The Value has been "split up" into the appropriate
// bitfields above.
- for (unsigned i = 0; i != NumBytes; ++i)
- Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ unsigned Idx = IsLittleEndian ? i : (FullSizeBytes - 1 - i);
+ Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+ }
}
namespace {
@@ -629,11 +744,11 @@ class ELFARMAsmBackend : public ARMAsmBackend {
public:
uint8_t OSABI;
ELFARMAsmBackend(const Target &T, const StringRef TT,
- uint8_t _OSABI)
- : ARMAsmBackend(T, TT), OSABI(_OSABI) { }
+ uint8_t OSABI, bool IsLittle)
+ : ARMAsmBackend(T, TT, IsLittle), OSABI(OSABI) { }
- MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
- return createARMELFObjectWriter(OS, OSABI);
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+ return createARMELFObjectWriter(OS, OSABI, isLittle());
}
};
@@ -643,29 +758,26 @@ public:
const MachO::CPUSubTypeARM Subtype;
DarwinARMAsmBackend(const Target &T, const StringRef TT,
MachO::CPUSubTypeARM st)
- : ARMAsmBackend(T, TT), Subtype(st) {
+ : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), Subtype(st) {
HasDataInCodeSupport = true;
}
- MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
return createARMMachObjectWriter(OS, /*Is64Bit=*/false,
MachO::CPU_TYPE_ARM,
Subtype);
}
-
- virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
- return false;
- }
};
} // end anonymous namespace
MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
- StringRef TT, StringRef CPU) {
+ StringRef TT, StringRef CPU,
+ bool isLittle) {
Triple TheTriple(TT);
- if (TheTriple.isOSDarwin()) {
+ if (TheTriple.isOSBinFormatMachO()) {
MachO::CPUSubTypeARM CS =
StringSwitch<MachO::CPUSubTypeARM>(TheTriple.getArchName())
.Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T)
@@ -673,7 +785,6 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
.Cases("armv6", "thumbv6", MachO::CPU_SUBTYPE_ARM_V6)
.Cases("armv6m", "thumbv6m", MachO::CPU_SUBTYPE_ARM_V6M)
.Cases("armv7em", "thumbv7em", MachO::CPU_SUBTYPE_ARM_V7EM)
- .Cases("armv7f", "thumbv7f", MachO::CPU_SUBTYPE_ARM_V7F)
.Cases("armv7k", "thumbv7k", MachO::CPU_SUBTYPE_ARM_V7K)
.Cases("armv7m", "thumbv7m", MachO::CPU_SUBTYPE_ARM_V7M)
.Cases("armv7s", "thumbv7s", MachO::CPU_SUBTYPE_ARM_V7S)
@@ -689,5 +800,30 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
#endif
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
- return new ELFARMAsmBackend(T, TT, OSABI);
+ return new ELFARMAsmBackend(T, TT, OSABI, isLittle);
+}
+
+MCAsmBackend *llvm::createARMLEAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU) {
+ return createARMAsmBackend(T, MRI, TT, CPU, true);
+}
+
+MCAsmBackend *llvm::createARMBEAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU) {
+ return createARMAsmBackend(T, MRI, TT, CPU, false);
}
+
+MCAsmBackend *llvm::createThumbLEAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU) {
+ return createARMAsmBackend(T, MRI, TT, CPU, true);
+}
+
+MCAsmBackend *llvm::createThumbBEAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU) {
+ return createARMAsmBackend(T, MRI, TT, CPU, false);
+}
+
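
The big-endian support added above hinges on the Idx computation in applyFixup: for little-endian output, value byte i lands at Data[Offset + i], while for big-endian output it is mirrored within the fixup's container size. A small self-contained sketch of just that byte placement (the function name and driver below are illustrative, not LLVM code):

#include <cstdint>
#include <cstdio>

// Writes an already-encoded fixup value into the instruction bytes. For
// little-endian output, value byte i lands at offset i; for big-endian output
// it is mirrored within the container of FullSizeBytes, matching the Idx
// computation in applyFixup above.
static void applyFixupBytes(uint8_t *Data, unsigned Offset, unsigned NumBytes,
                            unsigned FullSizeBytes, uint64_t Value,
                            bool IsLittleEndian) {
  for (unsigned i = 0; i != NumBytes; ++i) {
    unsigned Idx = IsLittleEndian ? i : (FullSizeBytes - 1 - i);
    Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
  }
}

int main() {
  uint8_t LE[4] = {0}, BE[4] = {0};
  applyFixupBytes(LE, 0, 4, 4, 0x11223344, true);   // 44 33 22 11
  applyFixupBytes(BE, 0, 4, 4, 0x11223344, false);  // 11 22 33 44
  for (uint8_t B : LE) std::printf("%02x ", B);
  std::printf("| ");
  for (uint8_t B : BE) std::printf("%02x ", B);
  std::printf("\n");
}
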
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index af939fc..42a1cbb 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -183,7 +183,8 @@ namespace ARM_ISB {
inline static const char *InstSyncBOptToString(unsigned val) {
switch (val) {
- default: llvm_unreachable("Unkown memory operation");
+ default:
+ llvm_unreachable("Unknown memory operation");
case RESERVED_0: return "#0x0";
case RESERVED_1: return "#0x1";
case RESERVED_2: return "#0x2";
@@ -278,42 +279,36 @@ namespace ARMII {
//===------------------------------------------------------------------===//
// ARM Specific MachineOperand flags.
- MO_NO_FLAG,
+ MO_NO_FLAG = 0,
/// MO_LO16 - On a symbol operand, this represents a relocation containing
/// lower 16 bit of the address. Used only via movw instruction.
- MO_LO16,
+ MO_LO16 = 0x1,
/// MO_HI16 - On a symbol operand, this represents a relocation containing
/// higher 16 bit of the address. Used only via movt instruction.
- MO_HI16,
-
- /// MO_LO16_NONLAZY - On a symbol operand "FOO", this represents a
- /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol,
- /// i.e. "FOO$non_lazy_ptr".
- /// Used only via movw instruction.
- MO_LO16_NONLAZY,
-
- /// MO_HI16_NONLAZY - On a symbol operand "FOO", this represents a
- /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol,
- /// i.e. "FOO$non_lazy_ptr". Used only via movt instruction.
- MO_HI16_NONLAZY,
-
- /// MO_LO16_NONLAZY_PIC - On a symbol operand "FOO", this represents a
- /// relocation containing lower 16 bit of the PC relative address of the
- /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL".
- /// Used only via movw instruction.
- MO_LO16_NONLAZY_PIC,
-
- /// MO_HI16_NONLAZY_PIC - On a symbol operand "FOO", this represents a
- /// relocation containing lower 16 bit of the PC relative address of the
- /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL".
- /// Used only via movt instruction.
- MO_HI16_NONLAZY_PIC,
+ MO_HI16 = 0x2,
/// MO_PLT - On a symbol operand, this represents an ELF PLT reference on a
/// call operand.
- MO_PLT
+ MO_PLT = 0x3,
+
+ /// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects
+ /// just that part of the flag set.
+ MO_OPTION_MASK = 0x7f,
+
+ /// MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it
+ /// represents a symbol which, if indirect, will get special Darwin mangling
+ /// as a non-lazy-ptr indirect symbol (i.e. "L_FOO$non_lazy_ptr"). Can be
+ /// combined with MO_LO16, MO_HI16 or MO_NO_FLAG (in a constant-pool, for
+ /// example).
+ MO_NONLAZY = 0x80,
+
+ // It's undefined behaviour if an enum overflows the range between its
+ // smallest and largest values, but since these are |ed together, it can
+ // happen. Put a sentinel in (values of this enum are stored as "unsigned
+ // char").
+ MO_UNUSED_MAXIMUM = 0xff
};
enum {
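
The reworked target-flag encoding above packs one exclusive option (MO_NO_FLAG, MO_LO16, MO_HI16 or MO_PLT) into the low bits and treats MO_NONLAZY as an independent bit that can be OR-ed on top; MO_OPTION_MASK recovers the exclusive part. A short sketch of how the two parts compose (flag values copied from the enum above; the driver is illustrative):

#include <iostream>

// Values copied from the ARMII flag enum above; the driver is illustrative.
enum {
  MO_NO_FLAG = 0,
  MO_LO16 = 0x1,
  MO_HI16 = 0x2,
  MO_PLT = 0x3,
  MO_OPTION_MASK = 0x7f,  // selects the exclusive option
  MO_NONLAZY = 0x80       // independent bit, OR-ed on top
};

int main() {
  unsigned char Flags = MO_HI16 | MO_NONLAZY;  // e.g. movt of a non-lazy-ptr symbol
  std::cout << "option=" << (Flags & MO_OPTION_MASK)
            << " nonlazy=" << bool(Flags & MO_NONLAZY) << '\n';
  // prints: option=2 nonlazy=1
}
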
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index f98bbd2..a4661b1 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -34,14 +34,10 @@ namespace {
virtual ~ARMELFObjectWriter();
- virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel, bool IsRelocWithSymbol,
- int64_t Addend) const;
- virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
- const MCValue &Target,
- const MCFragment &F,
- const MCFixup &Fixup,
- bool IsPCRel) const;
+ unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel) const override;
+
+ bool needsRelocateWithSymbol(unsigned Type) const override;
};
}
@@ -52,91 +48,18 @@ ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI)
ARMELFObjectWriter::~ARMELFObjectWriter() {}
-// In ARM, _MergedGlobals and other most symbols get emitted directly.
-// I.e. not as an offset to a section symbol.
-// This code is an approximation of what ARM/gcc does.
-
-STATISTIC(PCRelCount, "Total number of PIC Relocations");
-STATISTIC(NonPCRelCount, "Total number of non-PIC relocations");
-
-const MCSymbol *ARMELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm,
- const MCValue &Target,
- const MCFragment &F,
- const MCFixup &Fixup,
- bool IsPCRel) const {
- const MCSymbol &Symbol = Target.getSymA()->getSymbol().AliasedSymbol();
- bool EmitThisSym = false;
-
- const MCSectionELF &Section =
- static_cast<const MCSectionELF&>(Symbol.getSection());
- bool InNormalSection = true;
- unsigned RelocType = 0;
- RelocType = GetRelocTypeInner(Target, Fixup, IsPCRel);
-
- DEBUG(
- const MCSymbolRefExpr::VariantKind Kind = Target.getSymA()->getKind();
- MCSymbolRefExpr::VariantKind Kind2;
- Kind2 = Target.getSymB() ? Target.getSymB()->getKind() :
- MCSymbolRefExpr::VK_None;
- dbgs() << "considering symbol "
- << Section.getSectionName() << "/"
- << Symbol.getName() << "/"
- << " Rel:" << (unsigned)RelocType
- << " Kind: " << (int)Kind << "/" << (int)Kind2
- << " Tmp:"
- << Symbol.isAbsolute() << "/" << Symbol.isDefined() << "/"
- << Symbol.isVariable() << "/" << Symbol.isTemporary()
- << " Counts:" << PCRelCount << "/" << NonPCRelCount << "\n");
-
- if (IsPCRel) { ++PCRelCount;
- switch (RelocType) {
- default:
- // Most relocation types are emitted as explicit symbols
- InNormalSection =
- StringSwitch<bool>(Section.getSectionName())
- .Case(".data.rel.ro.local", false)
- .Case(".data.rel", false)
- .Case(".bss", false)
- .Default(true);
- EmitThisSym = true;
- break;
- case ELF::R_ARM_ABS32:
- // But things get strange with R_ARM_ABS32
- // In this case, most things that go in .rodata show up
- // as section relative relocations
- InNormalSection =
- StringSwitch<bool>(Section.getSectionName())
- .Case(".data.rel.ro.local", false)
- .Case(".data.rel", false)
- .Case(".rodata", false)
- .Case(".bss", false)
- .Default(true);
- EmitThisSym = false;
- break;
- }
- } else {
- NonPCRelCount++;
- InNormalSection =
- StringSwitch<bool>(Section.getSectionName())
- .Case(".data.rel.ro.local", false)
- .Case(".rodata", false)
- .Case(".data.rel", false)
- .Case(".bss", false)
- .Default(true);
-
- switch (RelocType) {
- default: EmitThisSym = true; break;
- case ELF::R_ARM_ABS32: EmitThisSym = false; break;
- case ELF::R_ARM_PREL31: EmitThisSym = false; break;
- }
- }
-
- if (EmitThisSym)
- return &Symbol;
- if (! Symbol.isTemporary() && InNormalSection) {
- return &Symbol;
+bool ARMELFObjectWriter::needsRelocateWithSymbol(unsigned Type) const {
+ // FIXME: This is extremely conservative. This really needs to use a
+ // whitelist with a clear explanation for why each relocation needs to
+ // point to the symbol, not to the section.
+ switch (Type) {
+ default:
+ return true;
+
+ case ELF::R_ARM_PREL31:
+ case ELF::R_ARM_ABS32:
+ return false;
}
- return NULL;
}
// Need to examine the Fixup when determining whether to
@@ -144,17 +67,14 @@ const MCSymbol *ARMELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm,
// offset
unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
const MCFixup &Fixup,
- bool IsPCRel,
- bool IsRelocWithSymbol,
- int64_t Addend) const {
+ bool IsPCRel) const {
return GetRelocTypeInner(Target, Fixup, IsPCRel);
}
unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
- MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
- MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+ MCSymbolRefExpr::VariantKind Modifier = Fixup.getAccessVariant();
unsigned Type = 0;
if (IsPCRel) {
@@ -166,9 +86,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_None:
Type = ELF::R_ARM_REL32;
break;
- case MCSymbolRefExpr::VK_ARM_TLSGD:
+ case MCSymbolRefExpr::VK_TLSGD:
llvm_unreachable("unimplemented");
- case MCSymbolRefExpr::VK_ARM_GOTTPOFF:
+ case MCSymbolRefExpr::VK_GOTTPOFF:
Type = ELF::R_ARM_TLS_IE32;
break;
}
@@ -176,9 +96,12 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_arm_blx:
case ARM::fixup_arm_uncondbl:
switch (Modifier) {
- case MCSymbolRefExpr::VK_ARM_PLT:
+ case MCSymbolRefExpr::VK_PLT:
Type = ELF::R_ARM_PLT32;
break;
+ case MCSymbolRefExpr::VK_ARM_TLSCALL:
+ Type = ELF::R_ARM_TLS_CALL;
+ break;
default:
Type = ELF::R_ARM_CALL;
break;
@@ -194,24 +117,27 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
Type = ELF::R_ARM_THM_JUMP24;
break;
case ARM::fixup_arm_movt_hi16:
- case ARM::fixup_arm_movt_hi16_pcrel:
Type = ELF::R_ARM_MOVT_PREL;
break;
case ARM::fixup_arm_movw_lo16:
- case ARM::fixup_arm_movw_lo16_pcrel:
Type = ELF::R_ARM_MOVW_PREL_NC;
break;
case ARM::fixup_t2_movt_hi16:
- case ARM::fixup_t2_movt_hi16_pcrel:
Type = ELF::R_ARM_THM_MOVT_PREL;
break;
case ARM::fixup_t2_movw_lo16:
- case ARM::fixup_t2_movw_lo16_pcrel:
Type = ELF::R_ARM_THM_MOVW_PREL_NC;
break;
case ARM::fixup_arm_thumb_bl:
case ARM::fixup_arm_thumb_blx:
- Type = ELF::R_ARM_THM_CALL;
+ switch (Modifier) {
+ case MCSymbolRefExpr::VK_ARM_TLSCALL:
+ Type = ELF::R_ARM_THM_TLS_CALL;
+ break;
+ default:
+ Type = ELF::R_ARM_THM_CALL;
+ break;
+ }
break;
}
} else {
@@ -223,22 +149,22 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_ARM_NONE:
Type = ELF::R_ARM_NONE;
break;
- case MCSymbolRefExpr::VK_ARM_GOT:
+ case MCSymbolRefExpr::VK_GOT:
Type = ELF::R_ARM_GOT_BREL;
break;
- case MCSymbolRefExpr::VK_ARM_TLSGD:
+ case MCSymbolRefExpr::VK_TLSGD:
Type = ELF::R_ARM_TLS_GD32;
break;
- case MCSymbolRefExpr::VK_ARM_TPOFF:
+ case MCSymbolRefExpr::VK_TPOFF:
Type = ELF::R_ARM_TLS_LE32;
break;
- case MCSymbolRefExpr::VK_ARM_GOTTPOFF:
+ case MCSymbolRefExpr::VK_GOTTPOFF:
Type = ELF::R_ARM_TLS_IE32;
break;
case MCSymbolRefExpr::VK_None:
Type = ELF::R_ARM_ABS32;
break;
- case MCSymbolRefExpr::VK_ARM_GOTOFF:
+ case MCSymbolRefExpr::VK_GOTOFF:
Type = ELF::R_ARM_GOTOFF32;
break;
case MCSymbolRefExpr::VK_ARM_TARGET1:
@@ -250,6 +176,18 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_ARM_PREL31:
Type = ELF::R_ARM_PREL31;
break;
+ case MCSymbolRefExpr::VK_ARM_TLSLDO:
+ Type = ELF::R_ARM_TLS_LDO32;
+ break;
+ case MCSymbolRefExpr::VK_ARM_TLSCALL:
+ Type = ELF::R_ARM_TLS_CALL;
+ break;
+ case MCSymbolRefExpr::VK_ARM_TLSDESC:
+ Type = ELF::R_ARM_TLS_GOTDESC;
+ break;
+ case MCSymbolRefExpr::VK_ARM_TLSDESCSEQ:
+ Type = ELF::R_ARM_TLS_DESCSEQ;
+ break;
}
break;
case ARM::fixup_arm_ldst_pcrel_12:
@@ -283,7 +221,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
}
MCObjectWriter *llvm::createARMELFObjectWriter(raw_ostream &OS,
- uint8_t OSABI) {
+ uint8_t OSABI,
+ bool IsLittleEndian) {
MCELFObjectTargetWriter *MOTW = new ARMELFObjectWriter(OSABI);
- return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
+ return createELFObjectWriter(MOTW, OS, IsLittleEndian);
}
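
The rewritten GetRelocTypeInner above now keys the BL/BLX relocation type off the fixup's access variant, adding the TLS-descriptor call case. A rough stand-alone sketch of that selection (plain enums stand in for MCSymbolRefExpr::VariantKind and the ELF relocation constants; only the BL/BLX cases are shown):

#include <iostream>

// Plain enums stand in for MCSymbolRefExpr::VariantKind and the ELF
// relocation constants.
enum Modifier { VK_None, VK_PLT, VK_ARM_TLSCALL };
enum RelocType { R_ARM_CALL, R_ARM_PLT32, R_ARM_TLS_CALL };

static RelocType relocForCall(Modifier M) {
  switch (M) {
  case VK_PLT:         return R_ARM_PLT32;    // bl foo(PLT)
  case VK_ARM_TLSCALL: return R_ARM_TLS_CALL; // TLS descriptor call sequence
  default:             return R_ARM_CALL;     // plain bl foo
  }
}

int main() {
  std::cout << relocForCall(VK_PLT) << ' ' << relocForCall(VK_None) << '\n'; // 1 0
}
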
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 471897d..5a01d26 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -13,14 +13,14 @@
//
//===----------------------------------------------------------------------===//
-#include "ARMBuildAttrs.h"
+#include "ARMArchName.h"
#include "ARMFPUName.h"
#include "ARMRegisterInfo.h"
-#include "ARMUnwindOp.h"
#include "ARMUnwindOpAsm.h"
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -37,16 +37,20 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/ARMEHABI.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
using namespace llvm;
static std::string GetAEABIUnwindPersonalityName(unsigned Index) {
- assert(Index < NUM_PERSONALITY_INDEX && "Invalid personality index");
+ assert(Index < ARM::EHABI::NUM_PERSONALITY_INDEX &&
+ "Invalid personality index");
return (Twine("__aeabi_unwind_cpp_pr") + Twine(Index)).str();
}
@@ -61,6 +65,45 @@ static const char *GetFPUName(unsigned ID) {
return NULL;
}
+static const char *GetArchName(unsigned ID) {
+ switch (ID) {
+ default:
+ llvm_unreachable("Unknown ARCH kind");
+ break;
+#define ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH) \
+ case ARM::ID: return NAME;
+#define ARM_ARCH_ALIAS(NAME, ID) /* empty */
+#include "ARMArchName.def"
+ }
+ return NULL;
+}
+
+static const char *GetArchDefaultCPUName(unsigned ID) {
+ switch (ID) {
+ default:
+ llvm_unreachable("Unknown ARCH kind");
+ break;
+#define ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH) \
+ case ARM::ID: return DEFAULT_CPU_NAME;
+#define ARM_ARCH_ALIAS(NAME, ID) /* empty */
+#include "ARMArchName.def"
+ }
+ return NULL;
+}
+
+static unsigned GetArchDefaultCPUArch(unsigned ID) {
+ switch (ID) {
+ default:
+ llvm_unreachable("Unknown ARCH kind");
+ break;
+#define ARM_ARCH_NAME(NAME, ID, DEFAULT_CPU_NAME, DEFAULT_CPU_ARCH) \
+ case ARM::ID: return ARMBuildAttrs::DEFAULT_CPU_ARCH;
+#define ARM_ARCH_ALIAS(NAME, ID) /* empty */
+#include "ARMArchName.def"
+ }
+ return 0;
+}
+
namespace {
class ARMELFStreamer;
@@ -68,36 +111,55 @@ class ARMELFStreamer;
class ARMTargetAsmStreamer : public ARMTargetStreamer {
formatted_raw_ostream &OS;
MCInstPrinter &InstPrinter;
-
- virtual void emitFnStart();
- virtual void emitFnEnd();
- virtual void emitCantUnwind();
- virtual void emitPersonality(const MCSymbol *Personality);
- virtual void emitHandlerData();
- virtual void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
- virtual void emitPad(int64_t Offset);
- virtual void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
- bool isVector);
-
- virtual void switchVendor(StringRef Vendor);
- virtual void emitAttribute(unsigned Attribute, unsigned Value);
- virtual void emitTextAttribute(unsigned Attribute, StringRef String);
- virtual void emitFPU(unsigned FPU);
- virtual void finishAttributeSection();
+ bool IsVerboseAsm;
+
+ void emitFnStart() override;
+ void emitFnEnd() override;
+ void emitCantUnwind() override;
+ void emitPersonality(const MCSymbol *Personality) override;
+ void emitPersonalityIndex(unsigned Index) override;
+ void emitHandlerData() override;
+ void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0) override;
+ void emitMovSP(unsigned Reg, int64_t Offset = 0) override;
+ void emitPad(int64_t Offset) override;
+ void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+ bool isVector) override;
+ void emitUnwindRaw(int64_t Offset,
+ const SmallVectorImpl<uint8_t> &Opcodes) override;
+
+ void switchVendor(StringRef Vendor) override;
+ void emitAttribute(unsigned Attribute, unsigned Value) override;
+ void emitTextAttribute(unsigned Attribute, StringRef String) override;
+ void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
+ StringRef StringValue) override;
+ void emitArch(unsigned Arch) override;
+ void emitObjectArch(unsigned Arch) override;
+ void emitFPU(unsigned FPU) override;
+ void emitInst(uint32_t Inst, char Suffix = '\0') override;
+ void finishAttributeSection() override;
+
+ void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override;
public:
- ARMTargetAsmStreamer(formatted_raw_ostream &OS, MCInstPrinter &InstPrinter);
+ ARMTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+ MCInstPrinter &InstPrinter, bool VerboseAsm);
};
-ARMTargetAsmStreamer::ARMTargetAsmStreamer(formatted_raw_ostream &OS,
- MCInstPrinter &InstPrinter)
- : OS(OS), InstPrinter(InstPrinter) {}
+ARMTargetAsmStreamer::ARMTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter &InstPrinter,
+ bool VerboseAsm)
+ : ARMTargetStreamer(S), OS(OS), InstPrinter(InstPrinter),
+ IsVerboseAsm(VerboseAsm) {}
void ARMTargetAsmStreamer::emitFnStart() { OS << "\t.fnstart\n"; }
void ARMTargetAsmStreamer::emitFnEnd() { OS << "\t.fnend\n"; }
void ARMTargetAsmStreamer::emitCantUnwind() { OS << "\t.cantunwind\n"; }
void ARMTargetAsmStreamer::emitPersonality(const MCSymbol *Personality) {
OS << "\t.personality " << Personality->getName() << '\n';
}
+void ARMTargetAsmStreamer::emitPersonalityIndex(unsigned Index) {
+ OS << "\t.personalityindex " << Index << '\n';
+}
void ARMTargetAsmStreamer::emitHandlerData() { OS << "\t.handlerdata\n"; }
void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
int64_t Offset) {
@@ -109,6 +171,16 @@ void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
OS << ", #" << Offset;
OS << '\n';
}
+void ARMTargetAsmStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
+ assert((Reg != ARM::SP && Reg != ARM::PC) &&
+ "the operand of .movsp cannot be either sp or pc");
+
+ OS << "\t.movsp\t";
+ InstPrinter.printRegName(OS, Reg);
+ if (Offset)
+ OS << ", #" << Offset;
+ OS << '\n';
+}
void ARMTargetAsmStreamer::emitPad(int64_t Offset) {
OS << "\t.pad\t#" << Offset << '\n';
}
@@ -132,22 +204,78 @@ void ARMTargetAsmStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {
}
void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
- OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value) << "\n";
+ OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value);
+ if (IsVerboseAsm) {
+ StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute);
+ if (!Name.empty())
+ OS << "\t@ " << Name;
+ }
+ OS << "\n";
}
void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
StringRef String) {
switch (Attribute) {
- default: llvm_unreachable("Unsupported Text attribute in ASM Mode");
case ARMBuildAttrs::CPU_name:
- OS << "\t.cpu\t" << String.lower() << "\n";
+ OS << "\t.cpu\t" << String.lower();
+ break;
+ default:
+ OS << "\t.eabi_attribute\t" << Attribute << ", \"" << String << "\"";
+ if (IsVerboseAsm) {
+ StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute);
+ if (!Name.empty())
+ OS << "\t@ " << Name;
+ }
break;
}
+ OS << "\n";
+}
+void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
+ unsigned IntValue,
+ StringRef StringValue) {
+ switch (Attribute) {
+ default: llvm_unreachable("unsupported multi-value attribute in asm mode");
+ case ARMBuildAttrs::compatibility:
+ OS << "\t.eabi_attribute\t" << Attribute << ", " << IntValue;
+ if (!StringValue.empty())
+ OS << ", \"" << StringValue << "\"";
+ if (IsVerboseAsm)
+ OS << "\t@ " << ARMBuildAttrs::AttrTypeAsString(Attribute);
+ break;
+ }
+ OS << "\n";
+}
+void ARMTargetAsmStreamer::emitArch(unsigned Arch) {
+ OS << "\t.arch\t" << GetArchName(Arch) << "\n";
+}
+void ARMTargetAsmStreamer::emitObjectArch(unsigned Arch) {
+ OS << "\t.object_arch\t" << GetArchName(Arch) << '\n';
}
void ARMTargetAsmStreamer::emitFPU(unsigned FPU) {
OS << "\t.fpu\t" << GetFPUName(FPU) << "\n";
}
void ARMTargetAsmStreamer::finishAttributeSection() {
}
+void
+ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
+ OS << "\t.tlsdescseq\t" << S->getSymbol().getName();
+}
+
+void ARMTargetAsmStreamer::emitInst(uint32_t Inst, char Suffix) {
+ OS << "\t.inst";
+ if (Suffix)
+ OS << "." << Suffix;
+ OS << "\t0x" << utohexstr(Inst) << "\n";
+}
+
+void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset,
+ const SmallVectorImpl<uint8_t> &Opcodes) {
+ OS << "\t.unwind_raw " << Offset;
+ for (SmallVectorImpl<uint8_t>::const_iterator OCI = Opcodes.begin(),
+ OCE = Opcodes.end();
+ OCI != OCE; ++OCI)
+ OS << ", 0x" << utohexstr(*OCI);
+ OS << '\n';
+}
class ARMTargetELFStreamer : public ARMTargetStreamer {
private:
@@ -158,7 +286,8 @@ private:
enum {
HiddenAttribute = 0,
NumericAttribute,
- TextAttribute
+ TextAttribute,
+ NumericAndTextAttributes
} Type;
unsigned Tag;
unsigned IntValue;
@@ -171,21 +300,12 @@ private:
StringRef CurrentVendor;
unsigned FPU;
+ unsigned Arch;
+ unsigned EmittedArch;
SmallVector<AttributeItem, 64> Contents;
const MCSection *AttributeSection;
- // FIXME: this should be in a more generic place, but
- // getULEBSize() is in MCAsmInfo and will be moved to MCDwarf
- static size_t getULEBSize(int Value) {
- size_t Size = 0;
- do {
- Value >>= 7;
- Size += sizeof(int8_t); // Is this really necessary?
- } while (Value);
- return Size;
- }
-
AttributeItem *getAttributeItem(unsigned Attribute) {
for (size_t i = 0; i < Contents.size(); ++i)
if (Contents[i].Tag == Attribute)
@@ -199,6 +319,7 @@ private:
if (AttributeItem *Item = getAttributeItem(Attribute)) {
if (!OverwriteExisting)
return;
+ Item->Type = AttributeItem::NumericAttribute;
Item->IntValue = Value;
return;
}
@@ -219,6 +340,7 @@ private:
if (AttributeItem *Item = getAttributeItem(Attribute)) {
if (!OverwriteExisting)
return;
+ Item->Type = AttributeItem::TextAttribute;
Item->StringValue = Value;
return;
}
@@ -233,33 +355,67 @@ private:
Contents.push_back(Item);
}
+ void setAttributeItems(unsigned Attribute, unsigned IntValue,
+ StringRef StringValue, bool OverwriteExisting) {
+ // Look for existing attribute item
+ if (AttributeItem *Item = getAttributeItem(Attribute)) {
+ if (!OverwriteExisting)
+ return;
+ Item->Type = AttributeItem::NumericAndTextAttributes;
+ Item->IntValue = IntValue;
+ Item->StringValue = StringValue;
+ return;
+ }
+
+ // Create new attribute item
+ AttributeItem Item = {
+ AttributeItem::NumericAndTextAttributes,
+ Attribute,
+ IntValue,
+ StringValue
+ };
+ Contents.push_back(Item);
+ }
+
+ void emitArchDefaultAttributes();
void emitFPUDefaultAttributes();
ARMELFStreamer &getStreamer();
- virtual void emitFnStart();
- virtual void emitFnEnd();
- virtual void emitCantUnwind();
- virtual void emitPersonality(const MCSymbol *Personality);
- virtual void emitHandlerData();
- virtual void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
- virtual void emitPad(int64_t Offset);
- virtual void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
- bool isVector);
-
- virtual void switchVendor(StringRef Vendor);
- virtual void emitAttribute(unsigned Attribute, unsigned Value);
- virtual void emitTextAttribute(unsigned Attribute, StringRef String);
- virtual void emitFPU(unsigned FPU);
- virtual void finishAttributeSection();
+ void emitFnStart() override;
+ void emitFnEnd() override;
+ void emitCantUnwind() override;
+ void emitPersonality(const MCSymbol *Personality) override;
+ void emitPersonalityIndex(unsigned Index) override;
+ void emitHandlerData() override;
+ void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0) override;
+ void emitMovSP(unsigned Reg, int64_t Offset = 0) override;
+ void emitPad(int64_t Offset) override;
+ void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+ bool isVector) override;
+ void emitUnwindRaw(int64_t Offset,
+ const SmallVectorImpl<uint8_t> &Opcodes) override;
+
+ void switchVendor(StringRef Vendor) override;
+ void emitAttribute(unsigned Attribute, unsigned Value) override;
+ void emitTextAttribute(unsigned Attribute, StringRef String) override;
+ void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
+ StringRef StringValue) override;
+ void emitArch(unsigned Arch) override;
+ void emitObjectArch(unsigned Arch) override;
+ void emitFPU(unsigned FPU) override;
+ void emitInst(uint32_t Inst, char Suffix = '\0') override;
+ void finishAttributeSection() override;
+
+ void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override;
size_t calculateContentSize() const;
public:
- ARMTargetELFStreamer()
- : ARMTargetStreamer(), CurrentVendor("aeabi"), FPU(ARM::INVALID_FPU),
- AttributeSection(0) {
- }
+ ARMTargetELFStreamer(MCStreamer &S)
+ : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::INVALID_FPU),
+ Arch(ARM::INVALID_ARCH), EmittedArch(ARM::INVALID_ARCH),
+ AttributeSection(0) {}
};
/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
@@ -278,30 +434,32 @@ class ARMELFStreamer : public MCELFStreamer {
public:
friend class ARMTargetELFStreamer;
- ARMELFStreamer(MCContext &Context, MCTargetStreamer *TargetStreamer,
- MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter,
- bool IsThumb)
- : MCELFStreamer(Context, TargetStreamer, TAB, OS, Emitter),
- IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None) {
+ ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
+ MCCodeEmitter *Emitter, bool IsThumb)
+ : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb),
+ MappingSymbolCounter(0), LastEMS(EMS_None) {
Reset();
}
~ARMELFStreamer() {}
- virtual void FinishImpl();
+ void FinishImpl() override;
// ARM exception handling directives
void emitFnStart();
void emitFnEnd();
void emitCantUnwind();
void emitPersonality(const MCSymbol *Per);
+ void emitPersonalityIndex(unsigned index);
void emitHandlerData();
void emitSetFP(unsigned NewFpReg, unsigned NewSpReg, int64_t Offset = 0);
+ void emitMovSP(unsigned Reg, int64_t Offset = 0);
void emitPad(int64_t Offset);
void emitRegSave(const SmallVectorImpl<unsigned> &RegList, bool isVector);
+ void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes);
- virtual void ChangeSection(const MCSection *Section,
- const MCExpr *Subsection) {
+ void ChangeSection(const MCSection *Section,
+ const MCExpr *Subsection) override {
// We have to keep track of the mapping symbol state of any sections we
// use. Each one should start off as EMS_None, which is provided as the
// default constructor by DenseMap::lookup.
@@ -314,19 +472,58 @@ public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- virtual void EmitInstruction(const MCInst& Inst) {
+ void EmitInstruction(const MCInst& Inst,
+ const MCSubtargetInfo &STI) override {
if (IsThumb)
EmitThumbMappingSymbol();
else
EmitARMMappingSymbol();
- MCELFStreamer::EmitInstruction(Inst);
+ MCELFStreamer::EmitInstruction(Inst, STI);
+ }
+
+ void emitInst(uint32_t Inst, char Suffix) {
+ unsigned Size;
+ char Buffer[4];
+ const bool LittleEndian = getContext().getAsmInfo()->isLittleEndian();
+
+ switch (Suffix) {
+ case '\0':
+ Size = 4;
+
+ assert(!IsThumb);
+ EmitARMMappingSymbol();
+ for (unsigned II = 0, IE = Size; II != IE; II++) {
+ const unsigned I = LittleEndian ? (Size - II - 1) : II;
+ Buffer[Size - II - 1] = uint8_t(Inst >> I * CHAR_BIT);
+ }
+
+ break;
+ case 'n':
+ case 'w':
+ Size = (Suffix == 'n' ? 2 : 4);
+
+ assert(IsThumb);
+ EmitThumbMappingSymbol();
+ for (unsigned II = 0, IE = Size; II != IE; II = II + 2) {
+ const unsigned I0 = LittleEndian ? II + 0 : (Size - II - 1);
+ const unsigned I1 = LittleEndian ? II + 1 : (Size - II - 2);
+ Buffer[Size - II - 2] = uint8_t(Inst >> I0 * CHAR_BIT);
+ Buffer[Size - II - 1] = uint8_t(Inst >> I1 * CHAR_BIT);
+ }
+
+ break;
+ default:
+ llvm_unreachable("Invalid Suffix");
+ }
+
+ MCELFStreamer::EmitBytes(StringRef(Buffer, Size));
}
/// This is one of the functions used to emit data into an ELF section, so the
/// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
/// necessary.
- virtual void EmitBytes(StringRef Data) {
+ void EmitBytes(StringRef Data) override {
EmitDataMappingSymbol();
MCELFStreamer::EmitBytes(Data);
}
@@ -334,12 +531,12 @@ public:
/// This is one of the functions used to emit data into an ELF section, so the
/// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
/// necessary.
- virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {
+ void EmitValueImpl(const MCExpr *Value, unsigned Size) override {
EmitDataMappingSymbol();
MCELFStreamer::EmitValueImpl(Value, Size);
}
- virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {
+ void EmitAssemblerFlag(MCAssemblerFlag Flag) override {
MCELFStreamer::EmitAssemblerFlag(Flag);
switch (Flag) {
@@ -402,7 +599,7 @@ private:
Symbol->setVariableValue(Value);
}
- void EmitThumbFunc(MCSymbol *Func) {
+ void EmitThumbFunc(MCSymbol *Func) override {
// FIXME: Anything needed here to flag the function as thumb?
getAssembler().setIsThumbFunc(Func);
@@ -423,6 +620,8 @@ private:
void SwitchToExTabSection(const MCSymbol &FnStart);
void SwitchToExIdxSection(const MCSymbol &FnStart);
+ void EmitFixup(const MCExpr *Expr, MCFixupKind Kind);
+
bool IsThumb;
int64_t MappingSymbolCounter;
@@ -446,8 +645,7 @@ private:
} // end anonymous namespace
ARMELFStreamer &ARMTargetELFStreamer::getStreamer() {
- ARMELFStreamer *S = static_cast<ARMELFStreamer *>(Streamer);
- return *S;
+ return static_cast<ARMELFStreamer &>(Streamer);
}
void ARMTargetELFStreamer::emitFnStart() { getStreamer().emitFnStart(); }
@@ -456,6 +654,9 @@ void ARMTargetELFStreamer::emitCantUnwind() { getStreamer().emitCantUnwind(); }
void ARMTargetELFStreamer::emitPersonality(const MCSymbol *Personality) {
getStreamer().emitPersonality(Personality);
}
+void ARMTargetELFStreamer::emitPersonalityIndex(unsigned Index) {
+ getStreamer().emitPersonalityIndex(Index);
+}
void ARMTargetELFStreamer::emitHandlerData() {
getStreamer().emitHandlerData();
}
@@ -463,6 +664,9 @@ void ARMTargetELFStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
int64_t Offset) {
getStreamer().emitSetFP(FpReg, SpReg, Offset);
}
+void ARMTargetELFStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
+ getStreamer().emitMovSP(Reg, Offset);
+}
void ARMTargetELFStreamer::emitPad(int64_t Offset) {
getStreamer().emitPad(Offset);
}
@@ -470,6 +674,10 @@ void ARMTargetELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
bool isVector) {
getStreamer().emitRegSave(RegList, isVector);
}
+void ARMTargetELFStreamer::emitUnwindRaw(int64_t Offset,
+ const SmallVectorImpl<uint8_t> &Opcodes) {
+ getStreamer().emitUnwindRaw(Offset, Opcodes);
+}
void ARMTargetELFStreamer::switchVendor(StringRef Vendor) {
assert(!Vendor.empty() && "Vendor cannot be empty.");
@@ -491,6 +699,108 @@ void ARMTargetELFStreamer::emitTextAttribute(unsigned Attribute,
StringRef Value) {
setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
}
+void ARMTargetELFStreamer::emitIntTextAttribute(unsigned Attribute,
+ unsigned IntValue,
+ StringRef StringValue) {
+ setAttributeItems(Attribute, IntValue, StringValue,
+ /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitArch(unsigned Value) {
+ Arch = Value;
+}
+void ARMTargetELFStreamer::emitObjectArch(unsigned Value) {
+ EmittedArch = Value;
+}
+void ARMTargetELFStreamer::emitArchDefaultAttributes() {
+ using namespace ARMBuildAttrs;
+
+ setAttributeItem(CPU_name, GetArchDefaultCPUName(Arch), false);
+ if (EmittedArch == ARM::INVALID_ARCH)
+ setAttributeItem(CPU_arch, GetArchDefaultCPUArch(Arch), false);
+ else
+ setAttributeItem(CPU_arch, GetArchDefaultCPUArch(EmittedArch), false);
+
+ switch (Arch) {
+ case ARM::ARMV2:
+ case ARM::ARMV2A:
+ case ARM::ARMV3:
+ case ARM::ARMV3M:
+ case ARM::ARMV4:
+ case ARM::ARMV5:
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ break;
+
+ case ARM::ARMV4T:
+ case ARM::ARMV5T:
+ case ARM::ARMV5TE:
+ case ARM::ARMV6:
+ case ARM::ARMV6J:
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, Allowed, false);
+ break;
+
+ case ARM::ARMV6T2:
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ break;
+
+ case ARM::ARMV6Z:
+ case ARM::ARMV6ZK:
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, Allowed, false);
+ setAttributeItem(Virtualization_use, AllowTZ, false);
+ break;
+
+ case ARM::ARMV6M:
+ setAttributeItem(THUMB_ISA_use, Allowed, false);
+ break;
+
+ case ARM::ARMV7:
+ setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ break;
+
+ case ARM::ARMV7A:
+ setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ break;
+
+ case ARM::ARMV7R:
+ setAttributeItem(CPU_arch_profile, RealTimeProfile, false);
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ break;
+
+ case ARM::ARMV7M:
+ setAttributeItem(CPU_arch_profile, MicroControllerProfile, false);
+ setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ break;
+
+ case ARM::ARMV8A:
+ setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+ setAttributeItem(MPextension_use, Allowed, false);
+ setAttributeItem(Virtualization_use, AllowTZVirtualization, false);
+ break;
+
+ case ARM::IWMMXT:
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, Allowed, false);
+ setAttributeItem(WMMX_arch, AllowWMMXv1, false);
+ break;
+
+ case ARM::IWMMXT2:
+ setAttributeItem(ARM_ISA_use, Allowed, false);
+ setAttributeItem(THUMB_ISA_use, Allowed, false);
+ setAttributeItem(WMMX_arch, AllowWMMXv2, false);
+ break;
+
+ default:
+ report_fatal_error("Unknown Arch: " + Twine(Arch));
+ break;
+ }
+}
void ARMTargetELFStreamer::emitFPU(unsigned Value) {
FPU = Value;
}
@@ -498,43 +808,43 @@ void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
switch (FPU) {
case ARM::VFP:
case ARM::VFPV2:
- setAttributeItem(ARMBuildAttrs::VFP_arch,
+ setAttributeItem(ARMBuildAttrs::FP_arch,
ARMBuildAttrs::AllowFPv2,
/* OverwriteExisting= */ false);
break;
case ARM::VFPV3:
- setAttributeItem(ARMBuildAttrs::VFP_arch,
+ setAttributeItem(ARMBuildAttrs::FP_arch,
ARMBuildAttrs::AllowFPv3A,
/* OverwriteExisting= */ false);
break;
case ARM::VFPV3_D16:
- setAttributeItem(ARMBuildAttrs::VFP_arch,
+ setAttributeItem(ARMBuildAttrs::FP_arch,
ARMBuildAttrs::AllowFPv3B,
/* OverwriteExisting= */ false);
break;
case ARM::VFPV4:
- setAttributeItem(ARMBuildAttrs::VFP_arch,
+ setAttributeItem(ARMBuildAttrs::FP_arch,
ARMBuildAttrs::AllowFPv4A,
/* OverwriteExisting= */ false);
break;
case ARM::VFPV4_D16:
- setAttributeItem(ARMBuildAttrs::VFP_arch,
+ setAttributeItem(ARMBuildAttrs::FP_arch,
ARMBuildAttrs::AllowFPv4B,
/* OverwriteExisting= */ false);
break;
case ARM::FP_ARMV8:
- setAttributeItem(ARMBuildAttrs::VFP_arch,
+ setAttributeItem(ARMBuildAttrs::FP_arch,
ARMBuildAttrs::AllowFPARMv8A,
/* OverwriteExisting= */ false);
break;
case ARM::NEON:
- setAttributeItem(ARMBuildAttrs::VFP_arch,
+ setAttributeItem(ARMBuildAttrs::FP_arch,
ARMBuildAttrs::AllowFPv3A,
/* OverwriteExisting= */ false);
setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
@@ -543,7 +853,7 @@ void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
break;
case ARM::NEON_VFPV4:
- setAttributeItem(ARMBuildAttrs::VFP_arch,
+ setAttributeItem(ARMBuildAttrs::FP_arch,
ARMBuildAttrs::AllowFPv4A,
/* OverwriteExisting= */ false);
setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
@@ -553,7 +863,7 @@ void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
case ARM::NEON_FP_ARMV8:
case ARM::CRYPTO_NEON_FP_ARMV8:
- setAttributeItem(ARMBuildAttrs::VFP_arch,
+ setAttributeItem(ARMBuildAttrs::FP_arch,
ARMBuildAttrs::AllowFPARMv8A,
/* OverwriteExisting= */ false);
setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
@@ -561,6 +871,9 @@ void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
/* OverwriteExisting= */ false);
break;
+ case ARM::SOFTVFP:
+ break;
+
default:
report_fatal_error("Unknown FPU: " + Twine(FPU));
break;
@@ -574,13 +887,18 @@ size_t ARMTargetELFStreamer::calculateContentSize() const {
case AttributeItem::HiddenAttribute:
break;
case AttributeItem::NumericAttribute:
- Result += getULEBSize(item.Tag);
- Result += getULEBSize(item.IntValue);
+ Result += getULEB128Size(item.Tag);
+ Result += getULEB128Size(item.IntValue);
break;
case AttributeItem::TextAttribute:
- Result += getULEBSize(item.Tag);
+ Result += getULEB128Size(item.Tag);
Result += item.StringValue.size() + 1; // string + '\0'
break;
+ case AttributeItem::NumericAndTextAttributes:
+ Result += getULEB128Size(item.Tag);
+ Result += getULEB128Size(item.IntValue);
+ Result += item.StringValue.size() + 1; // string + '\0';
+ break;
}
}
return Result;
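The size computed above charges one byte per 7 bits of payload for every ULEB128-encoded tag and integer value. As a standalone illustration (a hypothetical helper, not LLVM's getULEB128Size):

    #include <cstdint>

    // One encoded byte carries 7 payload bits, so keep shifting until the
    // value is exhausted; a value of 0 still takes one byte.
    static unsigned ulebSizeSketch(uint64_t Value) {
      unsigned Size = 0;
      do {
        Value >>= 7;
        ++Size;
      } while (Value != 0);
      return Size;
    }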
@@ -597,6 +915,9 @@ void ARMTargetELFStreamer::finishAttributeSection() {
if (FPU != ARM::INVALID_FPU)
emitFPUDefaultAttributes();
+ if (Arch != ARM::INVALID_ARCH)
+ emitArchDefaultAttributes();
+
if (Contents.empty())
return;
@@ -648,15 +969,27 @@ void ARMTargetELFStreamer::finishAttributeSection() {
Streamer.EmitBytes(item.StringValue.upper());
Streamer.EmitIntValue(0, 1); // '\0'
break;
+ case AttributeItem::NumericAndTextAttributes:
+ Streamer.EmitULEB128IntValue(item.IntValue);
+ Streamer.EmitBytes(item.StringValue.upper());
+ Streamer.EmitIntValue(0, 1); // '\0'
+ break;
}
}
Contents.clear();
FPU = ARM::INVALID_FPU;
}
+void
+ARMTargetELFStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
+ getStreamer().EmitFixup(S, FK_Data_4);
+}
+void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) {
+ getStreamer().emitInst(Inst, Suffix);
+}
void ARMELFStreamer::FinishImpl() {
- MCTargetStreamer &TS = getTargetStreamer();
+ MCTargetStreamer &TS = *getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
ATS.finishAttributeSection();
@@ -691,7 +1024,7 @@ inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
// Switch to .ARM.extab or .ARM.exidx section
SwitchSection(EHSection);
- EmitCodeAlignment(4, 0);
+ EmitCodeAlignment(4);
}
inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) {
@@ -709,12 +1042,17 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
SectionKind::getDataRel(),
FnStart);
}
+void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
+ MCDataFragment *Frag = getOrCreateDataFragment();
+ Frag->getFixups().push_back(MCFixup::Create(Frag->getContents().size(), Expr,
+ Kind));
+}
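EmitFixup records three things: where in the current data fragment the fixup applies, the expression it resolves, and its kind. Purely as an illustration of that state (this struct is invented for the example and is not MCFixup):

    #include <cstdint>
    #include <string>

    // Illustrative only: the information captured for each pending fixup.
    struct FixupRecordSketch {
      uint64_t Offset;   // byte offset inside the current data fragment
      std::string Expr;  // symbolic expression resolved at layout time
      unsigned Kind;     // fixup kind, e.g. a 4-byte data fixup
    };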
void ARMELFStreamer::Reset() {
ExTab = NULL;
FnStart = NULL;
Personality = NULL;
- PersonalityIndex = NUM_PERSONALITY_INDEX;
+ PersonalityIndex = ARM::EHABI::NUM_PERSONALITY_INDEX;
FPReg = ARM::SP;
FPOffset = 0;
SPOffset = 0;
@@ -733,7 +1071,7 @@ void ARMELFStreamer::emitFnStart() {
}
void ARMELFStreamer::emitFnEnd() {
- assert(FnStart && ".fnstart must preceeds .fnend");
+ assert(FnStart && ".fnstart must precede .fnend");
// Emit unwind opcodes if there is no .handlerdata directive
if (!ExTab && !CantUnwind)
@@ -742,7 +1080,7 @@ void ARMELFStreamer::emitFnEnd() {
// Emit the exception index table entry
SwitchToExIdxSection(*FnStart);
- if (PersonalityIndex < NUM_PERSONALITY_INDEX)
+ if (PersonalityIndex < ARM::EHABI::NUM_PERSONALITY_INDEX)
EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex));
const MCSymbolRefExpr *FnStartRef =
@@ -753,7 +1091,7 @@ void ARMELFStreamer::emitFnEnd() {
EmitValue(FnStartRef, 4);
if (CantUnwind) {
- EmitIntValue(EXIDX_CANTUNWIND, 4);
+ EmitIntValue(ARM::EHABI::EXIDX_CANTUNWIND, 4);
} else if (ExTab) {
// Emit a reference to the unwind opcodes in the ".ARM.extab" section.
const MCSymbolRefExpr *ExTabEntryRef =
@@ -765,7 +1103,7 @@ void ARMELFStreamer::emitFnEnd() {
// For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in
// the second word of exception index table entry. The size of the unwind
// opcodes should always be 4 bytes.
- assert(PersonalityIndex == AEABI_UNWIND_CPP_PR0 &&
+ assert(PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0 &&
"Compact model must use __aeabi_cpp_unwind_pr0 as personality");
assert(Opcodes.size() == 4u &&
"Unwind opcode size for __aeabi_cpp_unwind_pr0 must be equal to 4");
@@ -820,7 +1158,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
// For compact model 0, we have to emit the unwind opcodes in the .ARM.exidx
// section. Thus, we don't have to create an entry in the .ARM.extab
// section.
- if (NoHandlerData && PersonalityIndex == AEABI_UNWIND_CPP_PR0)
+ if (NoHandlerData && PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0)
return;
// Switch to .ARM.extab section.
@@ -863,6 +1201,11 @@ void ARMELFStreamer::emitPersonality(const MCSymbol *Per) {
UnwindOpAsm.setPersonality(Per);
}
+void ARMELFStreamer::emitPersonalityIndex(unsigned Index) {
+ assert(Index < ARM::EHABI::NUM_PERSONALITY_INDEX && "invalid index");
+ PersonalityIndex = Index;
+}
+
void ARMELFStreamer::emitSetFP(unsigned NewFPReg, unsigned NewSPReg,
int64_t Offset) {
assert((NewSPReg == ARM::SP || NewSPReg == FPReg) &&
@@ -877,6 +1220,20 @@ void ARMELFStreamer::emitSetFP(unsigned NewFPReg, unsigned NewSPReg,
FPOffset += Offset;
}
+void ARMELFStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
+ assert((Reg != ARM::SP && Reg != ARM::PC) &&
+ "the operand of .movsp cannot be either sp or pc");
+ assert(FPReg == ARM::SP && "current FP must be SP");
+
+ FlushPendingOffset();
+
+ FPReg = Reg;
+ FPOffset = SPOffset + Offset;
+
+ const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+ UnwindOpAsm.EmitSetSP(MRI->getEncodingValue(FPReg));
+}
+
void ARMELFStreamer::emitPad(int64_t Offset) {
// Track the change of the $sp offset
SPOffset -= Offset;
@@ -916,27 +1273,33 @@ void ARMELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
UnwindOpAsm.EmitRegSave(Mask);
}
+void ARMELFStreamer::emitUnwindRaw(int64_t Offset,
+ const SmallVectorImpl<uint8_t> &Opcodes) {
+ FlushPendingOffset();
+ SPOffset = SPOffset - Offset;
+ UnwindOpAsm.EmitRaw(Opcodes);
+}
+
namespace llvm {
MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
- bool isVerboseAsm, bool useLoc, bool useCFI,
+ bool isVerboseAsm, bool useCFI,
bool useDwarfDirectory,
MCInstPrinter *InstPrint, MCCodeEmitter *CE,
MCAsmBackend *TAB, bool ShowInst) {
- ARMTargetAsmStreamer *S = new ARMTargetAsmStreamer(OS, *InstPrint);
-
- return llvm::createAsmStreamer(Ctx, S, OS, isVerboseAsm, useLoc, useCFI,
- useDwarfDirectory, InstPrint, CE, TAB,
- ShowInst);
+ MCStreamer *S =
+ llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory,
+ InstPrint, CE, TAB, ShowInst);
+ new ARMTargetAsmStreamer(*S, OS, *InstPrint, isVerboseAsm);
+ return S;
}
MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter,
bool RelaxAll, bool NoExecStack,
bool IsThumb) {
- ARMTargetELFStreamer *TS = new ARMTargetELFStreamer();
- ARMELFStreamer *S =
- new ARMELFStreamer(Context, TS, TAB, OS, Emitter, IsThumb);
+ ARMELFStreamer *S = new ARMELFStreamer(Context, TAB, OS, Emitter, IsThumb);
+ new ARMTargetELFStreamer(*S);
// FIXME: This should eventually end up somewhere else where more
// intelligent flag decisions can be made. For now we are just maintaining
// the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
diff --git a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
index 0085feb..bfd9e33 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
@@ -100,15 +100,6 @@ enum Fixups {
fixup_t2_movt_hi16, // :upper16:
fixup_t2_movw_lo16, // :lower16:
- // It is possible to create an "immediate" that happens to be pcrel.
- // movw r0, :lower16:Foo-(Bar+8) and movt r0, :upper16:Foo-(Bar+8)
- // result in different reloc tags than the above two.
- // Needed to support ELF::R_ARM_MOVT_PREL and ELF::R_ARM_MOVW_PREL_NC
- fixup_arm_movt_hi16_pcrel, // :upper16:
- fixup_arm_movw_lo16_pcrel, // :lower16:
- fixup_t2_movt_hi16_pcrel, // :upper16:
- fixup_t2_movw_lo16_pcrel, // :lower16:
-
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index ad796e6..b7f96e0 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -13,18 +13,18 @@
#include "ARMMCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/Triple.h"
using namespace llvm;
-cl::opt<bool>
-EnableARMEHABI("arm-enable-ehabi", cl::Hidden,
- cl::desc("Generate ARM EHABI tables"),
- cl::init(false));
-
-
void ARMMCAsmInfoDarwin::anchor() { }
-ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() {
+ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(StringRef TT) {
+ Triple TheTriple(TT);
+ if ((TheTriple.getArch() == Triple::armeb) ||
+ (TheTriple.getArch() == Triple::thumbeb))
+ IsLittleEndian = false;
+
Data64bitsDirective = 0;
CommentString = "@";
Code16Directive = ".code\t16";
@@ -35,17 +35,23 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() {
// Exceptions handling
ExceptionsType = ExceptionHandling::SjLj;
+
+ UseIntegratedAssembler = true;
}
void ARMELFMCAsmInfo::anchor() { }
-ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
+ARMELFMCAsmInfo::ARMELFMCAsmInfo(StringRef TT) {
+ Triple TheTriple(TT);
+ if ((TheTriple.getArch() == Triple::armeb) ||
+ (TheTriple.getArch() == Triple::thumbeb))
+ IsLittleEndian = false;
+
// ".comm align is in bytes but .align is pow-2."
AlignmentIsInBytes = false;
Data64bitsDirective = 0;
CommentString = "@";
- PrivateGlobalPrefix = ".L";
Code16Directive = ".code\t16";
Code32Directive = ".code\t32";
@@ -53,6 +59,48 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
SupportsDebugInformation = true;
// Exceptions handling
- if (EnableARMEHABI)
- ExceptionsType = ExceptionHandling::ARM;
+ ExceptionsType = ExceptionHandling::ARM;
+
+ // foo(plt) instead of foo@plt
+ UseParensForSymbolVariant = true;
+
+ UseIntegratedAssembler = true;
+}
+
+void ARMELFMCAsmInfo::setUseIntegratedAssembler(bool Value) {
+ UseIntegratedAssembler = Value;
+ if (!UseIntegratedAssembler) {
+ // gas doesn't handle VFP register names in cfi directives,
+ // so don't use register names with external assembler.
+ // See https://sourceware.org/bugzilla/show_bug.cgi?id=16694
+ DwarfRegNumForCFI = true;
+ }
+}
+
+void ARMCOFFMCAsmInfoMicrosoft::anchor() { }
+
+ARMCOFFMCAsmInfoMicrosoft::ARMCOFFMCAsmInfoMicrosoft() {
+ AlignmentIsInBytes = false;
+
+ PrivateGlobalPrefix = "$M";
}
+
+void ARMCOFFMCAsmInfoGNU::anchor() { }
+
+ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() {
+ AlignmentIsInBytes = false;
+
+ CommentString = "@";
+ Code16Directive = ".code\t16";
+ Code32Directive = ".code\t32";
+ PrivateGlobalPrefix = ".L";
+
+ HasLEB128 = true;
+ SupportsDebugInformation = true;
+ ExceptionsType = ExceptionHandling::None;
+ UseParensForSymbolVariant = true;
+
+ UseIntegratedAssembler = false;
+ DwarfRegNumForCFI = true;
+}
+
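Both asm-info constructors now inspect the target triple and clear IsLittleEndian for the big-endian arm/thumb variants. A small sketch of that check, assuming an LLVM tree of this vintage to build against and using made-up triple strings:

    #include "llvm/ADT/Triple.h"
    #include <cassert>

    int main() {
      llvm::Triple BE("armeb-none-linux-gnueabi"); // hypothetical big-endian triple
      llvm::Triple LE("arm-none-linux-gnueabi");   // hypothetical little-endian triple
      bool BEIsLittle = !(BE.getArch() == llvm::Triple::armeb ||
                          BE.getArch() == llvm::Triple::thumbeb);
      bool LEIsLittle = !(LE.getArch() == llvm::Triple::armeb ||
                          LE.getArch() == llvm::Triple::thumbeb);
      assert(!BEIsLittle && LEIsLittle);
      return 0;
    }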
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index e1f716d..beaf6a4 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -14,21 +14,36 @@
#ifndef LLVM_ARMTARGETASMINFO_H
#define LLVM_ARMTARGETASMINFO_H
+#include "llvm/MC/MCAsmInfoCOFF.h"
#include "llvm/MC/MCAsmInfoDarwin.h"
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
- virtual void anchor();
+ void anchor() override;
public:
- explicit ARMMCAsmInfoDarwin();
+ explicit ARMMCAsmInfoDarwin(StringRef TT);
};
class ARMELFMCAsmInfo : public MCAsmInfoELF {
- virtual void anchor();
+ void anchor() override;
public:
- explicit ARMELFMCAsmInfo();
+ explicit ARMELFMCAsmInfo(StringRef TT);
+
+ void setUseIntegratedAssembler(bool Value) override;
+ };
+
+ class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+ void anchor();
+ public:
+ explicit ARMCOFFMCAsmInfoMicrosoft();
+ };
+
+ class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF {
+ void anchor();
+ public:
+ explicit ARMCOFFMCAsmInfoGNU();
};
} // namespace llvm
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 4382d0d..5564e0a 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -26,6 +26,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -38,27 +39,25 @@ class ARMMCCodeEmitter : public MCCodeEmitter {
ARMMCCodeEmitter(const ARMMCCodeEmitter &) LLVM_DELETED_FUNCTION;
void operator=(const ARMMCCodeEmitter &) LLVM_DELETED_FUNCTION;
const MCInstrInfo &MCII;
- const MCSubtargetInfo &STI;
const MCContext &CTX;
+ bool IsLittleEndian;
public:
- ARMMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
- MCContext &ctx)
- : MCII(mcii), STI(sti), CTX(ctx) {
+ ARMMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx, bool IsLittle)
+ : MCII(mcii), CTX(ctx), IsLittleEndian(IsLittle) {
}
~ARMMCCodeEmitter() {}
- bool isThumb() const {
- // FIXME: Can tablegen auto-generate this?
+ bool isThumb(const MCSubtargetInfo &STI) const {
return (STI.getFeatureBits() & ARM::ModeThumb) != 0;
}
- bool isThumb2() const {
- return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2) != 0;
+ bool isThumb2(const MCSubtargetInfo &STI) const {
+ return isThumb(STI) && (STI.getFeatureBits() & ARM::FeatureThumb2) != 0;
}
- bool isTargetDarwin() const {
+ bool isTargetMachO(const MCSubtargetInfo &STI) const {
Triple TT(STI.getTargetTriple());
- return TT.isOSDarwin();
+ return TT.isOSBinFormatMachO();
}
unsigned getMachineSoImmOpValue(unsigned SoImm) const;
@@ -66,107 +65,131 @@ public:
// getBinaryCodeForInstr - TableGen'erated function for getting the
// binary encoding for an instruction.
uint64_t getBinaryCodeForInstr(const MCInst &MI,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getMachineOpValue - Return binary encoding of operand. If the machine
/// operand requires relocation, record the relocation and return zero.
unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getHiLo16ImmOpValue - Return the encoding for the hi / low 16-bit of
/// the specified operand. This is used for operands with :lower16: and
/// :upper16: prefixes.
uint32_t getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
bool EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx,
unsigned &Reg, unsigned &Imm,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getThumbBLTargetOpValue - Return encoding info for Thumb immediate
/// BL branch target.
uint32_t getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
/// BLX branch target.
uint32_t getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
uint32_t getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
uint32_t getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
uint32_t getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getBranchTargetOpValue - Return encoding info for 24-bit immediate
/// branch target.
uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
/// immediate Thumb2 direct branch target.
uint32_t getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate
/// branch target.
uint32_t getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint32_t getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint32_t getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getAdrLabelOpValue - Return encoding info for 12-bit immediate
/// ADR label target.
uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint32_t getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint32_t getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12'
/// operand.
uint32_t getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getThumbAddrModeRegRegOpValue - Return encoding for 'reg + reg' operand.
uint32_t getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups)const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getT2AddrModeImm8s4OpValue - Return encoding info for 'reg +/- imm8<<2'
/// operand.
uint32_t getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getT2AddrModeImm0_1020s4OpValue - Return encoding info for 'reg + imm8<<2'
/// operand.
uint32_t getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getT2Imm8s4OpValue - Return encoding info for '+/- imm8<<2'
/// operand.
uint32_t getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm'
/// operand as needed by load/store instructions.
uint32_t getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getLdStmModeOpValue - Return encoding for load/store multiple mode.
uint32_t getLdStmModeOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
ARM_AM::AMSubMode Mode = (ARM_AM::AMSubMode)MI.getOperand(OpIdx).getImm();
switch (Mode) {
default: llvm_unreachable("Unknown addressing sub-mode!");
@@ -192,44 +215,54 @@ public:
/// getAddrMode2OpValue - Return encoding for addrmode2 operands.
uint32_t getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getAddrMode2OffsetOpValue - Return encoding for am2offset operands.
uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getPostIdxRegOpValue - Return encoding for postidx_reg operands.
uint32_t getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getAddrMode3OffsetOpValue - Return encoding for am3offset operands.
uint32_t getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getAddrMode3OpValue - Return encoding for addrmode3 operands.
uint32_t getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getAddrModeThumbSPOpValue - Return encoding info for 'reg +/- imm12'
/// operand.
uint32_t getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
uint32_t getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
uint32_t getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm8' operand.
uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getCCOutOpValue - Return encoding of the 's' bit.
unsigned getCCOutOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// The operand is either reg0 or CPSR. The 's' bit is encoded as '0' or
// '1' respectively.
return MI.getOperand(Op).getReg() == ARM::CPSR;
@@ -237,8 +270,27 @@ public:
/// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value.
unsigned getSOImmOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
- unsigned SoImm = MI.getOperand(Op).getImm();
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ const MCOperand &MO = MI.getOperand(Op);
+
+ // We expect MO to be an immediate or an expression,
+ // if it is an immediate - that's fine, just encode the value.
+ // Otherwise - create a Fixup.
+ if (MO.isExpr()) {
+ const MCExpr *Expr = MO.getExpr();
+ // In instruction code this value is always encoded in the lowest 12 bits,
+ // so we don't have to perform any specific adjustments.
+ // Due to requirements of relocatable records we have to use FK_Data_4.
+ // See ARMELFObjectWriter::ExplicitRelSym and
+ // ARMELFObjectWriter::GetRelocTypeInner for more details.
+ MCFixupKind Kind = MCFixupKind(FK_Data_4);
+ Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+ return 0;
+ }
+
+ unsigned SoImm = MO.getImm();
int SoImmVal = ARM_AM::getSOImmVal(SoImm);
assert(SoImmVal != -1 && "Not a valid so_imm value!");
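The assertion above relies on the immediate already being a legal ARM modified immediate: an 8-bit value rotated right by an even amount. A standalone sketch of that encodability test (not ARM_AM::getSOImmVal itself):

    #include <cstdint>

    // Hypothetical check: V is a valid so_imm if some even left-rotation of V
    // fits in 8 bits (the encoding stores that 8-bit value plus the rotation).
    static bool isSOImmSketch(uint32_t V) {
      for (unsigned Rot = 0; Rot < 32; Rot += 2) {
        uint32_t Rotated = (V << Rot) | (V >> ((32 - Rot) & 31));
        if (Rotated <= 0xff)
          return true;
      }
      return false;
    }
    // e.g. isSOImmSketch(0xff000000) is true, isSOImmSketch(0x101) is false.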
@@ -253,7 +305,8 @@ public:
/// getT2SOImmOpValue - Return an encoded 12-bit shifted-immediate value.
unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
unsigned SoImm = MI.getOperand(Op).getImm();
unsigned Encoded = ARM_AM::getT2SOImmVal(SoImm);
assert(Encoded != ~0U && "Not a Thumb2 so_imm value?");
@@ -261,64 +314,88 @@ public:
}
unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getSORegOpValue - Return an encoded so_reg shifted register value.
unsigned getSORegRegOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getSORegImmOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getT2SORegOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getNEONVcvtImm32OpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 64 - MI.getOperand(Op).getImm();
}
unsigned getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getShiftRight8Imm(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getShiftRight16Imm(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getShiftRight32Imm(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getShiftRight64Imm(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned NEONThumb2DataIPostEncoder(const MCInst &MI,
- unsigned EncodedValue) const;
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI,
- unsigned EncodedValue) const;
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
unsigned NEONThumb2DupPostEncoder(const MCInst &MI,
- unsigned EncodedValue) const;
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
unsigned NEONThumb2V8PostEncoder(const MCInst &MI,
- unsigned EncodedValue) const;
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
unsigned VFPThumb2PostEncoder(const MCInst &MI,
- unsigned EncodedValue) const;
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
void EmitByte(unsigned char C, raw_ostream &OS) const {
OS << (char)C;
@@ -327,30 +404,39 @@ public:
void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
// Output the constant in the target's byte order.
for (unsigned i = 0; i != Size; ++i) {
- EmitByte(Val & 255, OS);
- Val >>= 8;
+ unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
+ EmitByte((Val >> Shift) & 0xff, OS);
}
}
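With the new IsLittleEndian flag the same loop can emit either byte order. A tiny standalone demonstration of the shift selection (printing bytes instead of writing to an MC stream):

    #include <cstdint>
    #include <cstdio>

    // Prints the bytes that would be emitted for a constant in each byte order.
    static void emitSketch(uint64_t Val, unsigned Size, bool IsLittleEndian) {
      for (unsigned i = 0; i != Size; ++i) {
        unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
        std::printf("%02x ", unsigned((Val >> Shift) & 0xff));
      }
      std::printf("\n");
    }

    int main() {
      emitSketch(0x11223344, 4, true);   // 44 33 22 11
      emitSketch(0x11223344, 4, false);  // 11 22 33 44
      return 0;
    }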
void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
};
} // end anonymous namespace
-MCCodeEmitter *llvm::createARMMCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI,
- MCContext &Ctx) {
- return new ARMMCCodeEmitter(MCII, STI, Ctx);
+MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new ARMMCCodeEmitter(MCII, Ctx, true);
+}
+
+MCCodeEmitter *llvm::createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new ARMMCCodeEmitter(MCII, Ctx, false);
}
/// NEONThumb2DataIPostEncoder - Post-process encoded NEON data-processing
/// instructions, and rewrite them to their Thumb2 form if we are currently in
/// Thumb2 mode.
unsigned ARMMCCodeEmitter::NEONThumb2DataIPostEncoder(const MCInst &MI,
- unsigned EncodedValue) const {
- if (isThumb2()) {
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ if (isThumb2(STI)) {
// NEON Thumb2 data-processing encodings are very simple: bit 24 is moved
// to bit 12 of the high half-word (i.e. bit 28), and bits 27-24 are
// set to 1111.
@@ -368,8 +454,9 @@ unsigned ARMMCCodeEmitter::NEONThumb2DataIPostEncoder(const MCInst &MI,
/// instructions, and rewrite them to their Thumb2 form if we are currently in
/// Thumb2 mode.
unsigned ARMMCCodeEmitter::NEONThumb2LoadStorePostEncoder(const MCInst &MI,
- unsigned EncodedValue) const {
- if (isThumb2()) {
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ if (isThumb2(STI)) {
EncodedValue &= 0xF0FFFFFF;
EncodedValue |= 0x09000000;
}
@@ -381,8 +468,9 @@ unsigned ARMMCCodeEmitter::NEONThumb2LoadStorePostEncoder(const MCInst &MI,
/// instructions, and rewrite them to their Thumb2 form if we are currently in
/// Thumb2 mode.
unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI,
- unsigned EncodedValue) const {
- if (isThumb2()) {
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ if (isThumb2(STI)) {
EncodedValue &= 0x00FFFFFF;
EncodedValue |= 0xEE000000;
}
@@ -393,8 +481,9 @@ unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI,
/// Post-process encoded NEON v8 instructions, and rewrite them to Thumb2 form
/// if we are in Thumb2.
unsigned ARMMCCodeEmitter::NEONThumb2V8PostEncoder(const MCInst &MI,
- unsigned EncodedValue) const {
- if (isThumb2()) {
+ unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ if (isThumb2(STI)) {
EncodedValue |= 0xC000000; // Set bits 27-26
}
@@ -404,8 +493,9 @@ unsigned ARMMCCodeEmitter::NEONThumb2V8PostEncoder(const MCInst &MI,
/// VFPThumb2PostEncoder - Post-process encoded VFP instructions and rewrite
/// them to their Thumb2 form if we are currently in Thumb2 mode.
unsigned ARMMCCodeEmitter::
-VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue) const {
- if (isThumb2()) {
+VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ if (isThumb2(STI)) {
EncodedValue &= 0x0FFFFFFF;
EncodedValue |= 0xE0000000;
}
@@ -416,7 +506,8 @@ VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue) const {
/// operand requires relocation, record the relocation and return zero.
unsigned ARMMCCodeEmitter::
getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
if (MO.isReg()) {
unsigned Reg = MO.getReg();
unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
@@ -444,7 +535,8 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
/// getAddrModeImmOpValue - Return encoding info for 'reg +/- imm' operand.
bool ARMMCCodeEmitter::
EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
- unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups) const {
+ unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpIdx);
const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
@@ -473,7 +565,8 @@ EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
/// which is either an immediate or requires a fixup.
static uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
unsigned FixupKind,
- SmallVectorImpl<MCFixup> &Fixups) {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) {
const MCOperand &MO = MI.getOperand(OpIdx);
// If the destination is an immediate, we have nothing to do.
@@ -509,11 +602,12 @@ static int32_t encodeThumbBLOffset(int32_t offset) {
/// getThumbBLTargetOpValue - Return encoding info for immediate branch target.
uint32_t ARMMCCodeEmitter::
getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand MO = MI.getOperand(OpIdx);
if (MO.isExpr())
return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl,
- Fixups);
+ Fixups, STI);
return encodeThumbBLOffset(MO.getImm());
}
@@ -521,43 +615,47 @@ getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
/// BLX branch target.
uint32_t ARMMCCodeEmitter::
getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand MO = MI.getOperand(OpIdx);
if (MO.isExpr())
return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx,
- Fixups);
+ Fixups, STI);
return encodeThumbBLOffset(MO.getImm());
}
/// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
uint32_t ARMMCCodeEmitter::
getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand MO = MI.getOperand(OpIdx);
if (MO.isExpr())
return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br,
- Fixups);
+ Fixups, STI);
return (MO.getImm() >> 1);
}
/// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
uint32_t ARMMCCodeEmitter::
getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand MO = MI.getOperand(OpIdx);
if (MO.isExpr())
return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc,
- Fixups);
+ Fixups, STI);
return (MO.getImm() >> 1);
}
/// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
uint32_t ARMMCCodeEmitter::
getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand MO = MI.getOperand(OpIdx);
if (MO.isExpr())
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups);
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups, STI);
return (MO.getImm() >> 1);
}
@@ -582,27 +680,29 @@ static bool HasConditionalBranch(const MCInst &MI) {
/// target.
uint32_t ARMMCCodeEmitter::
getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// FIXME: This really, really shouldn't use TargetMachine. We don't want
// coupling between MC and TM anywhere we can help it.
- if (isThumb2())
+ if (isThumb2(STI))
return
- ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_condbranch, Fixups);
- return getARMBranchTargetOpValue(MI, OpIdx, Fixups);
+ ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_condbranch, Fixups, STI);
+ return getARMBranchTargetOpValue(MI, OpIdx, Fixups, STI);
}
/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
/// target.
uint32_t ARMMCCodeEmitter::
getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand MO = MI.getOperand(OpIdx);
if (MO.isExpr()) {
if (HasConditionalBranch(MI))
return ::getBranchTargetOpValue(MI, OpIdx,
- ARM::fixup_arm_condbranch, Fixups);
+ ARM::fixup_arm_condbranch, Fixups, STI);
return ::getBranchTargetOpValue(MI, OpIdx,
- ARM::fixup_arm_uncondbranch, Fixups);
+ ARM::fixup_arm_uncondbranch, Fixups, STI);
}
return MO.getImm() >> 2;
@@ -610,13 +710,14 @@ getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand MO = MI.getOperand(OpIdx);
if (MO.isExpr()) {
if (HasConditionalBranch(MI))
return ::getBranchTargetOpValue(MI, OpIdx,
- ARM::fixup_arm_condbl, Fixups);
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_uncondbl, Fixups);
+ ARM::fixup_arm_condbl, Fixups, STI);
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_uncondbl, Fixups, STI);
}
return MO.getImm() >> 2;
@@ -624,10 +725,11 @@ getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand MO = MI.getOperand(OpIdx);
if (MO.isExpr())
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_blx, Fixups);
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_blx, Fixups, STI);
return MO.getImm() >> 1;
}
@@ -636,12 +738,13 @@ getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
/// immediate branch target.
uint32_t ARMMCCodeEmitter::
getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
unsigned Val = 0;
const MCOperand MO = MI.getOperand(OpIdx);
if(MO.isExpr())
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups);
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups, STI);
else
Val = MO.getImm() >> 1;
@@ -665,11 +768,12 @@ getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
/// ADR label target.
uint32_t ARMMCCodeEmitter::
getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand MO = MI.getOperand(OpIdx);
if (MO.isExpr())
return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
- Fixups);
+ Fixups, STI);
int64_t offset = MO.getImm();
uint32_t Val = 0x2000;
@@ -705,11 +809,12 @@ getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
/// target.
uint32_t ARMMCCodeEmitter::
getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand MO = MI.getOperand(OpIdx);
if (MO.isExpr())
return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12,
- Fixups);
+ Fixups, STI);
int32_t Val = MO.getImm();
if (Val == INT32_MIN)
Val = 0x1000;
@@ -724,11 +829,12 @@ getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
/// target.
uint32_t ARMMCCodeEmitter::
getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand MO = MI.getOperand(OpIdx);
if (MO.isExpr())
return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10,
- Fixups);
+ Fixups, STI);
return MO.getImm();
}
@@ -736,7 +842,8 @@ getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
/// operand.
uint32_t ARMMCCodeEmitter::
getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &) const {
+ SmallVectorImpl<MCFixup> &,
+ const MCSubtargetInfo &STI) const {
// [Rn, Rm]
// {5-3} = Rm
// {2-0} = Rn
@@ -750,7 +857,8 @@ getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand.
uint32_t ARMMCCodeEmitter::
getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// {17-13} = reg
// {12} = (U)nsigned (add == '1', sub == '0')
// {11-0} = imm12
@@ -767,7 +875,7 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
isAdd = false ; // 'U' bit is set as part of the fixup.
MCFixupKind Kind;
- if (isThumb2())
+ if (isThumb2(STI))
Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12);
else
Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12);
@@ -787,7 +895,7 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
Imm12 = Offset;
}
} else
- isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups);
+ isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups, STI);
uint32_t Binary = Imm12 & 0xfff;
// Immediate is always encoded as positive. The 'U' bit controls add vs sub.
@@ -801,7 +909,8 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
/// '+/- imm8<<2' operand.
uint32_t ARMMCCodeEmitter::
getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// FIXME: The immediate operand should have already been encoded like this
// before ever getting here. The encoder method should just need to combine
// the MI operands for the register and the offset into a single
@@ -832,7 +941,8 @@ getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
/// 'reg +/- imm8<<2' operand.
uint32_t ARMMCCodeEmitter::
getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// {12-9} = reg
// {8} = (U)nsigned (add == '1', sub == '0')
// {7-0} = imm8
@@ -852,7 +962,7 @@ getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
++MCNumCPRelocations;
} else
- isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups);
+ isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI);
// FIXME: The immediate operand should have already been encoded like this
// before ever getting here. The encoder method should just need to combine
@@ -872,7 +982,8 @@ getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
/// 'reg + imm8<<2' operand.
uint32_t ARMMCCodeEmitter::
getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// {11-8} = reg
// {7-0} = imm8
const MCOperand &MO = MI.getOperand(OpIdx);
@@ -882,22 +993,10 @@ getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
return (Reg << 8) | Imm8;
}
-// FIXME: This routine assumes that a binary
-// expression will always result in a PCRel expression
-// In reality, its only true if one or more subexpressions
-// is itself a PCRel (i.e. "." in asm or some other pcrel construct)
-// but this is good enough for now.
-static bool EvaluateAsPCRel(const MCExpr *Expr) {
- switch (Expr->getKind()) {
- default: llvm_unreachable("Unexpected expression type");
- case MCExpr::SymbolRef: return false;
- case MCExpr::Binary: return true;
- }
-}
-
uint32_t
ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// {20-16} = imm{15-12}
// {11-0} = imm{11-0}
const MCOperand &MO = MI.getOperand(OpIdx);
@@ -912,27 +1011,29 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E);
E = ARM16Expr->getSubExpr();
+ if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(E)) {
+ const int64_t Value = MCE->getValue();
+ if (Value > UINT32_MAX)
+ report_fatal_error("constant value truncated (limited to 32-bit)");
+
+ switch (ARM16Expr->getKind()) {
+ case ARMMCExpr::VK_ARM_HI16:
+ return (int32_t(Value) & 0xffff0000) >> 16;
+ case ARMMCExpr::VK_ARM_LO16:
+ return (int32_t(Value) & 0x0000ffff);
+ default: llvm_unreachable("Unsupported ARMFixup");
+ }
+ }
+
switch (ARM16Expr->getKind()) {
default: llvm_unreachable("Unsupported ARMFixup");
case ARMMCExpr::VK_ARM_HI16:
- if (!isTargetDarwin() && EvaluateAsPCRel(E))
- Kind = MCFixupKind(isThumb2()
- ? ARM::fixup_t2_movt_hi16_pcrel
- : ARM::fixup_arm_movt_hi16_pcrel);
- else
- Kind = MCFixupKind(isThumb2()
- ? ARM::fixup_t2_movt_hi16
- : ARM::fixup_arm_movt_hi16);
+ Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movt_hi16
+ : ARM::fixup_arm_movt_hi16);
break;
case ARMMCExpr::VK_ARM_LO16:
- if (!isTargetDarwin() && EvaluateAsPCRel(E))
- Kind = MCFixupKind(isThumb2()
- ? ARM::fixup_t2_movw_lo16_pcrel
- : ARM::fixup_arm_movw_lo16_pcrel);
- else
- Kind = MCFixupKind(isThumb2()
- ? ARM::fixup_t2_movw_lo16
- : ARM::fixup_arm_movw_lo16);
+ Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movw_lo16
+ : ARM::fixup_arm_movw_lo16);
break;
}
Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc()));
@@ -942,21 +1043,16 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
// it's just a plain immediate expression, and those evaluate to
// the lower 16 bits of the expression regardless of whether
// we have a movt or a movw.
- if (!isTargetDarwin() && EvaluateAsPCRel(E))
- Kind = MCFixupKind(isThumb2()
- ? ARM::fixup_t2_movw_lo16_pcrel
- : ARM::fixup_arm_movw_lo16_pcrel);
- else
- Kind = MCFixupKind(isThumb2()
- ? ARM::fixup_t2_movw_lo16
- : ARM::fixup_arm_movw_lo16);
+ Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movw_lo16
+ : ARM::fixup_arm_movw_lo16);
Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc()));
return 0;
}
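When the :upper16:/:lower16: operand folds to a plain constant, the new path extracts the halves directly instead of emitting a fixup. A self-contained sketch of that split, using an illustrative value that is not taken from the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t Value = 0x12345678;                 // hypothetical folded constant
      uint32_t Lo16 = Value & 0x0000ffff;          // what :lower16: (MOVW) encodes
      uint32_t Hi16 = (Value & 0xffff0000) >> 16;  // what :upper16: (MOVT) encodes
      assert(Lo16 == 0x5678 && Hi16 == 0x1234);
      return 0;
    }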
uint32_t ARMMCCodeEmitter::
getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpIdx);
const MCOperand &MO1 = MI.getOperand(OpIdx+1);
const MCOperand &MO2 = MI.getOperand(OpIdx+2);
@@ -989,21 +1085,23 @@ getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// {17-14} Rn
// {13} 1 == imm12, 0 == Rm
// {12} isAdd
// {11-0} imm12/Rm
const MCOperand &MO = MI.getOperand(OpIdx);
unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
- uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups);
+ uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups, STI);
Binary |= Rn << 14;
return Binary;
}
uint32_t ARMMCCodeEmitter::
getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// {13} 1 == imm12, 0 == Rm
// {12} isAdd
// {11-0} imm12/Rm
@@ -1025,7 +1123,8 @@ getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// {4} isAdd
// {3-0} Rm
const MCOperand &MO = MI.getOperand(OpIdx);
@@ -1036,7 +1135,8 @@ getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// {9} 1 == imm8, 0 == Rm
// {8} isAdd
// {7-4} imm7_4/zero
@@ -1055,7 +1155,8 @@ getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// {13} 1 == imm8, 0 == Rm
// {12-9} Rn
// {8} isAdd
@@ -1091,7 +1192,8 @@ getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
/// getAddrModeThumbSPOpValue - Encode the t_addrmode_sp operands.
uint32_t ARMMCCodeEmitter::
getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// [SP, #imm]
// {7-0} = imm8
const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
@@ -1106,7 +1208,8 @@ getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
/// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
uint32_t ARMMCCodeEmitter::
getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// [Rn, #imm]
// {7-3} = imm5
// {2-0} = Rn
@@ -1120,17 +1223,19 @@ getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
/// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
uint32_t ARMMCCodeEmitter::
getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand MO = MI.getOperand(OpIdx);
if (MO.isExpr())
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups);
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups, STI);
return (MO.getImm() >> 2);
}
/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm10' operand.
uint32_t ARMMCCodeEmitter::
getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// {12-9} = reg
// {8} = (U)nsigned (add == '1', sub == '0')
// {7-0} = imm8
@@ -1146,7 +1251,7 @@ getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
assert(MO.isExpr() && "Unexpected machine operand type!");
const MCExpr *Expr = MO.getExpr();
MCFixupKind Kind;
- if (isThumb2())
+ if (isThumb2(STI))
Kind = MCFixupKind(ARM::fixup_t2_pcrel_10);
else
Kind = MCFixupKind(ARM::fixup_arm_pcrel_10);
@@ -1154,7 +1259,7 @@ getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
++MCNumCPRelocations;
} else {
- EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups);
+ EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI);
isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add;
}
@@ -1168,7 +1273,8 @@ getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
unsigned ARMMCCodeEmitter::
getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// Sub-operands are [reg, reg, imm]. The first register is Rm, the reg to be
// shifted. The second is Rs, the amount to shift by, and the third specifies
// the type of the shift.
@@ -1215,7 +1321,8 @@ getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
unsigned ARMMCCodeEmitter::
getSORegImmOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// Sub-operands are [reg, imm]. The first register is Rm, the reg to be
// shifted. The second is the amount to shift by.
//
@@ -1261,7 +1368,8 @@ getSORegImmOpValue(const MCInst &MI, unsigned OpIdx,
unsigned ARMMCCodeEmitter::
getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO1 = MI.getOperand(OpNum);
const MCOperand &MO2 = MI.getOperand(OpNum+1);
const MCOperand &MO3 = MI.getOperand(OpNum+2);
@@ -1279,7 +1387,8 @@ getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
unsigned ARMMCCodeEmitter::
getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO1 = MI.getOperand(OpNum);
const MCOperand &MO2 = MI.getOperand(OpNum+1);
@@ -1300,7 +1409,8 @@ getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
unsigned ARMMCCodeEmitter::
getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO1 = MI.getOperand(OpNum);
// FIXME: Needs fixup support.
@@ -1316,7 +1426,8 @@ getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
unsigned ARMMCCodeEmitter::
getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO1 = MI.getOperand(OpNum);
// FIXME: Needs fixup support.
@@ -1332,7 +1443,8 @@ getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
unsigned ARMMCCodeEmitter::
getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// Sub-operands are [reg, imm]. The first register is Rm, the reg to be
// shifted. The second is the amount to shift by.
//
@@ -1374,7 +1486,8 @@ getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
unsigned ARMMCCodeEmitter::
getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// 10 bits. The lower 5 bits are the lsb of the mask, the high 5 bits are the
// msb of the mask.
const MCOperand &MO = MI.getOperand(Op);
@@ -1387,7 +1500,8 @@ getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
unsigned ARMMCCodeEmitter::
getRegisterListOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// VLDM/VSTM:
// {12-8} = Vd
// {7-0} = Number of registers
@@ -1423,7 +1537,8 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
/// with the alignment operand.
unsigned ARMMCCodeEmitter::
getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &Reg = MI.getOperand(Op);
const MCOperand &Imm = MI.getOperand(Op + 1);
@@ -1446,7 +1561,8 @@ getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
/// along with the alignment operand for use in VST1 and VLD1 with size 32.
unsigned ARMMCCodeEmitter::
getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &Reg = MI.getOperand(Op);
const MCOperand &Imm = MI.getOperand(Op + 1);
@@ -1472,7 +1588,8 @@ getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
/// different for VLD4-dup.
unsigned ARMMCCodeEmitter::
getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &Reg = MI.getOperand(Op);
const MCOperand &Imm = MI.getOperand(Op + 1);
@@ -1492,7 +1609,8 @@ getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
unsigned ARMMCCodeEmitter::
getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(Op);
if (MO.getReg() == 0) return 0x0D;
return CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
@@ -1500,31 +1618,36 @@ getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
unsigned ARMMCCodeEmitter::
getShiftRight8Imm(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 8 - MI.getOperand(Op).getImm();
}
unsigned ARMMCCodeEmitter::
getShiftRight16Imm(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 16 - MI.getOperand(Op).getImm();
}
unsigned ARMMCCodeEmitter::
getShiftRight32Imm(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 32 - MI.getOperand(Op).getImm();
}
unsigned ARMMCCodeEmitter::
getShiftRight64Imm(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 64 - MI.getOperand(Op).getImm();
}
void ARMMCCodeEmitter::
EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// Pseudo instructions don't get encoded.
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
uint64_t TSFlags = Desc.TSFlags;
@@ -1537,10 +1660,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
else
llvm_unreachable("Unexpected instruction size!");
- uint32_t Binary = getBinaryCodeForInstr(MI, Fixups);
+ uint32_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
// Thumb 32-bit wide instructions need to emit the high order halfword
// first.
- if (isThumb() && Size == 4) {
+ if (isThumb(STI) && Size == 4) {
EmitConstant(Binary >> 16, 2, OS);
EmitConstant(Binary & 0xffff, 2, OS);
} else
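The EncodeInstruction hunk above threads the MCSubtargetInfo through every operand encoder and keeps the rule that Thumb2 wide encodings are written as two little-endian halfwords with the high-order halfword first. The following standalone sketch (not LLVM code; it assumes a little-endian target and uses an arbitrary 0xAABBCCDD pattern, not a real encoding) just illustrates that byte ordering:

#include <cstdint>
#include <cstdio>
#include <vector>

static void emitLE(std::vector<uint8_t> &Out, uint32_t V, unsigned Bytes) {
  for (unsigned i = 0; i != Bytes; ++i)
    Out.push_back(uint8_t(V >> (8 * i)));
}

static std::vector<uint8_t> emitInstr(uint32_t Binary, bool IsThumb,
                                      unsigned Size) {
  std::vector<uint8_t> Out;
  if (IsThumb && Size == 4) {
    // Thumb2 wide encodings: high-order halfword first, each halfword LE.
    emitLE(Out, Binary >> 16, 2);
    emitLE(Out, Binary & 0xffff, 2);
  } else {
    // ARM (4 bytes) or narrow Thumb (2 bytes): one little-endian unit.
    emitLE(Out, Binary, Size);
  }
  return Out;
}

int main() {
  for (uint8_t B : emitInstr(0xAABBCCDD, /*IsThumb=*/true, 4))
    std::printf("%02x ", B);   // prints: bb aa dd cc
  std::printf("\n");
  return 0;
}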
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
index cd4067a..d819139 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -56,16 +56,16 @@ public:
/// @}
- void PrintImpl(raw_ostream &OS) const;
+ void PrintImpl(raw_ostream &OS) const override;
bool EvaluateAsRelocatableImpl(MCValue &Res,
- const MCAsmLayout *Layout) const;
- void AddValueSymbols(MCAssembler *) const;
- const MCSection *FindAssociatedSection() const {
+ const MCAsmLayout *Layout) const override;
+ void AddValueSymbols(MCAssembler *) const override;
+ const MCSection *FindAssociatedSection() const override {
return getSubExpr()->FindAssociatedSection();
}
// There are no TLS ARMMCExprs at the moment.
- void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
static bool classof(const MCExpr *E) {
return E->getKind() == MCExpr::Target;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index a99de0e..949a3d5 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -89,14 +89,16 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
unsigned Idx = 0;
// FIXME: Enhance Triple helper class to extract ARM version.
- bool isThumb = false;
+ bool isThumb = triple.getArch() == Triple::thumb ||
+ triple.getArch() == Triple::thumbeb;
if (Len >= 5 && TT.substr(0, 4) == "armv")
Idx = 4;
- else if (Len >= 6 && TT.substr(0, 5) == "thumb") {
- isThumb = true;
- if (Len >= 7 && TT[5] == 'v')
- Idx = 6;
- }
+ else if (Len >= 7 && TT.substr(0, 6) == "armebv")
+ Idx = 6;
+ else if (Len >= 7 && TT.substr(0, 6) == "thumbv")
+ Idx = 6;
+ else if (Len >= 9 && TT.substr(0, 8) == "thumbebv")
+ Idx = 8;
bool NoCPU = CPU == "generic" || CPU.empty();
std::string ARMArchFeature;
@@ -129,9 +131,9 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
ARMArchFeature = "+v7";
} else if (Len >= Idx+2 && TT[Idx+1] == 's') {
if (NoCPU)
- // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
+ // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS
// Swift
- ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+t2xtpk";
+ ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+ras";
else
// Use CPU to figure out the exact features.
ARMArchFeature = "+v7";
@@ -215,10 +217,37 @@ static MCRegisterInfo *createARMMCRegisterInfo(StringRef Triple) {
static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
Triple TheTriple(TT);
- if (TheTriple.isOSDarwin())
- return new ARMMCAsmInfoDarwin();
+ MCAsmInfo *MAI;
+ switch (TheTriple.getOS()) {
+ case llvm::Triple::Darwin:
+ case llvm::Triple::IOS:
+ case llvm::Triple::MacOSX:
+ MAI = new ARMMCAsmInfoDarwin(TT);
+ break;
+ case llvm::Triple::Win32:
+ switch (TheTriple.getEnvironment()) {
+ case llvm::Triple::Itanium:
+ MAI = new ARMCOFFMCAsmInfoGNU();
+ break;
+ case llvm::Triple::MSVC:
+ MAI = new ARMCOFFMCAsmInfoMicrosoft();
+ break;
+ default:
+ llvm_unreachable("invalid environment");
+ }
+ break;
+ default:
+ if (TheTriple.isOSBinFormatMachO())
+ MAI = new ARMMCAsmInfoDarwin(TT);
+ else
+ MAI = new ARMELFMCAsmInfo(TT);
+ break;
+ }
+
+ unsigned Reg = MRI.getDwarfRegNum(ARM::SP, true);
+ MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(0, Reg, 0));
- return new ARMELFMCAsmInfo();
+ return MAI;
}
static MCCodeGenInfo *createARMMCCodeGenInfo(StringRef TT, Reloc::Model RM,
@@ -239,12 +268,16 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
MCContext &Ctx, MCAsmBackend &MAB,
raw_ostream &OS,
MCCodeEmitter *Emitter,
+ const MCSubtargetInfo &STI,
bool RelaxAll,
bool NoExecStack) {
Triple TheTriple(TT);
- if (TheTriple.isOSDarwin())
- return createMachOStreamer(Ctx, MAB, OS, Emitter, false);
+ if (TheTriple.isOSBinFormatMachO()) {
+ MCStreamer *S = createMachOStreamer(Ctx, MAB, OS, Emitter, false);
+ new ARMTargetStreamer(*S);
+ return S;
+ }
if (TheTriple.isOSWindows()) {
llvm_unreachable("ARM does not support Windows COFF format");
@@ -268,7 +301,7 @@ static MCInstPrinter *createARMMCInstPrinter(const Target &T,
static MCRelocationInfo *createARMMCRelocationInfo(StringRef TT,
MCContext &Ctx) {
Triple TheTriple(TT);
- if (TheTriple.isEnvironmentMachO())
+ if (TheTriple.isOSBinFormatMachO())
return createARMMachORelocationInfo(Ctx);
// Default to the stock relocation info.
return llvm::createMCRelocationInfo(TT, Ctx);
@@ -280,14 +313,14 @@ class ARMMCInstrAnalysis : public MCInstrAnalysis {
public:
ARMMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
- virtual bool isUnconditionalBranch(const MCInst &Inst) const {
+ bool isUnconditionalBranch(const MCInst &Inst) const override {
// BCCs with the "always" predicate are unconditional branches.
if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL)
return true;
return MCInstrAnalysis::isUnconditionalBranch(Inst);
}
- virtual bool isConditionalBranch(const MCInst &Inst) const {
+ bool isConditionalBranch(const MCInst &Inst) const override {
// BCCs with the "always" predicate are unconditional branches.
if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL)
return false;
@@ -295,7 +328,7 @@ public:
}
bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
- uint64_t Size, uint64_t &Target) const {
+ uint64_t Size, uint64_t &Target) const override {
// We only handle PCRel branches for now.
if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
return false;
@@ -316,56 +349,94 @@ static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) {
// Force static initialization.
extern "C" void LLVMInitializeARMTargetMC() {
// Register the MC asm info.
- RegisterMCAsmInfoFn A(TheARMTarget, createARMMCAsmInfo);
- RegisterMCAsmInfoFn B(TheThumbTarget, createARMMCAsmInfo);
+ RegisterMCAsmInfoFn X(TheARMLETarget, createARMMCAsmInfo);
+ RegisterMCAsmInfoFn Y(TheARMBETarget, createARMMCAsmInfo);
+ RegisterMCAsmInfoFn A(TheThumbLETarget, createARMMCAsmInfo);
+ RegisterMCAsmInfoFn B(TheThumbBETarget, createARMMCAsmInfo);
// Register the MC codegen info.
- TargetRegistry::RegisterMCCodeGenInfo(TheARMTarget, createARMMCCodeGenInfo);
- TargetRegistry::RegisterMCCodeGenInfo(TheThumbTarget, createARMMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheARMLETarget, createARMMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheARMBETarget, createARMMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheThumbLETarget, createARMMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheThumbBETarget, createARMMCCodeGenInfo);
// Register the MC instruction info.
- TargetRegistry::RegisterMCInstrInfo(TheARMTarget, createARMMCInstrInfo);
- TargetRegistry::RegisterMCInstrInfo(TheThumbTarget, createARMMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheARMLETarget, createARMMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheARMBETarget, createARMMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheThumbLETarget, createARMMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheThumbBETarget, createARMMCInstrInfo);
// Register the MC register info.
- TargetRegistry::RegisterMCRegInfo(TheARMTarget, createARMMCRegisterInfo);
- TargetRegistry::RegisterMCRegInfo(TheThumbTarget, createARMMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheARMLETarget, createARMMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheARMBETarget, createARMMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheThumbLETarget, createARMMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheThumbBETarget, createARMMCRegisterInfo);
// Register the MC subtarget info.
- TargetRegistry::RegisterMCSubtargetInfo(TheARMTarget,
+ TargetRegistry::RegisterMCSubtargetInfo(TheARMLETarget,
ARM_MC::createARMMCSubtargetInfo);
- TargetRegistry::RegisterMCSubtargetInfo(TheThumbTarget,
+ TargetRegistry::RegisterMCSubtargetInfo(TheARMBETarget,
+ ARM_MC::createARMMCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheThumbLETarget,
+ ARM_MC::createARMMCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheThumbBETarget,
ARM_MC::createARMMCSubtargetInfo);
// Register the MC instruction analyzer.
- TargetRegistry::RegisterMCInstrAnalysis(TheARMTarget,
+ TargetRegistry::RegisterMCInstrAnalysis(TheARMLETarget,
+ createARMMCInstrAnalysis);
+ TargetRegistry::RegisterMCInstrAnalysis(TheARMBETarget,
createARMMCInstrAnalysis);
- TargetRegistry::RegisterMCInstrAnalysis(TheThumbTarget,
+ TargetRegistry::RegisterMCInstrAnalysis(TheThumbLETarget,
+ createARMMCInstrAnalysis);
+ TargetRegistry::RegisterMCInstrAnalysis(TheThumbBETarget,
createARMMCInstrAnalysis);
// Register the MC Code Emitter
- TargetRegistry::RegisterMCCodeEmitter(TheARMTarget, createARMMCCodeEmitter);
- TargetRegistry::RegisterMCCodeEmitter(TheThumbTarget, createARMMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheARMLETarget,
+ createARMLEMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheARMBETarget,
+ createARMBEMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheThumbLETarget,
+ createARMLEMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheThumbBETarget,
+ createARMBEMCCodeEmitter);
// Register the asm backend.
- TargetRegistry::RegisterMCAsmBackend(TheARMTarget, createARMAsmBackend);
- TargetRegistry::RegisterMCAsmBackend(TheThumbTarget, createARMAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheARMLETarget, createARMLEAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheARMBETarget, createARMBEAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheThumbLETarget,
+ createThumbLEAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheThumbBETarget,
+ createThumbBEAsmBackend);
// Register the object streamer.
- TargetRegistry::RegisterMCObjectStreamer(TheARMTarget, createMCStreamer);
- TargetRegistry::RegisterMCObjectStreamer(TheThumbTarget, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheARMLETarget, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheARMBETarget, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheThumbLETarget, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheThumbBETarget, createMCStreamer);
// Register the asm streamer.
- TargetRegistry::RegisterAsmStreamer(TheARMTarget, createMCAsmStreamer);
- TargetRegistry::RegisterAsmStreamer(TheThumbTarget, createMCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheARMLETarget, createMCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheARMBETarget, createMCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheThumbLETarget, createMCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheThumbBETarget, createMCAsmStreamer);
// Register the MCInstPrinter.
- TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
- TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheARMLETarget, createARMMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheARMBETarget, createARMMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheThumbLETarget,
+ createARMMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheThumbBETarget,
+ createARMMCInstPrinter);
// Register the MC relocation info.
- TargetRegistry::RegisterMCRelocationInfo(TheARMTarget,
+ TargetRegistry::RegisterMCRelocationInfo(TheARMLETarget,
+ createARMMCRelocationInfo);
+ TargetRegistry::RegisterMCRelocationInfo(TheARMBETarget,
+ createARMMCRelocationInfo);
+ TargetRegistry::RegisterMCRelocationInfo(TheThumbLETarget,
createARMMCRelocationInfo);
- TargetRegistry::RegisterMCRelocationInfo(TheThumbTarget,
+ TargetRegistry::RegisterMCRelocationInfo(TheThumbBETarget,
createARMMCRelocationInfo);
}
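The ParseARMTriple changes above stop inferring Thumb from the string prefix alone (it now comes from the thumb/thumbeb architecture) and add the big-endian spellings armebv* and thumbebv*. A small standalone sketch of that prefix handling, using plain std::string instead of LLVM's StringRef/Triple (the classify name and struct are made up for illustration):

#include <cstdio>
#include <string>

struct ArchInfo { bool IsThumb; unsigned VersionIdx; };

static ArchInfo classify(const std::string &TT) {
  ArchInfo R = {TT.compare(0, 5, "thumb") == 0, 0};
  if (TT.compare(0, 4, "armv") == 0)          R.VersionIdx = 4;
  else if (TT.compare(0, 6, "armebv") == 0)   R.VersionIdx = 6;
  else if (TT.compare(0, 6, "thumbv") == 0)   R.VersionIdx = 6;
  else if (TT.compare(0, 8, "thumbebv") == 0) R.VersionIdx = 8;
  return R;   // VersionIdx points at the architecture version digit
}

int main() {
  ArchInfo A = classify("thumbebv7-none-eabi");
  std::printf("thumb=%d, version digit at %u: '%c'\n",
              A.IsThumb, A.VersionIdx, "thumbebv7-none-eabi"[A.VersionIdx]);
  return 0;
}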
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 959be8b..e81876f 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -33,7 +33,8 @@ class StringRef;
class Target;
class raw_ostream;
-extern Target TheARMTarget, TheThumbTarget;
+extern Target TheARMLETarget, TheThumbLETarget;
+extern Target TheARMBETarget, TheThumbBETarget;
namespace ARM_MC {
std::string ParseARMTriple(StringRef TT, StringRef CPU);
@@ -46,22 +47,41 @@ namespace ARM_MC {
}
MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
- bool isVerboseAsm, bool useLoc, bool useCFI,
+ bool isVerboseAsm, bool useCFI,
bool useDwarfDirectory,
MCInstPrinter *InstPrint, MCCodeEmitter *CE,
MCAsmBackend *TAB, bool ShowInst);
-MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI,
- MCContext &Ctx);
+MCCodeEmitter *createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx);
+
+MCCodeEmitter *createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx);
MCAsmBackend *createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU,
+ bool IsLittleEndian);
+
+MCAsmBackend *createARMLEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
+
+MCAsmBackend *createARMBEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
StringRef TT, StringRef CPU);
+MCAsmBackend *createThumbLEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
+
+MCAsmBackend *createThumbBEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
+
/// createARMELFObjectWriter - Construct an ELF object writer.
MCObjectWriter *createARMELFObjectWriter(raw_ostream &OS,
- uint8_t OSABI);
+ uint8_t OSABI,
+ bool IsLittleEndian);
/// createARMMachObjectWriter - Construct an ARM Mach-O object writer.
MCObjectWriter *createARMMachObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
index 807c948..d4b00e6 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -9,10 +9,10 @@
#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "ARMMCExpr.h"
+#include "llvm-c/Disassembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRelocationInfo.h"
-#include "llvm-c/Disassembler.h"
using namespace llvm;
using namespace object;
@@ -23,7 +23,7 @@ public:
ARMMachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
const MCExpr *createExprForCAPIVariantKind(const MCExpr *SubExpr,
- unsigned VariantKind) {
+ unsigned VariantKind) override {
switch(VariantKind) {
case LLVMDisassembler_VariantKind_ARM_HI16:
return ARMMCExpr::CreateUpper16(SubExpr, Ctx);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 1f681ba..3bf5cf1 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -56,7 +56,7 @@ public:
void RecordRelocation(MachObjectWriter *Writer,
const MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
- MCValue Target, uint64_t &FixedValue);
+ MCValue Target, uint64_t &FixedValue) override;
};
}
@@ -82,10 +82,14 @@ static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
Log2Size = llvm::Log2_32(8);
return true;
- // Handle 24-bit branch kinds.
+ // These fixups are expected to always be resolvable at assembly time and
+ // have no relocations supported.
case ARM::fixup_arm_ldst_pcrel_12:
case ARM::fixup_arm_pcrel_10:
case ARM::fixup_arm_adr_pcrel_12:
+ return false;
+
+ // Handle 24-bit branch kinds.
case ARM::fixup_arm_condbranch:
case ARM::fixup_arm_uncondbranch:
case ARM::fixup_arm_uncondbl:
@@ -119,23 +123,19 @@ static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
// 0 - arm instructions
// 1 - thumb instructions
case ARM::fixup_arm_movt_hi16:
- case ARM::fixup_arm_movt_hi16_pcrel:
RelocType = unsigned(MachO::ARM_RELOC_HALF);
Log2Size = 1;
return true;
case ARM::fixup_t2_movt_hi16:
- case ARM::fixup_t2_movt_hi16_pcrel:
RelocType = unsigned(MachO::ARM_RELOC_HALF);
Log2Size = 3;
return true;
case ARM::fixup_arm_movw_lo16:
- case ARM::fixup_arm_movw_lo16_pcrel:
RelocType = unsigned(MachO::ARM_RELOC_HALF);
Log2Size = 0;
return true;
case ARM::fixup_t2_movw_lo16:
- case ARM::fixup_t2_movw_lo16_pcrel:
RelocType = unsigned(MachO::ARM_RELOC_HALF);
Log2Size = 2;
return true;
@@ -202,7 +202,6 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
switch ((unsigned)Fixup.getKind()) {
default: break;
case ARM::fixup_arm_movt_hi16:
- case ARM::fixup_arm_movt_hi16_pcrel:
MovtBit = 1;
// The thumb bit shouldn't be set in the 'other-half' bit of the
// relocation, but it will be set in FixedValue if the base symbol
@@ -211,13 +210,11 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
FixedValue &= 0xfffffffe;
break;
case ARM::fixup_t2_movt_hi16:
- case ARM::fixup_t2_movt_hi16_pcrel:
if (A_SD->getFlags() & SF_ThumbFunc)
FixedValue &= 0xfffffffe;
MovtBit = 1;
// Fallthrough
case ARM::fixup_t2_movw_lo16:
- case ARM::fixup_t2_movw_lo16_pcrel:
ThumbBit = 1;
break;
}
@@ -461,15 +458,11 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
switch ((unsigned)Fixup.getKind()) {
default: break;
case ARM::fixup_arm_movw_lo16:
- case ARM::fixup_arm_movw_lo16_pcrel:
case ARM::fixup_t2_movw_lo16:
- case ARM::fixup_t2_movw_lo16_pcrel:
Value = (FixedValue >> 16) & 0xffff;
break;
case ARM::fixup_arm_movt_hi16:
- case ARM::fixup_arm_movt_hi16_pcrel:
case ARM::fixup_t2_movt_hi16:
- case ARM::fixup_t2_movt_hi16_pcrel:
Value = FixedValue & 0xffff;
break;
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
new file mode 100644
index 0000000..fdc0ed7
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -0,0 +1,248 @@
+//===- ARMTargetStreamer.cpp - ARMTargetStreamer class --*- C++ -*---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMTargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/MapVector.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+namespace {
+// A class to keep track of assembler-generated constant pools that are used to
+// implement the ldr-pseudo.
+class ConstantPool {
+ typedef SmallVector<std::pair<MCSymbol *, const MCExpr *>, 4> EntryVecTy;
+ EntryVecTy Entries;
+
+public:
+ // Initialize a new empty constant pool
+ ConstantPool() {}
+
+ // Add a new entry to the constant pool in the next slot.
+ // \param Value is the new entry to put in the constant pool.
+ //
+ // \returns a MCExpr that references the newly inserted value
+ const MCExpr *addEntry(const MCExpr *Value, MCContext &Context);
+
+ // Emit the contents of the constant pool using the provided streamer.
+ void emitEntries(MCStreamer &Streamer);
+
+ // Return true if the constant pool is empty
+ bool empty();
+};
+}
+
+namespace llvm {
+class AssemblerConstantPools {
+ // Map type used to keep track of per-Section constant pools used by the
+ // ldr-pseudo opcode. The map associates a section to its constant pool. The
+ // constant pool is a vector of (label, value) pairs. When the ldr
+ // pseudo is parsed we insert a new (label, value) pair into the constant pool
+ // for the current section and add a MCSymbolRefExpr to the new label as
+ // an operand of the ldr. After we have parsed all the user input we
+ // output the (label, value) pairs in each constant pool at the end of the
+ // section.
+ //
+ // We use the MapVector for the map type to ensure stable iteration of
+ // the sections at the end of the parse. We need to iterate over the
+ // sections in a stable order to ensure that we print the
+ // constant pools in a deterministic order when printing an assembly
+ // file.
+ typedef MapVector<const MCSection *, ConstantPool> ConstantPoolMapTy;
+ ConstantPoolMapTy ConstantPools;
+
+public:
+ AssemblerConstantPools() {}
+ ~AssemblerConstantPools() {}
+
+ void emitAll(MCStreamer &Streamer);
+ void emitForCurrentSection(MCStreamer &Streamer);
+ const MCExpr *addEntry(MCStreamer &Streamer, const MCExpr *Expr);
+
+private:
+ ConstantPool *getConstantPool(const MCSection *Section);
+ ConstantPool &getOrCreateConstantPool(const MCSection *Section);
+};
+}
+
+//
+// ConstantPool implementation
+//
+// Emit the contents of the constant pool using the provided streamer.
+void ConstantPool::emitEntries(MCStreamer &Streamer) {
+ if (Entries.empty())
+ return;
+ Streamer.EmitCodeAlignment(4); // align to 4-byte address
+ Streamer.EmitDataRegion(MCDR_DataRegion);
+ for (EntryVecTy::const_iterator I = Entries.begin(), E = Entries.end();
+ I != E; ++I) {
+ Streamer.EmitLabel(I->first);
+ Streamer.EmitValue(I->second, 4);
+ }
+ Streamer.EmitDataRegion(MCDR_DataRegionEnd);
+ Entries.clear();
+}
+
+const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context) {
+ MCSymbol *CPEntryLabel = Context.CreateTempSymbol();
+
+ Entries.push_back(std::make_pair(CPEntryLabel, Value));
+ return MCSymbolRefExpr::Create(CPEntryLabel, Context);
+}
+
+bool ConstantPool::empty() { return Entries.empty(); }
+
+//
+// AssemblerConstantPools implementation
+//
+ConstantPool *
+AssemblerConstantPools::getConstantPool(const MCSection *Section) {
+ ConstantPoolMapTy::iterator CP = ConstantPools.find(Section);
+ if (CP == ConstantPools.end())
+ return 0;
+
+ return &CP->second;
+}
+
+ConstantPool &
+AssemblerConstantPools::getOrCreateConstantPool(const MCSection *Section) {
+ return ConstantPools[Section];
+}
+
+static void emitConstantPool(MCStreamer &Streamer, const MCSection *Section,
+ ConstantPool &CP) {
+ if (!CP.empty()) {
+ Streamer.SwitchSection(Section);
+ CP.emitEntries(Streamer);
+ }
+}
+
+void AssemblerConstantPools::emitAll(MCStreamer &Streamer) {
+ // Dump contents of assembler constant pools.
+ for (ConstantPoolMapTy::iterator CPI = ConstantPools.begin(),
+ CPE = ConstantPools.end();
+ CPI != CPE; ++CPI) {
+ const MCSection *Section = CPI->first;
+ ConstantPool &CP = CPI->second;
+
+ emitConstantPool(Streamer, Section, CP);
+ }
+}
+
+void AssemblerConstantPools::emitForCurrentSection(MCStreamer &Streamer) {
+ const MCSection *Section = Streamer.getCurrentSection().first;
+ if (ConstantPool *CP = getConstantPool(Section)) {
+ emitConstantPool(Streamer, Section, *CP);
+ }
+}
+
+const MCExpr *AssemblerConstantPools::addEntry(MCStreamer &Streamer,
+ const MCExpr *Expr) {
+ const MCSection *Section = Streamer.getCurrentSection().first;
+ return getOrCreateConstantPool(Section).addEntry(Expr, Streamer.getContext());
+}
+
+//
+// ARMTargetStreamer Implementation
+//
+ARMTargetStreamer::ARMTargetStreamer(MCStreamer &S)
+ : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {}
+
+ARMTargetStreamer::~ARMTargetStreamer() {}
+
+// The constant pool handling is shared by all ARMTargetStreamer
+// implementations.
+const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr) {
+ return ConstantPools->addEntry(Streamer, Expr);
+}
+
+void ARMTargetStreamer::emitCurrentConstantPool() {
+ ConstantPools->emitForCurrentSection(Streamer);
+}
+
+// finish() - write out any non-empty assembler constant pools.
+void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
+
+// The remaining callbacks should be handled separately by each
+// streamer.
+void ARMTargetStreamer::emitFnStart() {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitFnEnd() {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitCantUnwind() {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitPersonality(const MCSymbol *Personality) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitPersonalityIndex(unsigned Index) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitHandlerData() {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+ int64_t Offset) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitPad(int64_t Offset) {
+ llvm_unreachable("unimplemented");
+}
+void
+ARMTargetStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+ bool isVector) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitUnwindRaw(
+ int64_t StackOffset, const SmallVectorImpl<uint8_t> &Opcodes) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::switchVendor(StringRef Vendor) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitTextAttribute(unsigned Attribute,
+ StringRef String) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitIntTextAttribute(unsigned Attribute,
+ unsigned IntValue,
+ StringRef StringValue) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitArch(unsigned Arch) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitObjectArch(unsigned Arch) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitFPU(unsigned FPU) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::finishAttributeSection() {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {
+ llvm_unreachable("unimplemented");
+}
+void ARMTargetStreamer::AnnotateTLSDescriptorSequence(
+ const MCSymbolRefExpr *SRE) {
+ llvm_unreachable("unimplemented");
+}
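The AssemblerConstantPools comments above describe how the ldr-pseudo accumulates per-section (label, value) pairs and flushes them at the end of each section. Below is a minimal standalone model of that bookkeeping (the PoolModel type is made up, and it uses std::map rather than MapVector, so stable section ordering is not modelled here); it only shows the shape of addEntry/emitAll:

#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct PoolModel {
  std::map<std::string, std::vector<std::pair<std::string, uint32_t>>> Pools;
  unsigned NextLabel = 0;

  std::string addEntry(const std::string &Section, uint32_t Value) {
    std::string Label = ".Ltmp" + std::to_string(NextLabel++);
    Pools[Section].push_back({Label, Value});
    return Label;                        // the ldr becomes: ldr rX, Label
  }

  void emitAll() const {
    for (const auto &SP : Pools) {
      std::printf("  .section %s\n  .p2align 2\n", SP.first.c_str());
      for (const auto &E : SP.second)
        std::printf("%s:\n  .word 0x%x\n", E.first.c_str(), (unsigned)E.second);
    }
  }
};

int main() {
  PoolModel M;
  std::printf("  ldr r0, %s\n", M.addEntry(".text", 0xdeadbeefu).c_str());
  M.emitAll();
  return 0;
}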
diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h b/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h
deleted file mode 100644
index fa4add6..0000000
--- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h
+++ /dev/null
@@ -1,125 +0,0 @@
-//===-- ARMUnwindOp.h - ARM Unwind Opcodes ----------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the constants for the ARM unwind opcodes and exception
-// handling table entry kinds.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM_UNWIND_OP_H
-#define ARM_UNWIND_OP_H
-
-namespace llvm {
-
- /// ARM exception handling table entry kinds
- enum ARMEHTEntryKind {
- EHT_GENERIC = 0x00,
- EHT_COMPACT = 0x80
- };
-
- enum {
- /// Special entry for the function never unwind
- EXIDX_CANTUNWIND = 0x1
- };
-
- /// ARM-defined frame unwinding opcodes
- enum ARMUnwindOpcodes {
- // Format: 00xxxxxx
- // Purpose: vsp = vsp + ((x << 2) + 4)
- UNWIND_OPCODE_INC_VSP = 0x00,
-
- // Format: 01xxxxxx
- // Purpose: vsp = vsp - ((x << 2) + 4)
- UNWIND_OPCODE_DEC_VSP = 0x40,
-
- // Format: 10000000 00000000
- // Purpose: refuse to unwind
- UNWIND_OPCODE_REFUSE = 0x8000,
-
- // Format: 1000xxxx xxxxxxxx
- // Purpose: pop r[15:12], r[11:4]
- // Constraint: x != 0
- UNWIND_OPCODE_POP_REG_MASK_R4 = 0x8000,
-
- // Format: 1001xxxx
- // Purpose: vsp = r[x]
- // Constraint: x != 13 && x != 15
- UNWIND_OPCODE_SET_VSP = 0x90,
-
- // Format: 10100xxx
- // Purpose: pop r[(4+x):4]
- UNWIND_OPCODE_POP_REG_RANGE_R4 = 0xa0,
-
- // Format: 10101xxx
- // Purpose: pop r14, r[(4+x):4]
- UNWIND_OPCODE_POP_REG_RANGE_R4_R14 = 0xa8,
-
- // Format: 10110000
- // Purpose: finish
- UNWIND_OPCODE_FINISH = 0xb0,
-
- // Format: 10110001 0000xxxx
- // Purpose: pop r[3:0]
- // Constraint: x != 0
- UNWIND_OPCODE_POP_REG_MASK = 0xb100,
-
- // Format: 10110010 x(uleb128)
- // Purpose: vsp = vsp + ((x << 2) + 0x204)
- UNWIND_OPCODE_INC_VSP_ULEB128 = 0xb2,
-
- // Format: 10110011 xxxxyyyy
- // Purpose: pop d[(x+y):x]
- UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDX = 0xb300,
-
- // Format: 10111xxx
- // Purpose: pop d[(8+x):8]
- UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDX_D8 = 0xb8,
-
- // Format: 11000xxx
- // Purpose: pop wR[(10+x):10]
- UNWIND_OPCODE_POP_WIRELESS_MMX_REG_RANGE_WR10 = 0xc0,
-
- // Format: 11000110 xxxxyyyy
- // Purpose: pop wR[(x+y):x]
- UNWIND_OPCODE_POP_WIRELESS_MMX_REG_RANGE = 0xc600,
-
- // Format: 11000111 0000xxxx
- // Purpose: pop wCGR[3:0]
- // Constraint: x != 0
- UNWIND_OPCODE_POP_WIRELESS_MMX_REG_MASK = 0xc700,
-
- // Format: 11001000 xxxxyyyy
- // Purpose: pop d[(16+x+y):(16+x)]
- UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 = 0xc800,
-
- // Format: 11001001 xxxxyyyy
- // Purpose: pop d[(x+y):x]
- UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD = 0xc900,
-
- // Format: 11010xxx
- // Purpose: pop d[(8+x):8]
- UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D8 = 0xd0
- };
-
- /// ARM-defined Personality Routine Index
- enum ARMPersonalityRoutineIndex {
- // To make the exception handling table become more compact, ARM defined
- // several personality routines in EHABI. There are 3 different
- // personality routines in ARM EHABI currently. It is possible to have 16
- // pre-defined personality routines at most.
- AEABI_UNWIND_CPP_PR0 = 0,
- AEABI_UNWIND_CPP_PR1 = 1,
- AEABI_UNWIND_CPP_PR2 = 2,
-
- NUM_PERSONALITY_INDEX
- };
-
-}
-
-#endif // ARM_UNWIND_OP_H
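The deleted header above (its constants now live in llvm/Support/ARMEHABI.h) documents the compact unwind opcode formats. As a quick standalone illustration of the two simplest families, 00xxxxxx (vsp += (x << 2) + 4) and 01xxxxxx (vsp -= (x << 2) + 4), showing why a single byte can adjust the stack by 4 to 256 bytes in word steps:

#include <cassert>
#include <cstdint>

static int32_t vspDelta(uint8_t Op) {
  uint8_t X = Op & 0x3f;
  if ((Op & 0xc0) == 0x00) return  int32_t((X << 2) + 4);  // INC_VSP
  if ((Op & 0xc0) == 0x40) return -int32_t((X << 2) + 4);  // DEC_VSP
  return 0;  // other opcode families not modelled here
}

int main() {
  assert(vspDelta(0x00) == 4);     // smallest single-byte increment
  assert(vspDelta(0x3f) == 256);   // largest single-byte increment
  assert(vspDelta(0x41) == -8);    // vsp = vsp - 8
  return 0;
}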
diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
index c943370..593fe34 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -13,8 +13,7 @@
//===----------------------------------------------------------------------===//
#include "ARMUnwindOpAsm.h"
-
-#include "ARMUnwindOp.h"
+#include "llvm/Support/ARMEHABI.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
@@ -50,14 +49,15 @@ namespace {
/// Emit the personality index prefix.
inline void EmitPersonalityIndex(unsigned PI) {
- assert(PI < NUM_PERSONALITY_INDEX && "Invalid personality prefix");
- EmitByte(EHT_COMPACT | PI);
+ assert(PI < ARM::EHABI::NUM_PERSONALITY_INDEX &&
+ "Invalid personality prefix");
+ EmitByte(ARM::EHABI::EHT_COMPACT | PI);
}
/// Fill the rest of bytes with FINISH opcode.
inline void FillFinishOpcode() {
while (Pos < Vec.size())
- EmitByte(UNWIND_OPCODE_FINISH);
+ EmitByte(ARM::EHABI::UNWIND_OPCODE_FINISH);
}
};
}
@@ -85,22 +85,22 @@ void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) {
uint32_t UnmaskedReg = RegSave & 0xfff0u & (~Mask);
if (UnmaskedReg == 0u) {
// Pop r[4 : (4 + n)]
- EmitInt8(UNWIND_OPCODE_POP_REG_RANGE_R4 | Range);
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_POP_REG_RANGE_R4 | Range);
RegSave &= 0x000fu;
} else if (UnmaskedReg == (1u << 14)) {
// Pop r[14] + r[4 : (4 + n)]
- EmitInt8(UNWIND_OPCODE_POP_REG_RANGE_R4_R14 | Range);
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_POP_REG_RANGE_R4_R14 | Range);
RegSave &= 0x000fu;
}
}
// Two bytes opcode to save register r15-r4
if ((RegSave & 0xfff0u) != 0)
- EmitInt16(UNWIND_OPCODE_POP_REG_MASK_R4 | (RegSave >> 4));
+ EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_REG_MASK_R4 | (RegSave >> 4));
// Opcode to save register r3-r0
if ((RegSave & 0x000fu) != 0)
- EmitInt16(UNWIND_OPCODE_POP_REG_MASK | (RegSave & 0x000fu));
+ EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_REG_MASK | (RegSave & 0x000fu));
}
/// Emit unwind opcodes for .vsave directives
@@ -125,7 +125,7 @@ void UnwindOpcodeAssembler::EmitVFPRegSave(uint32_t VFPRegSave) {
Bit >>= 1;
}
- EmitInt16(UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 |
+ EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 |
((i - 16) << 4) | Range);
}
@@ -147,34 +147,36 @@ void UnwindOpcodeAssembler::EmitVFPRegSave(uint32_t VFPRegSave) {
Bit >>= 1;
}
- EmitInt16(UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD | (i << 4) | Range);
+ EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD | (i << 4) |
+ Range);
}
}
/// Emit unwind opcodes to copy address from source register to $sp.
void UnwindOpcodeAssembler::EmitSetSP(uint16_t Reg) {
- EmitInt8(UNWIND_OPCODE_SET_VSP | Reg);
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_SET_VSP | Reg);
}
/// Emit unwind opcodes to add $sp with an offset.
void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) {
if (Offset > 0x200) {
uint8_t Buff[16];
- Buff[0] = UNWIND_OPCODE_INC_VSP_ULEB128;
+ Buff[0] = ARM::EHABI::UNWIND_OPCODE_INC_VSP_ULEB128;
size_t ULEBSize = encodeULEB128((Offset - 0x204) >> 2, Buff + 1);
EmitBytes(Buff, ULEBSize + 1);
} else if (Offset > 0) {
if (Offset > 0x100) {
- EmitInt8(UNWIND_OPCODE_INC_VSP | 0x3fu);
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP | 0x3fu);
Offset -= 0x100;
}
- EmitInt8(UNWIND_OPCODE_INC_VSP | static_cast<uint8_t>((Offset - 4) >> 2));
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP |
+ static_cast<uint8_t>((Offset - 4) >> 2));
} else if (Offset < 0) {
while (Offset < -0x100) {
- EmitInt8(UNWIND_OPCODE_DEC_VSP | 0x3fu);
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_DEC_VSP | 0x3fu);
Offset += 0x100;
}
- EmitInt8(UNWIND_OPCODE_DEC_VSP |
+ EmitInt8(ARM::EHABI::UNWIND_OPCODE_DEC_VSP |
static_cast<uint8_t>(((-Offset) - 4) >> 2));
}
}
@@ -186,20 +188,23 @@ void UnwindOpcodeAssembler::Finalize(unsigned &PersonalityIndex,
if (HasPersonality) {
// User-specifed personality routine: [ SIZE , OP1 , OP2 , ... ]
- PersonalityIndex = NUM_PERSONALITY_INDEX;
+ PersonalityIndex = ARM::EHABI::NUM_PERSONALITY_INDEX;
size_t TotalSize = Ops.size() + 1;
size_t RoundUpSize = (TotalSize + 3) / 4 * 4;
Result.resize(RoundUpSize);
OpStreamer.EmitSize(RoundUpSize);
} else {
- if (Ops.size() <= 3) {
+ // If no personality index is specified, select one based on the opcode count
+ if (PersonalityIndex == ARM::EHABI::NUM_PERSONALITY_INDEX)
+ PersonalityIndex = (Ops.size() <= 3) ? ARM::EHABI::AEABI_UNWIND_CPP_PR0
+ : ARM::EHABI::AEABI_UNWIND_CPP_PR1;
+ if (PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0) {
// __aeabi_unwind_cpp_pr0: [ 0x80 , OP1 , OP2 , OP3 ]
- PersonalityIndex = AEABI_UNWIND_CPP_PR0;
+ assert(Ops.size() <= 3 && "too many opcodes for __aeabi_unwind_cpp_pr0");
Result.resize(4);
OpStreamer.EmitPersonalityIndex(PersonalityIndex);
} else {
- // __aeabi_unwind_cpp_pr1: [ 0x81 , SIZE , OP1 , OP2 , ... ]
- PersonalityIndex = AEABI_UNWIND_CPP_PR1;
+ // __aeabi_unwind_cpp_pr{1,2}: [ {0x81,0x82} , SIZE , OP1 , OP2 , ... ]
size_t TotalSize = Ops.size() + 2;
size_t RoundUpSize = (TotalSize + 3) / 4 * 4;
Result.resize(RoundUpSize);
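EmitSPOffset above splits positive SP adjustments at 0x200 bytes: small offsets use one or two INC_VSP bytes, larger ones switch to the 0xb2 (INC_VSP_ULEB128) form with ((Offset - 0x204) >> 2) as the payload. A standalone sketch of that positive-offset path (negative offsets and LLVM's encodeULEB128 helper are left out; a trivial ULEB128 loop is inlined instead):

#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<uint8_t> encodeSPAdd(int64_t Offset) {
  std::vector<uint8_t> Out;
  if (Offset > 0x200) {
    Out.push_back(0xb2);                    // UNWIND_OPCODE_INC_VSP_ULEB128
    uint64_t V = uint64_t(Offset - 0x204) >> 2;
    do {                                    // minimal ULEB128 encoder
      uint8_t B = V & 0x7f;
      V >>= 7;
      Out.push_back(V ? (B | 0x80) : B);
    } while (V);
  } else if (Offset > 0) {
    if (Offset > 0x100) {                   // one INC_VSP byte covers 0x100 bytes
      Out.push_back(0x00 | 0x3f);
      Offset -= 0x100;
    }
    Out.push_back(0x00 | uint8_t((Offset - 4) >> 2));
  }
  return Out;
}

int main() {
  for (uint8_t B : encodeSPAdd(0x208))      // prints: b2 01
    std::printf("%02x ", B);
  std::printf("\n");
  return 0;
}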
diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
index ac67c6e..cd58759 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -15,10 +15,8 @@
#ifndef ARM_UNWIND_OP_ASM_H
#define ARM_UNWIND_OP_ASM_H
-#include "ARMUnwindOp.h"
-
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ARMEHABI.h"
#include "llvm/Support/DataTypes.h"
namespace llvm {
@@ -45,7 +43,7 @@ public:
HasPersonality = 0;
}
- /// Set the personality index
+ /// Set the personality
void setPersonality(const MCSymbol *Per) {
HasPersonality = 1;
}
@@ -62,6 +60,12 @@ public:
/// Emit unwind opcodes to add $sp with an offset.
void EmitSPOffset(int64_t Offset);
+ /// Emit unwind raw opcodes
+ void EmitRaw(const SmallVectorImpl<uint8_t> &Opcodes) {
+ Ops.insert(Ops.end(), Opcodes.begin(), Opcodes.end());
+ OpBegins.push_back(OpBegins.back() + Opcodes.size());
+ }
+
/// Finalize the unwind opcode sequence for EmitBytes()
void Finalize(unsigned &PersonalityIndex,
SmallVectorImpl<uint8_t> &Result);
diff --git a/lib/Target/ARM/MCTargetDesc/Android.mk b/lib/Target/ARM/MCTargetDesc/Android.mk
index faa482c..074d29e 100644
--- a/lib/Target/ARM/MCTargetDesc/Android.mk
+++ b/lib/Target/ARM/MCTargetDesc/Android.mk
@@ -16,6 +16,7 @@ arm_mc_desc_SRC_FILES := \
ARMMCTargetDesc.cpp \
ARMMachObjectWriter.cpp \
ARMMachORelocationInfo.cpp \
+ ARMTargetStreamer.cpp \
ARMUnwindOpAsm.cpp
# For the host
@@ -40,6 +41,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device only
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -56,3 +58,4 @@ include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(LLVM_GEN_INTRINSICS_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
index bab59f4..06812d4 100644
--- a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
@@ -8,10 +8,7 @@ add_llvm_library(LLVMARMDesc
ARMMCTargetDesc.cpp
ARMMachObjectWriter.cpp
ARMELFObjectWriter.cpp
+ ARMTargetStreamer.cpp
ARMUnwindOpAsm.cpp
ARMMachORelocationInfo.cpp
)
-add_dependencies(LLVMARMDesc ARMCommonTableGen)
-
-# Hack: we need to include 'main' target directory to grab private headers
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index 2e266c2..80af859 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -40,9 +40,9 @@ namespace {
static char ID;
MLxExpansion() : MachineFunctionPass(ID) {}
- virtual bool runOnMachineFunction(MachineFunction &Fn);
+ bool runOnMachineFunction(MachineFunction &Fn) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "ARM MLA / MLS expansion pass";
}
@@ -120,7 +120,7 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
return Reg;
MachineBasicBlock *MBB = MI->getParent();
- MachineInstr *UseMI = &*MRI->use_nodbg_begin(Reg);
+ MachineInstr *UseMI = &*MRI->use_instr_nodbg_begin(Reg);
if (UseMI->getParent() != MBB)
return Reg;
@@ -129,7 +129,7 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
!MRI->hasOneNonDBGUse(Reg))
return Reg;
- UseMI = &*MRI->use_nodbg_begin(Reg);
+ UseMI = &*MRI->use_instr_nodbg_begin(Reg);
if (UseMI->getParent() != MBB)
return Reg;
}
@@ -312,9 +312,9 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
dbgs() << "Expanding: " << *MI;
dbgs() << " to:\n";
MachineBasicBlock::iterator MII = MI;
- MII = llvm::prior(MII);
+ MII = std::prev(MII);
MachineInstr &MI2 = *MII;
- MII = llvm::prior(MII);
+ MII = std::prev(MII);
MachineInstr &MI1 = *MII;
dbgs() << " " << MI1;
dbgs() << " " << MI2;
@@ -335,7 +335,7 @@ bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
while (MII != E) {
MachineInstr *MI = &*MII;
- if (MI->isLabel() || MI->isImplicitDef() || MI->isCopy()) {
+ if (MI->isPosition() || MI->isImplicitDef() || MI->isCopy()) {
++MII;
continue;
}
@@ -385,11 +385,8 @@ bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
isSwift = STI->isSwift();
bool Modified = false;
- for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
- ++MFI) {
- MachineBasicBlock &MBB = *MFI;
+ for (MachineBasicBlock &MBB : Fn)
Modified |= ExpandFPMLxInstructions(MBB);
- }
return Modified;
}
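The MLxExpansion changes above are mostly mechanical modernization: llvm::prior becomes std::prev, and the hand-written iterator loop over the function's basic blocks becomes a range-based for. A standalone sketch of the same two patterns on a plain std::list (not LLVM types):

#include <cstdio>
#include <iterator>
#include <list>

int main() {
  std::list<int> Blocks = {1, 2, 3};

  auto It = Blocks.end();
  It = std::prev(It);                 // was: llvm::prior(It)
  std::printf("last = %d\n", *It);    // 3

  int Sum = 0;
  for (int MBB : Blocks)              // was an explicit iterator loop
    Sum += MBB;
  std::printf("sum = %d\n", Sum);     // 6
  return 0;
}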
diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
index fa5681f..e464671 100644
--- a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
+++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -7,17 +7,22 @@
//
//===----------------------------------------------------------------------===//
-#include "ARM.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
-Target llvm::TheARMTarget, llvm::TheThumbTarget;
+Target llvm::TheARMLETarget, llvm::TheARMBETarget;
+Target llvm::TheThumbLETarget, llvm::TheThumbBETarget;
extern "C" void LLVMInitializeARMTargetInfo() {
RegisterTarget<Triple::arm, /*HasJIT=*/true>
- X(TheARMTarget, "arm", "ARM");
+ X(TheARMLETarget, "arm", "ARM");
+ RegisterTarget<Triple::armeb, /*HasJIT=*/true>
+ Y(TheARMBETarget, "armeb", "ARM (big endian)");
RegisterTarget<Triple::thumb, /*HasJIT=*/true>
- Y(TheThumbTarget, "thumb", "Thumb");
+ A(TheThumbLETarget, "thumb", "Thumb");
+ RegisterTarget<Triple::thumbeb, /*HasJIT=*/true>
+ B(TheThumbBETarget, "thumbeb", "Thumb (big endian)");
}
diff --git a/lib/Target/ARM/TargetInfo/Android.mk b/lib/Target/ARM/TargetInfo/Android.mk
index de4416e..e31c2b8 100644
--- a/lib/Target/ARM/TargetInfo/Android.mk
+++ b/lib/Target/ARM/TargetInfo/Android.mk
@@ -32,6 +32,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -51,3 +52,4 @@ LOCAL_MODULE_TAGS := optional
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/ARM/TargetInfo/CMakeLists.txt b/lib/Target/ARM/TargetInfo/CMakeLists.txt
index 533e747..03393a0 100644
--- a/lib/Target/ARM/TargetInfo/CMakeLists.txt
+++ b/lib/Target/ARM/TargetInfo/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMARMInfo
ARMTargetInfo.cpp
)
-
-add_dependencies(LLVMARMInfo ARMCommonTableGen)
diff --git a/lib/Target/ARM/TargetInfo/LLVMBuild.txt b/lib/Target/ARM/TargetInfo/LLVMBuild.txt
index a07a940..cce6cc7 100644
--- a/lib/Target/ARM/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/ARM/TargetInfo/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = ARMInfo
parent = ARM
-required_libraries = MC Support Target
+required_libraries = Support
add_to_library_groups = ARM
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index cfb33f5..2224652 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -16,6 +16,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
@@ -83,6 +84,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo *MFI = MF.getFrameInfo();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
const Thumb1RegisterInfo *RegInfo =
static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo());
const Thumb1InstrInfo &TII =
@@ -91,10 +94,13 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
unsigned NumBytes = MFI->getStackSize();
+ assert(NumBytes >= ArgRegsSaveSize &&
+ "ArgRegsSaveSize is included in NumBytes");
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
unsigned BasePtr = RegInfo->getBaseRegister();
+ int CFAOffset = 0;
// Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
NumBytes = (NumBytes + 3) & ~3;
@@ -105,14 +111,26 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
int FramePtrSpillFI = 0;
- if (ArgRegsSaveSize)
+ if (ArgRegsSaveSize) {
emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
MachineInstr::FrameSetup);
+ CFAOffset -= ArgRegsSaveSize;
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
if (!AFI->hasStackFrame()) {
- if (NumBytes != 0)
- emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
+ if (NumBytes - ArgRegsSaveSize != 0) {
+ emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -(NumBytes - ArgRegsSaveSize),
MachineInstr::FrameSetup);
+ CFAOffset -= NumBytes - ArgRegsSaveSize;
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
return;
}
@@ -120,6 +138,15 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
unsigned Reg = CSI[i].getReg();
int FI = CSI[i].getFrameIdx();
switch (Reg) {
+ case ARM::R8:
+ case ARM::R9:
+ case ARM::R10:
+ case ARM::R11:
+ if (STI.isTargetMachO()) {
+ GPRCS2Size += 4;
+ break;
+ }
+ // fallthrough
case ARM::R4:
case ARM::R5:
case ARM::R6:
@@ -129,17 +156,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
FramePtrSpillFI = FI;
GPRCS1Size += 4;
break;
- case ARM::R8:
- case ARM::R9:
- case ARM::R10:
- case ARM::R11:
- if (Reg == FramePtr)
- FramePtrSpillFI = FI;
- if (STI.isTargetIOS())
- GPRCS2Size += 4;
- else
- GPRCS1Size += 4;
- break;
default:
DPRCSSize += 8;
}
@@ -152,7 +168,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
}
// Determine starting offsets of spill areas.
- unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+ unsigned DPRCSOffset = NumBytes - ArgRegsSaveSize - (GPRCS1Size + GPRCS2Size + DPRCSSize);
unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
bool HasFP = hasFP(MF);
@@ -165,27 +181,89 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
NumBytes = DPRCSOffset;
int FramePtrOffsetInBlock = 0;
- if (tryFoldSPUpdateIntoPushPop(MF, prior(MBBI), NumBytes)) {
+ unsigned adjustedGPRCS1Size = GPRCS1Size;
+ if (tryFoldSPUpdateIntoPushPop(STI, MF, std::prev(MBBI), NumBytes)) {
FramePtrOffsetInBlock = NumBytes;
+ adjustedGPRCS1Size += NumBytes;
NumBytes = 0;
}
+ if (adjustedGPRCS1Size) {
+ CFAOffset -= adjustedGPRCS1Size;
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+ E = CSI.end(); I != E; ++I) {
+ unsigned Reg = I->getReg();
+ int FI = I->getFrameIdx();
+ switch (Reg) {
+ case ARM::R8:
+ case ARM::R9:
+ case ARM::R10:
+ case ARM::R11:
+ case ARM::R12:
+ if (STI.isTargetMachO())
+ break;
+ // fallthrough
+ case ARM::R0:
+ case ARM::R1:
+ case ARM::R2:
+ case ARM::R3:
+ case ARM::R4:
+ case ARM::R5:
+ case ARM::R6:
+ case ARM::R7:
+ case ARM::LR:
+ unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(Reg, true), MFI->getObjectOffset(FI)));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ break;
+ }
+ }
+
+
// Adjust FP so it point to the stack slot that contains the previous FP.
if (HasFP) {
- FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
+ FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI)
+ + GPRCS1Size + ArgRegsSaveSize;
AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
.addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4)
.setMIFlags(MachineInstr::FrameSetup));
+ if(FramePtrOffsetInBlock) {
+ CFAOffset += FramePtrOffsetInBlock;
+ unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa(
+ nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ } else {
+ unsigned CFIIndex =
+ MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+ nullptr, MRI->getDwarfRegNum(FramePtr, true)));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
if (NumBytes > 508)
// If offset is > 508 then sp cannot be adjusted in a single instruction,
// try restoring from fp instead.
AFI->setShouldRestoreSPFromFP(true);
}
- if (NumBytes)
+ if (NumBytes) {
// Insert it after all the callee-save spills.
emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
MachineInstr::FrameSetup);
+ if (!HasFP) {
+ CFAOffset -= NumBytes;
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
if (STI.isTargetELF() && HasFP)
MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
@@ -248,12 +326,14 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
int NumBytes = (int)MFI->getStackSize();
+ assert((unsigned)NumBytes >= ArgRegsSaveSize &&
+ "ArgRegsSaveSize is included in NumBytes");
const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
if (!AFI->hasStackFrame()) {
- if (NumBytes != 0)
- emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
+ if (NumBytes - ArgRegsSaveSize != 0)
+ emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes - ArgRegsSaveSize);
} else {
// Unwind MBBI to point to first LDR / VLDRD.
if (MBBI != MBB.begin()) {
@@ -267,7 +347,8 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
// Move SP to start of FP callee save spill area.
NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
AFI->getGPRCalleeSavedArea2Size() +
- AFI->getDPRCalleeSavedAreaSize());
+ AFI->getDPRCalleeSavedAreaSize() +
+ ArgRegsSaveSize);
if (AFI->shouldRestoreSPFromFP()) {
NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
@@ -289,11 +370,11 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
} else {
if (MBBI->getOpcode() == ARM::tBX_RET &&
&MBB.front() != MBBI &&
- prior(MBBI)->getOpcode() == ARM::tPOP) {
- MachineBasicBlock::iterator PMBBI = prior(MBBI);
- if (!tryFoldSPUpdateIntoPushPop(MF, PMBBI, NumBytes))
+ std::prev(MBBI)->getOpcode() == ARM::tPOP) {
+ MachineBasicBlock::iterator PMBBI = std::prev(MBBI);
+ if (!tryFoldSPUpdateIntoPushPop(STI, MF, PMBBI, NumBytes))
emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
- } else if (!tryFoldSPUpdateIntoPushPop(MF, MBBI, NumBytes))
+ } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes))
emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
}
}
@@ -304,9 +385,9 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
// we need to update the SP after popping the value. Therefore, we
// pop the old LR into R3 as a temporary.
- // Move back past the callee-saved register restoration
- while (MBBI != MBB.end() && isCSRestore(MBBI, CSRegs))
- ++MBBI;
+ // Get the last instruction, tBX_RET
+ MBBI = MBB.getLastNonDebugInstr();
+ assert (MBBI->getOpcode() == ARM::tBX_RET);
// Epilogue for vararg functions: pop LR to R3 and branch off it.
AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
.addReg(ARM::R3, RegState::Define);
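The new prologue code above keeps a running CFAOffset and emits a .cfi_def_cfa_offset after each stack-pointer adjustment, since every SP decrement grows the distance from SP back to the call frame address. A standalone sketch of that bookkeeping (the sizes are made-up example values, and the directive is printed with the positive magnitude for readability):

#include <cstdio>

int main() {
  int CFAOffset = 0;                  // running CFA offset, as in the patch

  int ArgRegsSaveSize = 8;            // example: two argument registers spilled
  CFAOffset -= ArgRegsSaveSize;
  std::printf(".cfi_def_cfa_offset %d\n", -CFAOffset);   // 8

  int PushSize = 20;                  // example: push {r4-r7, lr}
  CFAOffset -= PushSize;
  std::printf(".cfi_def_cfa_offset %d\n", -CFAOffset);   // 28

  int Locals = 16;                    // example: remaining local stack area
  CFAOffset -= Locals;
  std::printf(".cfi_def_cfa_offset %d\n", -CFAOffset);   // 44
  return 0;
}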
diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h
index 5a300af..f61874b 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/lib/Target/ARM/Thumb1FrameLowering.h
@@ -14,7 +14,6 @@
#ifndef __THUMB_FRAMEINFO_H_
#define __THUMB_FRAMEINFO_H_
-#include "ARM.h"
#include "ARMFrameLowering.h"
#include "ARMSubtarget.h"
#include "Thumb1InstrInfo.h"
@@ -22,7 +21,6 @@
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
- class ARMSubtarget;
class Thumb1FrameLowering : public ARMFrameLowering {
public:
@@ -32,23 +30,24 @@ public:
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
- void emitPrologue(MachineFunction &MF) const;
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ void emitPrologue(MachineFunction &MF) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const;
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
- bool hasReservedCallFrame(const MachineFunction &MF) const;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const;
+ void
+ eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
};
} // End llvm namespace
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 22a925e..68cbb5c 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "Thumb1InstrInfo.h"
-#include "ARM.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h
index 36af204..c5845b7 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/lib/Target/ARM/Thumb1InstrInfo.h
@@ -14,7 +14,6 @@
#ifndef THUMB1INSTRUCTIONINFO_H
#define THUMB1INSTRUCTIONINFO_H
-#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "Thumb1RegisterInfo.h"
@@ -27,33 +26,33 @@ public:
explicit Thumb1InstrInfo(const ARMSubtarget &STI);
/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
- void getNoopForMachoTarget(MCInst &NopInst) const;
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
// Return the non-pre/post incrementing version of 'Opc'. Return 0
// if there is not such an opcode.
- unsigned getUnindexedOpcode(unsigned Opc) const;
+ unsigned getUnindexedOpcode(unsigned Opc) const override;
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
/// always be able to get register info as well (through this method).
///
- const Thumb1RegisterInfo &getRegisterInfo() const { return RI; }
+ const Thumb1RegisterInfo &getRegisterInfo() const override { return RI; }
void copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const;
+ bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
};
}
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 65a7221..f907b14 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "Thumb1RegisterInfo.h"
-#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
@@ -30,7 +29,6 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -421,7 +419,7 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Mask);
}
Offset = (Offset - Mask * Scale);
- MachineBasicBlock::iterator NII = llvm::next(II);
+ MachineBasicBlock::iterator NII = std::next(II);
emitThumbRegPlusImmediate(MBB, NII, dl, DestReg, DestReg, Offset, TII,
*this);
} else {
@@ -484,10 +482,8 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
return Offset == 0;
}
-void
-Thumb1RegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
- unsigned BaseReg, int64_t Offset) const {
- MachineInstr &MI = *I;
+void Thumb1RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const {
const ARMBaseInstrInfo &TII =
*static_cast<const ARMBaseInstrInfo*>(
MI.getParent()->getParent()->getTarget().getInstrInfo());
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index 9689b23..93e2b5a 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -15,7 +15,6 @@
#ifndef THUMB1REGISTERINFO_H
#define THUMB1REGISTERINFO_H
-#include "ARM.h"
#include "ARMBaseRegisterInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -27,21 +26,20 @@ struct Thumb1RegisterInfo : public ARMBaseRegisterInfo {
public:
Thumb1RegisterInfo(const ARMSubtarget &STI);
- const TargetRegisterClass*
- getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+ const TargetRegisterClass *
+ getLargestLegalSuperClass(const TargetRegisterClass *RC) const override;
- const TargetRegisterClass*
- getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
- void emitLoadConstPool(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- DebugLoc dl,
- unsigned DestReg, unsigned SubIdx, int Val,
- ARMCC::CondCodes Pred = ARMCC::AL,
- unsigned PredReg = 0,
- unsigned MIFlags = MachineInstr::NoFlags) const;
+ void
+ emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ DebugLoc dl, unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0,
+ unsigned MIFlags = MachineInstr::NoFlags) const override;
// rewrite MI to access 'Offset' bytes from the FP. Update Offset to be
// however much remains to be handled. Return 'true' if no further
@@ -49,16 +47,16 @@ public:
bool rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
unsigned FrameReg, int &Offset,
const ARMBaseInstrInfo &TII) const;
- void resolveFrameIndex(MachineBasicBlock::iterator I,
- unsigned BaseReg, int64_t Offset) const;
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const override;
bool saveScavengerRegister(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator &UseMI,
const TargetRegisterClass *RC,
- unsigned Reg) const;
+ unsigned Reg) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
- RegScavenger *RS = NULL) const;
+ RegScavenger *RS = NULL) const override;
};
}
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 0b7d3bb..406dbe0 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -33,9 +33,9 @@ namespace {
const TargetRegisterInfo *TRI;
ARMFunctionInfo *AFI;
- virtual bool runOnMachineFunction(MachineFunction &Fn);
+ bool runOnMachineFunction(MachineFunction &Fn) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "Thumb IT blocks insertion pass";
}
@@ -242,7 +242,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
// Finalize the bundle.
MachineBasicBlock::instr_iterator LI = LastITMI;
- finalizeBundle(MBB, InsertPos.getInstrIterator(), llvm::next(LI));
+ finalizeBundle(MBB, InsertPos.getInstrIterator(), std::next(LI));
Modified = true;
++NumITs;
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 91788ac..a9df006 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "Thumb2InstrInfo.h"
-#include "ARM.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h
index 2cdcd06..34d45d3 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/lib/Target/ARM/Thumb2InstrInfo.h
@@ -14,7 +14,6 @@
#ifndef THUMB2INSTRUCTIONINFO_H
#define THUMB2INSTRUCTIONINFO_H
-#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "Thumb2RegisterInfo.h"
@@ -28,40 +27,40 @@ public:
explicit Thumb2InstrInfo(const ARMSubtarget &STI);
/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
- void getNoopForMachoTarget(MCInst &NopInst) const;
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
// Return the non-pre/post incrementing version of 'Opc'. Return 0
// if there is not such an opcode.
- unsigned getUnindexedOpcode(unsigned Opc) const;
+ unsigned getUnindexedOpcode(unsigned Opc) const override;
void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
- MachineBasicBlock *NewDest) const;
+ MachineBasicBlock *NewDest) const override;
bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) const;
+ MachineBasicBlock::iterator MBBI) const override;
void copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const;
+ bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
/// always be able to get register info as well (through this method).
///
- const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
+ const Thumb2RegisterInfo &getRegisterInfo() const override { return RI; }
};
/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp
index 4cb827f..782d81f 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp
@@ -14,7 +14,6 @@
#include "Thumb2RegisterInfo.h"
#include "ARM.h"
-#include "ARMBaseInstrInfo.h"
#include "ARMSubtarget.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -22,6 +21,8 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMSubtarget &sti)
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.h b/lib/Target/ARM/Thumb2RegisterInfo.h
index b1d63fa..8a33e6c 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.h
+++ b/lib/Target/ARM/Thumb2RegisterInfo.h
@@ -15,9 +15,7 @@
#ifndef THUMB2REGISTERINFO_H
#define THUMB2REGISTERINFO_H
-#include "ARM.h"
#include "ARMBaseRegisterInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
namespace llvm {
@@ -29,13 +27,11 @@ public:
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
- void emitLoadConstPool(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- DebugLoc dl,
- unsigned DestReg, unsigned SubIdx, int Val,
- ARMCC::CondCodes Pred = ARMCC::AL,
- unsigned PredReg = 0,
- unsigned MIFlags = MachineInstr::NoFlags) const;
+ void
+ emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ DebugLoc dl, unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0,
+ unsigned MIFlags = MachineInstr::NoFlags) const override;
};
}
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 4795aae..04b83fb 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -10,7 +10,6 @@
#define DEBUG_TYPE "t2-reduce-size"
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
-#include "ARMBaseRegisterInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "Thumb2InstrInfo.h"
@@ -23,7 +22,7 @@
#include "llvm/IR/Function.h" // To access Function attributes
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones");
@@ -137,9 +136,9 @@ namespace {
const Thumb2InstrInfo *TII;
const ARMSubtarget *STI;
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "Thumb2 instruction size reduction pass";
}
@@ -256,8 +255,7 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
return HighLatencyCPSR || FirstInSelfLoop;
SmallSet<unsigned, 2> Defs;
- for (unsigned i = 0, e = CPSRDef->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = CPSRDef->getOperand(i);
+ for (const MachineOperand &MO : CPSRDef->operands()) {
if (!MO.isReg() || MO.isUndef() || MO.isUse())
continue;
unsigned Reg = MO.getReg();
@@ -266,8 +264,7 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
Defs.insert(Reg);
}
- for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = Use->getOperand(i);
+ for (const MachineOperand &MO : Use->operands()) {
if (!MO.isReg() || MO.isUndef() || MO.isDef())
continue;
unsigned Reg = MO.getReg();
@@ -858,8 +855,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) {
bool HasDef = false;
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI.getOperand(i);
+ for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || MO.isUndef() || MO.isUse())
continue;
if (MO.getReg() != ARM::CPSR)
@@ -874,8 +870,7 @@ static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) {
}
static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) {
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI.getOperand(i);
+ for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || MO.isUndef() || MO.isDef())
continue;
if (MO.getReg() != ARM::CPSR)
@@ -945,7 +940,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),E = MBB.instr_end();
MachineBasicBlock::instr_iterator NextMII;
for (; MII != E; MII = NextMII) {
- NextMII = llvm::next(MII);
+ NextMII = std::next(MII);
MachineInstr *MI = &*MII;
if (MI->isBundle()) {
@@ -962,7 +957,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop)) {
Modified = true;
- MachineBasicBlock::instr_iterator I = prior(NextMII);
+ MachineBasicBlock::instr_iterator I = std::prev(NextMII);
MI = &*I;
// Removing and reinserting the first instruction in a bundle will break
// up the bundle. Fix the bundling if it was broken.
@@ -980,6 +975,9 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
MachineOperand *MO = BundleMI->findRegisterDefOperand(ARM::CPSR);
if (MO && !MO->isDead())
LiveCPSR = true;
+ MO = BundleMI->findRegisterUseOperand(ARM::CPSR);
+ if (MO && !MO->isKill())
+ LiveCPSR = true;
}
bool DefCPSR = false;
@@ -1012,8 +1010,7 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
AttributeSet FnAttrs = MF.getFunction()->getAttributes();
OptimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
Attribute::OptimizeForSize);
- MinimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::MinSize);
+ MinimizeSize = STI->isMinSize();
BlockInfo.clear();
BlockInfo.resize(MF.getNumBlockIDs());
diff --git a/lib/Target/ARM64/ARM64.h b/lib/Target/ARM64/ARM64.h
new file mode 100644
index 0000000..f2c5e60
--- /dev/null
+++ b/lib/Target/ARM64/ARM64.h
@@ -0,0 +1,48 @@
+//===-- ARM64.h - Top-level interface for ARM64 representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// ARM64 back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ARM64_H
+#define TARGET_ARM64_H
+
+#include "MCTargetDesc/ARM64BaseInfo.h"
+#include "MCTargetDesc/ARM64MCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class ARM64TargetMachine;
+class FunctionPass;
+class MachineFunctionPass;
+
+FunctionPass *createARM64DeadRegisterDefinitions();
+FunctionPass *createARM64ConditionalCompares();
+FunctionPass *createARM64AdvSIMDScalar();
+FunctionPass *createARM64BranchRelaxation();
+FunctionPass *createARM64ISelDag(ARM64TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+FunctionPass *createARM64StorePairSuppressPass();
+FunctionPass *createARM64ExpandPseudoPass();
+FunctionPass *createARM64LoadStoreOptimizationPass();
+ModulePass *createARM64PromoteConstantPass();
+FunctionPass *createARM64AddressTypePromotionPass();
+/// \brief Creates an ARM-specific Target Transformation Info pass.
+ImmutablePass *createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM);
+
+FunctionPass *createARM64CleanupLocalDynamicTLSPass();
+
+FunctionPass *createARM64CollectLOHPass();
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM64/ARM64.td b/lib/Target/ARM64/ARM64.td
new file mode 100644
index 0000000..3eef8b2
--- /dev/null
+++ b/lib/Target/ARM64/ARM64.td
@@ -0,0 +1,95 @@
+//===- ARM64.td - Describe the ARM64 Target Machine --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// ARM64 Subtarget features.
+//
+
+/// Cyclone has register move instructions which are "free".
+def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
+ "Has zereo-cycle register moves">;
+
+/// Cyclone has instructions which zero registers for "free".
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+ "Has zero-cycle zeroing instructions">;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "ARM64RegisterInfo.td"
+include "ARM64CallingConvention.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "ARM64Schedule.td"
+include "ARM64InstrInfo.td"
+
+def ARM64InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// ARM64 Processors supported.
+//
+include "ARM64SchedCyclone.td"
+
+def : ProcessorModel<"arm64-generic", NoSchedModel, []>;
+
+def : ProcessorModel<"cyclone", CycloneModel, [FeatureZCRegMove, FeatureZCZeroing]>;
+
+//===----------------------------------------------------------------------===//
+// Assembly parser
+//===----------------------------------------------------------------------===//
+
+def GenericAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+ string Name = "generic";
+}
+
+def AppleAsmParserVariant : AsmParserVariant {
+ int Variant = 1;
+ string Name = "apple-neon";
+}
+
+//===----------------------------------------------------------------------===//
+// Assembly printer
+//===----------------------------------------------------------------------===//
+// ARM64 uses the MC printer for asm output, so make sure the TableGen
+// AsmWriter bits get associated with the correct class.
+def GenericAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ int Variant = 0;
+ bit isMCAsmWriter = 1;
+}
+
+def AppleAsmWriter : AsmWriter {
+ let AsmWriterClassName = "AppleInstPrinter";
+ int Variant = 1;
+ int isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def ARM64 : Target {
+ let InstructionSet = ARM64InstrInfo;
+ let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant];
+ let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter];
+}
diff --git a/lib/Target/ARM64/ARM64AddressTypePromotion.cpp b/lib/Target/ARM64/ARM64AddressTypePromotion.cpp
new file mode 100644
index 0000000..72fa6af
--- /dev/null
+++ b/lib/Target/ARM64/ARM64AddressTypePromotion.cpp
@@ -0,0 +1,496 @@
+
+//===-- ARM64AddressTypePromotion.cpp --- Promote type for addr accesses -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to promote the computations used to obtain a sign extended
+// value that is then used in memory accesses.
+// E.g.
+// a = add nsw i32 b, 3
+// d = sext i32 a to i64
+// e = getelementptr ..., i64 d
+//
+// =>
+// f = sext i32 b to i64
+// a = add nsw i64 f, 3
+// e = getelementptr ..., i64 a
+//
+// This is legal to do if the computations are marked with either the nsw or
+// nuw flag.
+// Moreover, the current heuristic is simple: it does not create new sext
+// operations, i.e., it gives up when a sext would have forked (e.g., if
+// a = add i32 b, c, two sexts are required to promote the computation).
+//
+// FIXME: This pass may be useful for other targets too.
+// ===---------------------------------------------------------------------===//
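An illustrative sketch (not taken from this patch): the legality claim above — a sext may only be hoisted over an add that carries the nsw flag — can be checked with plain integer arithmetic. The helper names addThenSext and sextThenAdd below are hypothetical and exist only for this example; assuming the 32-bit add does not wrap, both orderings give the same 64-bit value:

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    // Models "sext i32 (add nsw i32 b, 3) to i64" versus "add i64 (sext b), 3".
    int64_t addThenSext(int32_t b) { return static_cast<int64_t>(b + 3); } // assumes b + 3 cannot wrap
    int64_t sextThenAdd(int32_t b) { return static_cast<int64_t>(b) + 3; }

    int main() {
      for (int32_t b : {0, 42, -7, 1 << 20}) // values far from INT32_MAX, so no signed wrap
        assert(addThenSext(b) == sextThenAdd(b));
      return 0;
    }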
+
+#define DEBUG_TYPE "arm64-type-promotion"
+#include "ARM64.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+EnableAddressTypePromotion("arm64-type-promotion", cl::Hidden,
+ cl::desc("Enable the type promotion pass"),
+ cl::init(true));
+static cl::opt<bool>
+EnableMerge("arm64-type-promotion-merge", cl::Hidden,
+ cl::desc("Enable merging of redundant sexts when one is dominating"
+ " the other."),
+ cl::init(true));
+
+//===----------------------------------------------------------------------===//
+// ARM64AddressTypePromotion
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+void initializeARM64AddressTypePromotionPass(PassRegistry &);
+}
+
+namespace {
+class ARM64AddressTypePromotion : public FunctionPass {
+
+public:
+ static char ID;
+ ARM64AddressTypePromotion()
+ : FunctionPass(ID), Func(NULL), ConsideredSExtType(NULL) {
+ initializeARM64AddressTypePromotionPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual const char *getPassName() const {
+ return "ARM64 Address Type Promotion";
+ }
+
+ /// Iterate over the function and promote the computation of interesting
+ /// sext instructions.
+ bool runOnFunction(Function &F);
+
+private:
+ /// The current function.
+ Function *Func;
+ /// Filter out all sexts that do not have this type.
+ /// Currently initialized with Int64Ty.
+ Type *ConsideredSExtType;
+
+ // This transformation requires dominator info.
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ typedef SmallPtrSet<Instruction *, 32> SetOfInstructions;
+ typedef SmallVector<Instruction *, 16> Instructions;
+ typedef DenseMap<Value *, Instructions> ValueToInsts;
+
+ /// Check if it is profitable to move a sext through this instruction.
+ /// Currently, we consider it profitable if:
+ /// - Inst is used only once (no need to insert truncate).
+ /// - Inst has only one operand that will require a sext operation (we do
+ /// not create new sext operations).
+ bool shouldGetThrough(const Instruction *Inst);
+
+ /// Check if it is possible and legal to move a sext through this
+ /// instruction.
+ /// Current heuristic considers that we can get through:
+ /// - Arithmetic operation marked with the nsw or nuw flag.
+ /// - Other sext operation.
+ /// - Truncate operation if it was just dropping sign extended bits.
+ bool canGetThrough(const Instruction *Inst);
+
+ /// Move sext operations through safe to sext instructions.
+ bool propagateSignExtension(Instructions &SExtInsts);
+
+ /// Should this sext be considered for code motion?
+ /// We look for a sext with ConsideredSExtType and uses in at least one
+ /// GetElementPtrInst.
+ bool shouldConsiderSExt(const Instruction *SExt) const;
+
+ /// Collect all interesting sext operations, i.e., the ones with the right
+ /// type and used in memory accesses.
+ /// More precisely, a sext instruction is considered as interesting if it
+ /// is used in a "complex" getelementptr or there exists at least one other
+ /// sext instruction that sign extended the same initial value.
+ /// A getelementptr is considered as "complex" if it has more than 2
+ /// operands.
+ void analyzeSExtension(Instructions &SExtInsts);
+
+ /// Merge redundant sign extension operations in common dominator.
+ void mergeSExts(ValueToInsts &ValToSExtendedUses,
+ SetOfInstructions &ToRemove);
+};
+} // end anonymous namespace.
+
+char ARM64AddressTypePromotion::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ARM64AddressTypePromotion, "arm64-type-promotion",
+ "ARM64 Type Promotion Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ARM64AddressTypePromotion, "arm64-type-promotion",
+ "ARM64 Type Promotion Pass", false, false)
+
+FunctionPass *llvm::createARM64AddressTypePromotionPass() {
+ return new ARM64AddressTypePromotion();
+}
+
+bool ARM64AddressTypePromotion::canGetThrough(const Instruction *Inst) {
+ if (isa<SExtInst>(Inst))
+ return true;
+
+ const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+ if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
+ (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap()))
+ return true;
+
+ // sext(trunc(sext)) --> sext
+ if (isa<TruncInst>(Inst) && isa<SExtInst>(Inst->getOperand(0))) {
+ const Instruction *Opnd = cast<Instruction>(Inst->getOperand(0));
+ // Check that the truncate just drops sign extended bits.
+ if (Inst->getType()->getIntegerBitWidth() >=
+ Opnd->getOperand(0)->getType()->getIntegerBitWidth() &&
+ Inst->getOperand(0)->getType()->getIntegerBitWidth() <=
+ ConsideredSExtType->getIntegerBitWidth())
+ return true;
+ }
+
+ return false;
+}
+
+bool ARM64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) {
+ // If the type of the sext is the same as the considered one, this sext
+ // will become useless.
+ // Otherwise, we will have to do something to preserve the original value,
+ // unless it is used once.
+ if (isa<SExtInst>(Inst) &&
+ (Inst->getType() == ConsideredSExtType || Inst->hasOneUse()))
+ return true;
+
+ // If the Inst is used more than once, we may need to insert truncate
+ // operations, and we don't do that at the moment.
+ if (!Inst->hasOneUse())
+ return false;
+
+ // This truncate is used only once, thus if we can get through it, it will
+ // become useless.
+ if (isa<TruncInst>(Inst))
+ return true;
+
+ // If both operands are not constant, a new sext will be created here.
+ // Current heuristic is: each step should be profitable.
+ // Therefore we don't allow increasing the number of sexts, even if it may
+ // be profitable later on.
+ if (isa<BinaryOperator>(Inst) && isa<ConstantInt>(Inst->getOperand(1)))
+ return true;
+
+ return false;
+}
+
+static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) {
+ if (isa<SelectInst>(Inst) && OpIdx == 0)
+ return false;
+ return true;
+}
+
+bool
+ARM64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
+ if (SExt->getType() != ConsideredSExtType)
+ return false;
+
+ for (Value::const_use_iterator UseIt = SExt->use_begin(),
+ EndUseIt = SExt->use_end();
+ UseIt != EndUseIt; ++UseIt) {
+ if (isa<GetElementPtrInst>(*UseIt))
+ return true;
+ }
+
+ return false;
+}
+
+// Input:
+// - SExtInsts contains all the sext instructions that are used directly in
+// GetElementPtrInst, i.e., accesses to memory.
+// Algorithm:
+// - For each sext operation in SExtInsts:
+// Let var be the operand of sext.
+// while it is profitable (see shouldGetThrough), legal, and safe
+// (see canGetThrough) to move sext through var's definition:
+// * promote the type of var's definition.
+// * fold var into sext uses.
+// * move sext above var's definition.
+// * update sext operand to use the operand of var that should be sign
+// extended (by construction there is only one).
+//
+// E.g.,
+// a = ... i32 c, 3
+// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a'
+// ...
+// = b
+// => Yes, update the code
+// b = sext i32 c to i64
+// a = ... i64 b, 3
+// ...
+// = a
+// Iterate on 'c'.
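A minimal sketch (not taken from this patch and using no LLVM types): the walk described above stops at the first definition that is either unsafe to traverse (the role canGetThrough plays) or unprofitable (the role shouldGetThrough plays). The Node struct and promotionDepth function are hypothetical names used only to model that stopping rule:

    #include <cassert>
    #include <vector>

    // One entry per definition in the chain feeding the sext.
    struct Node { bool safe; bool profitable; };

    // How many definitions the sext can be hoisted over.
    unsigned promotionDepth(const std::vector<Node> &Chain) {
      unsigned Depth = 0;
      for (const Node &N : Chain) {
        if (!N.safe || !N.profitable)
          break;
        ++Depth;
      }
      return Depth;
    }

    int main() {
      // add nsw (single use) -> add nsw (single use) -> add without nsw:
      // the sext is hoisted over the first two definitions only.
      assert(promotionDepth({{true, true}, {true, true}, {false, true}}) == 2);
      return 0;
    }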
+bool
+ARM64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
+ DEBUG(dbgs() << "*** Propagate Sign Extension ***\n");
+
+ bool LocalChange = false;
+ SetOfInstructions ToRemove;
+ ValueToInsts ValToSExtendedUses;
+ while (!SExtInsts.empty()) {
+ // Get through simple chain.
+ Instruction *SExt = SExtInsts.pop_back_val();
+
+ DEBUG(dbgs() << "Consider:\n" << *SExt << '\n');
+
+ // If this SExt has already been merged continue.
+ if (SExt->use_empty() && ToRemove.count(SExt)) {
+ DEBUG(dbgs() << "No uses => marked as delete\n");
+ continue;
+ }
+
+ // Now try to get through the chain of definitions.
+ while (isa<Instruction>(SExt->getOperand(0))) {
+ Instruction *Inst = dyn_cast<Instruction>(SExt->getOperand(0));
+ DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n');
+ if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) {
+ // We cannot get through something that is not an Instruction
+ // or not safe to SExt.
+ DEBUG(dbgs() << "Cannot get through\n");
+ break;
+ }
+
+ LocalChange = true;
+ // If this is a sign extend, it becomes useless.
+ if (isa<SExtInst>(Inst) || isa<TruncInst>(Inst)) {
+ DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n");
+ // We cannot use replaceAllUsesWith here because we may trigger some
+ // assertion on the type, as all involved sext operations may not have
+ // been moved yet.
+ while (!Inst->use_empty()) {
+ Value::use_iterator UseIt = Inst->use_begin();
+ Instruction *UseInst = dyn_cast<Instruction>(*UseIt);
+ assert(UseInst && "Use of sext is not an Instruction!");
+ UseInst->setOperand(UseIt->getOperandNo(), SExt);
+ }
+ ToRemove.insert(Inst);
+ SExt->setOperand(0, Inst->getOperand(0));
+ SExt->moveBefore(Inst);
+ continue;
+ }
+
+ // Get through the Instruction:
+ // 1. Update its type.
+ // 2. Replace the uses of SExt by Inst.
+ // 3. Sign extend each operand that needs to be sign extended.
+
+ // Step #1.
+ Inst->mutateType(SExt->getType());
+ // Step #2.
+ SExt->replaceAllUsesWith(Inst);
+ // Step #3.
+ Instruction *SExtForOpnd = SExt;
+
+ DEBUG(dbgs() << "Propagate SExt to operands\n");
+ for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx;
+ ++OpIdx) {
+ DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n');
+ if (Inst->getOperand(OpIdx)->getType() == SExt->getType() ||
+ !shouldSExtOperand(Inst, OpIdx)) {
+ DEBUG(dbgs() << "No need to propagate\n");
+ continue;
+ }
+ // Check if we can statically sign extend the operand.
+ Value *Opnd = Inst->getOperand(OpIdx);
+ if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
+ DEBUG(dbgs() << "Statically sign extend\n");
+ Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(),
+ Cst->getSExtValue()));
+ continue;
+ }
+ // UndefValue are typed, so we have to statically sign extend them.
+ if (isa<UndefValue>(Opnd)) {
+ DEBUG(dbgs() << "Statically sign extend\n");
+ Inst->setOperand(OpIdx, UndefValue::get(SExt->getType()));
+ continue;
+ }
+
+ // Otherwise we have to explicitly sign extend it.
+ assert(SExtForOpnd &&
+ "Only one operand should have been sign extended");
+
+ SExtForOpnd->setOperand(0, Opnd);
+
+ DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n");
+ // Move the sign extension before the insertion point.
+ SExtForOpnd->moveBefore(Inst);
+ Inst->setOperand(OpIdx, SExtForOpnd);
+ // If more sext are required, new instructions will have to be created.
+ SExtForOpnd = NULL;
+ }
+ if (SExtForOpnd == SExt) {
+ DEBUG(dbgs() << "Sign extension is useless now\n");
+ ToRemove.insert(SExt);
+ break;
+ }
+ }
+
+ // If the use is already of the right type, connect its uses to its argument
+ // and delete it.
+ // This can happen for an Instruction whose uses are all sign extended.
+ if (!ToRemove.count(SExt) &&
+ SExt->getType() == SExt->getOperand(0)->getType()) {
+ DEBUG(dbgs() << "Sign extension is useless, attach its use to "
+ "its argument\n");
+ SExt->replaceAllUsesWith(SExt->getOperand(0));
+ ToRemove.insert(SExt);
+ } else
+ ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt);
+ }
+
+ if (EnableMerge)
+ mergeSExts(ValToSExtendedUses, ToRemove);
+
+ // Remove all instructions marked as ToRemove.
+ for (Instruction *I: ToRemove)
+ I->eraseFromParent();
+ return LocalChange;
+}
+
+void ARM64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
+ SetOfInstructions &ToRemove) {
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ for (auto &Entry: ValToSExtendedUses) {
+ Instructions &Insts = Entry.second;
+ Instructions CurPts;
+ for (Instruction *Inst : Insts) {
+ if (ToRemove.count(Inst))
+ continue;
+ bool inserted = false;
+ for (auto Pt : CurPts) {
+ if (DT.dominates(Inst, Pt)) {
+ DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n"
+ << *Inst << '\n');
+ (Pt)->replaceAllUsesWith(Inst);
+ ToRemove.insert(Pt);
+ Pt = Inst;
+ inserted = true;
+ break;
+ }
+ if (!DT.dominates(Pt, Inst))
+ // Give up if we need to merge in a common dominator as the
+ // experiments show it is not profitable.
+ continue;
+
+ DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n"
+ << *Pt << '\n');
+ Inst->replaceAllUsesWith(Pt);
+ ToRemove.insert(Inst);
+ inserted = true;
+ break;
+ }
+ if (!inserted)
+ CurPts.push_back(Inst);
+ }
+ }
+}
+
+void ARM64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
+ DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n");
+
+ DenseMap<Value *, Instruction *> SeenChains;
+
+ for (auto &BB : *Func) {
+ for (auto &II: BB) {
+ Instruction *SExt = &II;
+
+ // Collect all sext operation per type.
+ if (!isa<SExtInst>(SExt) || !shouldConsiderSExt(SExt))
+ continue;
+
+ DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n');
+
+ // Cases where we actually perform the optimization:
+ // 1. SExt is used in a getelementptr with more than 2 operand =>
+ // likely we can merge some computation if they are done on 64 bits.
+ // 2. The beginning of the SExt chain is sign extended several times. =>
+ // code sharing is possible.
+
+ bool insert = false;
+ // #1.
+ for (Value::use_iterator UseIt = SExt->use_begin(),
+ EndUseIt = SExt->use_end();
+ UseIt != EndUseIt; ++UseIt) {
+ const Instruction *Inst = dyn_cast<GetElementPtrInst>(*UseIt);
+ if (Inst && Inst->getNumOperands() > 2) {
+ DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst
+ << '\n');
+ insert = true;
+ break;
+ }
+ }
+
+ // #2.
+ // Check the head of the chain.
+ Instruction *Inst = SExt;
+ Value *Last;
+ do {
+ int OpdIdx = 0;
+ const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+ if (BinOp && isa<ConstantInt>(BinOp->getOperand(0)))
+ OpdIdx = 1;
+ Last = Inst->getOperand(OpdIdx);
+ Inst = dyn_cast<Instruction>(Last);
+ } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst));
+
+ DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n');
+ DenseMap<Value *, Instruction *>::iterator AlreadySeen =
+ SeenChains.find(Last);
+ if (insert || AlreadySeen != SeenChains.end()) {
+ DEBUG(dbgs() << "Insert\n");
+ SExtInsts.push_back(SExt);
+ if (AlreadySeen != SeenChains.end() && AlreadySeen->second != NULL) {
+ DEBUG(dbgs() << "Insert chain member\n");
+ SExtInsts.push_back(AlreadySeen->second);
+ SeenChains[Last] = NULL;
+ }
+ } else {
+ DEBUG(dbgs() << "Record its chain membership\n");
+ SeenChains[Last] = SExt;
+ }
+ }
+ }
+}
+
+bool ARM64AddressTypePromotion::runOnFunction(Function &F) {
+ if (!EnableAddressTypePromotion || F.isDeclaration())
+ return false;
+ Func = &F;
+ ConsideredSExtType = Type::getInt64Ty(Func->getContext());
+
+ DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n');
+
+ Instructions SExtInsts;
+ analyzeSExtension(SExtInsts);
+ return propagateSignExtension(SExtInsts);
+}
diff --git a/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp b/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp
new file mode 100644
index 0000000..83f8cda
--- /dev/null
+++ b/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp
@@ -0,0 +1,392 @@
+//===-- ARM64AdvSIMDScalarPass.cpp - Use AdvSIMD scalar for GPR i64 ops ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// When profitable, replace GPR targeting i64 instructions with their
+// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
+// as minimizing the number of cross-class register copies.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TODO: Graph based predicate heuristics.
+// Walking the instruction list linearly will get many, perhaps most, of
+// the cases, but to do a truly thorough job of this, we need a more
+// holistic approach.
+//
+// This optimization is very similar in spirit to the register allocator's
+// spill placement, only here we're determining where to place cross-class
+// register copies rather than spills. As such, a similar approach is
+// called for.
+//
+// We want to build up a set of graphs of all instructions which are candidates
+// for transformation along with instructions which generate their inputs and
+// consume their outputs. For each edge in the graph, we assign a weight
+// based on whether there is a copy required there (weight zero if not) and
+// the block frequency of the block containing the defining or using
+// instruction, whichever is less. Our optimization is then a graph problem
+// to minimize the total weight of all the graphs, then transform instructions
+// and add or remove copy instructions as called for to implement the
+// solution.
+//===----------------------------------------------------------------------===//
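As a rough illustration of the weighting described in the TODO above (not code from this pass): an edge would cost nothing when no cross-class copy is needed there, and otherwise the smaller of the two block frequencies it connects. The edgeWeight helper below is a hypothetical stand-in for that rule:

    #include <algorithm>
    #include <cassert>

    // Weight of one def->use edge in the copy-placement graph.
    unsigned edgeWeight(bool needsCrossClassCopy, unsigned defBlockFreq,
                        unsigned useBlockFreq) {
      return needsCrossClassCopy ? std::min(defBlockFreq, useBlockFreq) : 0;
    }

    int main() {
      assert(edgeWeight(false, 100, 10) == 0);  // no copy needed, free edge
      assert(edgeWeight(true, 100, 10) == 10);  // copy charged at the colder end
      return 0;
    }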
+
+#define DEBUG_TYPE "arm64-simd-scalar"
+#include "ARM64.h"
+#include "ARM64InstrInfo.h"
+#include "ARM64RegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static cl::opt<bool>
+AdvSIMDScalar("arm64-simd-scalar",
+ cl::desc("enable use of AdvSIMD scalar integer instructions"),
+ cl::init(false), cl::Hidden);
+// Allow forcing all i64 operations with equivalent SIMD instructions to use
+// them. For stress-testing the transformation function.
+static cl::opt<bool>
+TransformAll("arm64-simd-scalar-force-all",
+ cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
+ cl::init(false), cl::Hidden);
+
+STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
+STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
+STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
+
+namespace {
+class ARM64AdvSIMDScalar : public MachineFunctionPass {
+ MachineRegisterInfo *MRI;
+ const ARM64InstrInfo *TII;
+
+private:
+ // isProfitableToTransform - Predicate function to determine whether an
+ // instruction should be transformed to its equivalent AdvSIMD scalar
+ // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
+ bool isProfitableToTransform(const MachineInstr *MI) const;
+
+ // transformInstruction - Perform the transformation of an instruction
+ // to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
+ // to be the correct register class, minimizing cross-class copies.
+ void transformInstruction(MachineInstr *MI);
+
+ // processMachineBasicBlock - Main optimization loop.
+ bool processMachineBasicBlock(MachineBasicBlock *MBB);
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ explicit ARM64AdvSIMDScalar() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &F);
+
+ const char *getPassName() const {
+ return "AdvSIMD scalar operation optimization";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+char ARM64AdvSIMDScalar::ID = 0;
+} // end anonymous namespace
+
+static bool isGPR64(unsigned Reg, unsigned SubReg,
+ const MachineRegisterInfo *MRI) {
+ if (SubReg)
+ return false;
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::GPR64RegClass);
+ return ARM64::GPR64RegClass.contains(Reg);
+}
+
+static bool isFPR64(unsigned Reg, unsigned SubReg,
+ const MachineRegisterInfo *MRI) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR64RegClass) &&
+ SubReg == 0) ||
+ (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR128RegClass) &&
+ SubReg == ARM64::dsub);
+ // Physical register references just check the register class directly.
+ return (ARM64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
+ (ARM64::FPR128RegClass.contains(Reg) && SubReg == ARM64::dsub);
+}
+
+// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
+// copy instruction. Return zero_reg if the instruction is not a copy.
+static unsigned getSrcFromCopy(const MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &SubReg) {
+ SubReg = 0;
+ // The "FMOV Xd, Dn" instruction is the typical form.
+ if (MI->getOpcode() == ARM64::FMOVDXr || MI->getOpcode() == ARM64::FMOVXDr)
+ return MI->getOperand(1).getReg();
+ // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
+ // these at this stage, but it's easy to check for.
+ if (MI->getOpcode() == ARM64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
+ SubReg = ARM64::dsub;
+ return MI->getOperand(1).getReg();
+ }
+ // Or just a plain COPY instruction. This can be directly to/from FPR64,
+ // or it can be a dsub subreg reference to an FPR128.
+ if (MI->getOpcode() == ARM64::COPY) {
+ if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
+ MRI) &&
+ isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
+ return MI->getOperand(1).getReg();
+ if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
+ MRI) &&
+ isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
+ MRI)) {
+ SubReg = ARM64::dsub;
+ return MI->getOperand(1).getReg();
+ }
+ }
+
+ // Otherwise, this is some other kind of instruction.
+ return 0;
+}
+
+// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
+// that we're considering transforming to, return that AdvSIMD opcode. For all
+// others, return the original opcode.
+static int getTransformOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ break;
+ // FIXME: Lots more possibilities.
+ case ARM64::ADDXrr:
+ return ARM64::ADDv1i64;
+ case ARM64::SUBXrr:
+ return ARM64::SUBv1i64;
+ }
+ // No AdvSIMD equivalent, so just return the original opcode.
+ return Opc;
+}
+
+static bool isTransformable(const MachineInstr *MI) {
+ int Opc = MI->getOpcode();
+ return Opc != getTransformOpcode(Opc);
+}
+
+// isProfitableToTransform - Predicate function to determine whether an
+// instruction should be transformed to its equivalent AdvSIMD scalar
+// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
+bool ARM64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
+ // If this instruction isn't eligible to be transformed (no SIMD equivalent),
+ // early exit since that's the common case.
+ if (!isTransformable(MI))
+ return false;
+
+ // Count the number of copies we'll need to add and approximate the number
+ // of copies that a transform will enable us to remove.
+ unsigned NumNewCopies = 3;
+ unsigned NumRemovableCopies = 0;
+
+ unsigned OrigSrc0 = MI->getOperand(1).getReg();
+ unsigned OrigSrc1 = MI->getOperand(2).getReg();
+ unsigned Src0 = 0, SubReg0;
+ unsigned Src1 = 0, SubReg1;
+ if (!MRI->def_empty(OrigSrc0)) {
+ MachineRegisterInfo::def_instr_iterator Def =
+ MRI->def_instr_begin(OrigSrc0);
+ assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+ Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+ // If the source was from a copy, we don't need to insert a new copy.
+ if (Src0)
+ --NumNewCopies;
+ // If there are no other users of the original source, we can delete
+ // that instruction.
+ if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0))
+ ++NumRemovableCopies;
+ }
+ if (!MRI->def_empty(OrigSrc1)) {
+ MachineRegisterInfo::def_instr_iterator Def =
+ MRI->def_instr_begin(OrigSrc1);
+ assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+ Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+ if (Src1)
+ --NumNewCopies;
+ // If there are no other users of the original source, we can delete
+ // that instruction.
+ if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1))
+ ++NumRemovableCopies;
+ }
+
+ // If any of the uses of the original instructions is a cross class copy,
+ // that's a copy that will be removable if we transform. Likewise, if
+ // any of the uses is a transformable instruction, it's likely the transforms
+ // will chain, enabling us to save a copy there, too. This is an aggressive
+ // heuristic that approximates the graph based cost analysis described above.
+ unsigned Dst = MI->getOperand(0).getReg();
+ bool AllUsesAreCopies = true;
+ for (MachineRegisterInfo::use_instr_nodbg_iterator
+ Use = MRI->use_instr_nodbg_begin(Dst),
+ E = MRI->use_instr_nodbg_end();
+ Use != E; ++Use) {
+ unsigned SubReg;
+ if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use))
+ ++NumRemovableCopies;
+ // If the use is an INSERT_SUBREG, that's still something that can
+ // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
+ // preferable to have it use the FPR64 in most cases, as if the source
+ // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
+ // Ditto for a lane insert.
+ else if (Use->getOpcode() == ARM64::INSERT_SUBREG ||
+ Use->getOpcode() == ARM64::INSvi64gpr)
+ ;
+ else
+ AllUsesAreCopies = false;
+ }
+ // If all of the uses of the original destination register are copies to
+ // FPR64, then we won't end up having a new copy back to GPR64 either.
+ if (AllUsesAreCopies)
+ --NumNewCopies;
+
+ // If a transform will not increase the number of cross-class copies required,
+ // return true.
+ if (NumNewCopies <= NumRemovableCopies)
+ return true;
+
+ // Finally, even if we otherwise wouldn't transform, check if we're forcing
+ // transformation of everything.
+ return TransformAll;
+}
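A simplified sketch of the counting heuristic above (hypothetical names, not the pass's real data structures): a transform starts out owing three new copies (two inputs plus the result); each input already fed by a cross-class copy cancels one new copy and, if that copy dies here, also counts as removable; copy-like uses of the result count as removable; and when every use is copy-like the copy back to GPR is not needed either. The transform is taken when removable copies at least balance the new ones, or when the arm64-simd-scalar-force-all flag is set:

    #include <cassert>
    #include <initializer_list>

    struct SrcInfo { bool fedByCrossClassCopy; bool copyDiesHere; };

    bool worthTransforming(SrcInfo Src0, SrcInfo Src1, unsigned CopyLikeUses,
                           bool AllUsesAreCopyLike, bool ForceAll) {
      unsigned NumNewCopies = 3, NumRemovableCopies = CopyLikeUses;
      for (SrcInfo Src : {Src0, Src1}) {
        if (Src.fedByCrossClassCopy) {
          --NumNewCopies;                // the FPR value already exists
          if (Src.copyDiesHere)
            ++NumRemovableCopies;        // and the old copy can be deleted
        }
      }
      if (AllUsesAreCopyLike)
        --NumNewCopies;                  // no copy back to GPR needed
      return NumNewCopies <= NumRemovableCopies || ForceAll;
    }

    int main() {
      // Both inputs come from dying cross-class copies and the result is only
      // consumed by copy-like users: clearly profitable.
      assert(worthTransforming({true, true}, {true, true}, 1, true, false));
      // Nothing to reuse and the result is needed in a GPR: not profitable.
      assert(!worthTransforming({false, false}, {false, false}, 0, false, false));
      return 0;
    }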
+
+static MachineInstr *insertCopy(const ARM64InstrInfo *TII, MachineInstr *MI,
+ unsigned Dst, unsigned Src, bool IsKill) {
+ MachineInstrBuilder MIB =
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(ARM64::COPY),
+ Dst)
+ .addReg(Src, getKillRegState(IsKill));
+ DEBUG(dbgs() << " adding copy: " << *MIB);
+ ++NumCopiesInserted;
+ return MIB;
+}
+
+// transformInstruction - Perform the transformation of an instruction
+// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
+// to be the correct register class, minimizing cross-class copies.
+void ARM64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
+ DEBUG(dbgs() << "Scalar transform: " << *MI);
+
+ MachineBasicBlock *MBB = MI->getParent();
+ int OldOpc = MI->getOpcode();
+ int NewOpc = getTransformOpcode(OldOpc);
+ assert(OldOpc != NewOpc && "transform an instruction to itself?!");
+
+ // Check if we need a copy for the source registers.
+ unsigned OrigSrc0 = MI->getOperand(1).getReg();
+ unsigned OrigSrc1 = MI->getOperand(2).getReg();
+ unsigned Src0 = 0, SubReg0;
+ unsigned Src1 = 0, SubReg1;
+ if (!MRI->def_empty(OrigSrc0)) {
+ MachineRegisterInfo::def_instr_iterator Def =
+ MRI->def_instr_begin(OrigSrc0);
+ assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+ Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+ // If there are no other users of the original source, we can delete
+ // that instruction.
+ if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) {
+ assert(Src0 && "Can't delete copy w/o a valid original source!");
+ Def->eraseFromParent();
+ ++NumCopiesDeleted;
+ }
+ }
+ if (!MRI->def_empty(OrigSrc1)) {
+ MachineRegisterInfo::def_instr_iterator Def =
+ MRI->def_instr_begin(OrigSrc1);
+ assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+ Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+ // If there are no other users of the original source, we can delete
+ // that instruction.
+ if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) {
+ assert(Src1 && "Can't delete copy w/o a valid original source!");
+ Def->eraseFromParent();
+ ++NumCopiesDeleted;
+ }
+ }
+ // If we weren't able to reference the original source directly, create a
+ // copy.
+ if (!Src0) {
+ SubReg0 = 0;
+ Src0 = MRI->createVirtualRegister(&ARM64::FPR64RegClass);
+ insertCopy(TII, MI, Src0, OrigSrc0, true);
+ }
+ if (!Src1) {
+ SubReg1 = 0;
+ Src1 = MRI->createVirtualRegister(&ARM64::FPR64RegClass);
+ insertCopy(TII, MI, Src1, OrigSrc1, true);
+ }
+
+ // Create a vreg for the destination.
+ // FIXME: No need to do this if the ultimate user expects an FPR64.
+ // Check for that and avoid the copy if possible.
+ unsigned Dst = MRI->createVirtualRegister(&ARM64::FPR64RegClass);
+
+ // For now, all of the new instructions have the same simple three-register
+ // form, so no need to special case based on what instruction we're
+ // building.
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst)
+ .addReg(Src0, getKillRegState(true), SubReg0)
+ .addReg(Src1, getKillRegState(true), SubReg1);
+
+ // Now copy the result back out to a GPR.
+ // FIXME: Try to avoid this if all uses could actually just use the FPR64
+ // directly.
+ insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true);
+
+ // Erase the old instruction.
+ MI->eraseFromParent();
+
+ ++NumScalarInsnsUsed;
+}
+
+// processMachineBasicBlock - Main optimization loop.
+bool ARM64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
+ MachineInstr *MI = I;
+ ++I;
+ if (isProfitableToTransform(MI)) {
+ transformInstruction(MI);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+// runOnMachineFunction - Pass entry point from PassManager.
+bool ARM64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
+ // Early exit if pass disabled.
+ if (!AdvSIMDScalar)
+ return false;
+
+ bool Changed = false;
+ DEBUG(dbgs() << "***** ARM64AdvSIMDScalar *****\n");
+
+ const TargetMachine &TM = mf.getTarget();
+ MRI = &mf.getRegInfo();
+ TII = static_cast<const ARM64InstrInfo *>(TM.getInstrInfo());
+
+ // Just check things on a one-block-at-a-time basis.
+ for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
+ if (processMachineBasicBlock(I))
+ Changed = true;
+ return Changed;
+}
+
+// createARM64AdvSIMDScalar - Factory function used by ARM64TargetMachine
+// to add the pass to the PassManager.
+FunctionPass *llvm::createARM64AdvSIMDScalar() {
+ return new ARM64AdvSIMDScalar();
+}
diff --git a/lib/Target/ARM64/ARM64AsmPrinter.cpp b/lib/Target/ARM64/ARM64AsmPrinter.cpp
new file mode 100644
index 0000000..d0aa6af
--- /dev/null
+++ b/lib/Target/ARM64/ARM64AsmPrinter.cpp
@@ -0,0 +1,563 @@
+//===-- ARM64AsmPrinter.cpp - ARM64 LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the ARM64 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "ARM64.h"
+#include "ARM64MachineFunctionInfo.h"
+#include "ARM64MCInstLower.h"
+#include "ARM64RegisterInfo.h"
+#include "InstPrinter/ARM64InstPrinter.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+namespace {
+
+class ARM64AsmPrinter : public AsmPrinter {
+ ARM64MCInstLower MCInstLowering;
+ StackMaps SM;
+
+public:
+ ARM64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+ : AsmPrinter(TM, Streamer), MCInstLowering(OutContext, *Mang, *this),
+ SM(*this), ARM64FI(NULL), LOHLabelCounter(0) {}
+
+ virtual const char *getPassName() const { return "ARM64 Assembly Printer"; }
+
+ /// \brief Wrapper for MCInstLowering.lowerOperand() for the
+ /// tblgen'erated pseudo lowering.
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
+ return MCInstLowering.lowerOperand(MO, MCOp);
+ }
+
+ void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+ const MachineInstr &MI);
+ void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+ const MachineInstr &MI);
+ /// \brief tblgen'erated driver function for lowering simple MI->MC
+ /// pseudo instructions.
+ bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+ const MachineInstr *MI);
+
+ void EmitInstruction(const MachineInstr *MI);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AsmPrinter::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) {
+ ARM64FI = F.getInfo<ARM64FunctionInfo>();
+ return AsmPrinter::runOnMachineFunction(F);
+ }
+
+private:
+ MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+ void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
+ bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
+ bool printAsmRegInClass(const MachineOperand &MO,
+ const TargetRegisterClass *RC, bool isVector,
+ raw_ostream &O);
+
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O);
+
+ void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+ void EmitFunctionBodyEnd();
+
+ MCSymbol *GetCPISymbol(unsigned CPID) const;
+ void EmitEndOfAsmFile(Module &M);
+ ARM64FunctionInfo *ARM64FI;
+
+ /// \brief Emit the LOHs contained in ARM64FI.
+ void EmitLOHs();
+
+ typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
+ MInstToMCSymbol LOHInstToLabel;
+ unsigned LOHLabelCounter;
+};
+
+} // end of anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+void ARM64AsmPrinter::EmitEndOfAsmFile(Module &M) {
+ // Funny Darwin hack: This flag tells the linker that no global symbols
+ // contain code that falls through to other global symbols (e.g. the obvious
+ // implementation of multiple entry points). If this doesn't occur, the
+ // linker can safely perform dead code stripping. Since LLVM never
+ // generates code that does this, it is always safe to set.
+ OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ SM.serializeToStackMapSection();
+}
+
+MachineLocation
+ARM64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
+ MachineLocation Location;
+ assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
+ // Frame address. Currently handles register +- offset only.
+ if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
+ Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
+ else {
+ DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
+ }
+ return Location;
+}
+
+void ARM64AsmPrinter::EmitLOHs() {
+ SmallVector<MCSymbol *, 3> MCArgs;
+
+ for (const auto &D : ARM64FI->getLOHContainer()) {
+ for (const MachineInstr *MI : D.getArgs()) {
+ MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI);
+ assert(LabelIt != LOHInstToLabel.end() &&
+ "Label hasn't been inserted for LOH related instruction");
+ MCArgs.push_back(LabelIt->second);
+ }
+ OutStreamer.EmitLOHDirective(D.getKind(), MCArgs);
+ MCArgs.clear();
+ }
+}
+
+void ARM64AsmPrinter::EmitFunctionBodyEnd() {
+ if (!ARM64FI->getLOHRelated().empty())
+ EmitLOHs();
+}
+
+/// GetCPISymbol - Return the symbol for the specified constant pool entry.
+MCSymbol *ARM64AsmPrinter::GetCPISymbol(unsigned CPID) const {
+ // Darwin uses a linker-private symbol name for constant-pools (to
+ // avoid addends on the relocation?), ELF has no such concept and
+ // uses a normal private symbol.
+ if (getDataLayout().getLinkerPrivateGlobalPrefix()[0])
+ return OutContext.GetOrCreateSymbol(
+ Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
+ Twine(getFunctionNumber()) + "_" + Twine(CPID));
+
+ return OutContext.GetOrCreateSymbol(
+ Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" +
+ Twine(getFunctionNumber()) + "_" + Twine(CPID));
+}
+
+void ARM64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ switch (MO.getType()) {
+ default:
+ assert(0 && "<unknown operand type>");
+ case MachineOperand::MO_Register: {
+ unsigned Reg = MO.getReg();
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(!MO.getSubReg() && "Subregs should be eliminated!");
+ O << ARM64InstPrinter::getRegisterName(Reg);
+ break;
+ }
+ case MachineOperand::MO_Immediate: {
+ int64_t Imm = MO.getImm();
+ O << '#' << Imm;
+ break;
+ }
+ }
+}
+
+bool ARM64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
+ raw_ostream &O) {
+ unsigned Reg = MO.getReg();
+ switch (Mode) {
+ default:
+ return true; // Unknown mode.
+ case 'w':
+ Reg = getWRegFromXReg(Reg);
+ break;
+ case 'x':
+ Reg = getXRegFromWReg(Reg);
+ break;
+ }
+
+ O << ARM64InstPrinter::getRegisterName(Reg);
+ return false;
+}
+
+// Prints the register in MO using class RC, using the encoding offset to map
+// it into the new register class. This should not be used for cross-class
+// printing.
+bool ARM64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
+ const TargetRegisterClass *RC,
+ bool isVector, raw_ostream &O) {
+ assert(MO.isReg() && "Should only get here with a register!");
+ const ARM64RegisterInfo *RI =
+ static_cast<const ARM64RegisterInfo *>(TM.getRegisterInfo());
+ unsigned Reg = MO.getReg();
+ unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
+ assert(RI->regsOverlap(RegToPrint, Reg));
+ O << ARM64InstPrinter::getRegisterName(
+ RegToPrint, isVector ? ARM64::vreg : ARM64::NoRegAltName);
+ return false;
+}
+
+bool ARM64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0)
+ return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default:
+ return true; // Unknown modifier.
+ case 'w': // Print W register
+ case 'x': // Print X register
+ if (MO.isReg())
+ return printAsmMRegister(MO, ExtraCode[0], O);
+ if (MO.isImm() && MO.getImm() == 0) {
+ unsigned Reg = ExtraCode[0] == 'w' ? ARM64::WZR : ARM64::XZR;
+ O << ARM64InstPrinter::getRegisterName(Reg);
+ return false;
+ }
+ printOperand(MI, OpNum, O);
+ return false;
+ case 'b': // Print B register.
+ case 'h': // Print H register.
+ case 's': // Print S register.
+ case 'd': // Print D register.
+ case 'q': // Print Q register.
+ if (MO.isReg()) {
+ const TargetRegisterClass *RC;
+ switch (ExtraCode[0]) {
+ case 'b':
+ RC = &ARM64::FPR8RegClass;
+ break;
+ case 'h':
+ RC = &ARM64::FPR16RegClass;
+ break;
+ case 's':
+ RC = &ARM64::FPR32RegClass;
+ break;
+ case 'd':
+ RC = &ARM64::FPR64RegClass;
+ break;
+ case 'q':
+ RC = &ARM64::FPR128RegClass;
+ break;
+ default:
+ return true;
+ }
+ return printAsmRegInClass(MO, RC, false /* vector */, O);
+ }
+ printOperand(MI, OpNum, O);
+ return false;
+ }
+ }
+
+ // According to ARM, we should emit x and v registers unless we have a
+ // modifier.
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+
+ // If this is a w or x register, print an x register.
+ if (ARM64::GPR32allRegClass.contains(Reg) ||
+ ARM64::GPR64allRegClass.contains(Reg))
+ return printAsmMRegister(MO, 'x', O);
+
+ // If this is a b, h, s, d, or q register, print it as a v register.
+ return printAsmRegInClass(MO, &ARM64::FPR128RegClass, true /* vector */, O);
+ }
+
+ printOperand(MI, OpNum, O);
+ return false;
+}
+
+bool ARM64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNum, unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier.
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ assert(MO.isReg() && "unexpected inline asm memory operand");
+ O << "[" << ARM64InstPrinter::getRegisterName(MO.getReg()) << "]";
+ return false;
+}
+
+void ARM64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+ raw_ostream &OS) {
+ unsigned NOps = MI->getNumOperands();
+ assert(NOps == 4);
+ OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+ // cast away const; the DI* classes do not take const operands for some reason.
+ DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps - 1).getMetadata()));
+ OS << V.getName();
+ OS << " <- ";
+ // Frame address. Currently handles register +- offset only.
+ assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+ OS << '[';
+ printOperand(MI, 0, OS);
+ OS << '+';
+ printOperand(MI, 1, OS);
+ OS << ']';
+ OS << "+";
+ printOperand(MI, NOps - 2, OS);
+}
+
+void ARM64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+ const MachineInstr &MI) {
+ unsigned NumNOPBytes = MI.getOperand(1).getImm();
+
+ SM.recordStackMap(MI);
+ // Emit padding.
+ assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+ for (unsigned i = 0; i < NumNOPBytes; i += 4)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0));
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>
+void ARM64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+ const MachineInstr &MI) {
+ SM.recordPatchPoint(MI);
+
+ PatchPointOpers Opers(&MI);
+
+ int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+ unsigned EncodedBytes = 0;
+ if (CallTarget) {
+ assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
+ "High 16 bits of call target should be zero.");
+ unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+ EncodedBytes = 16;
+ // Materialize the jump address:
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVZWi)
+ .addReg(ScratchReg)
+ .addImm((CallTarget >> 32) & 0xFFFF)
+ .addImm(32));
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm((CallTarget >> 16) & 0xFFFF)
+ .addImm(16));
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(CallTarget & 0xFFFF)
+ .addImm(0));
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::BLR).addReg(ScratchReg));
+ }
+ // Emit padding.
+ unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+ assert(NumBytes >= EncodedBytes &&
+ "Patchpoint can't request size less than the length of a call.");
+ assert((NumBytes - EncodedBytes) % 4 == 0 &&
+ "Invalid number of NOP bytes requested!");
+ for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
+ EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0));
+}
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
+#include "ARM64GenMCPseudoLowering.inc"
+
+static unsigned getRealIndexedOpcode(unsigned Opc) {
+ switch (Opc) {
+ case ARM64::LDRXpre_isel: return ARM64::LDRXpre;
+ case ARM64::LDRWpre_isel: return ARM64::LDRWpre;
+ case ARM64::LDRDpre_isel: return ARM64::LDRDpre;
+ case ARM64::LDRSpre_isel: return ARM64::LDRSpre;
+ case ARM64::LDRBBpre_isel: return ARM64::LDRBBpre;
+ case ARM64::LDRHHpre_isel: return ARM64::LDRHHpre;
+ case ARM64::LDRSBWpre_isel: return ARM64::LDRSBWpre;
+ case ARM64::LDRSBXpre_isel: return ARM64::LDRSBXpre;
+ case ARM64::LDRSHWpre_isel: return ARM64::LDRSHWpre;
+ case ARM64::LDRSHXpre_isel: return ARM64::LDRSHXpre;
+ case ARM64::LDRSWpre_isel: return ARM64::LDRSWpre;
+
+ case ARM64::LDRDpost_isel: return ARM64::LDRDpost;
+ case ARM64::LDRSpost_isel: return ARM64::LDRSpost;
+ case ARM64::LDRXpost_isel: return ARM64::LDRXpost;
+ case ARM64::LDRWpost_isel: return ARM64::LDRWpost;
+ case ARM64::LDRHHpost_isel: return ARM64::LDRHHpost;
+ case ARM64::LDRBBpost_isel: return ARM64::LDRBBpost;
+ case ARM64::LDRSWpost_isel: return ARM64::LDRSWpost;
+ case ARM64::LDRSHWpost_isel: return ARM64::LDRSHWpost;
+ case ARM64::LDRSHXpost_isel: return ARM64::LDRSHXpost;
+ case ARM64::LDRSBWpost_isel: return ARM64::LDRSBWpost;
+ case ARM64::LDRSBXpost_isel: return ARM64::LDRSBXpost;
+
+ case ARM64::STRXpre_isel: return ARM64::STRXpre;
+ case ARM64::STRWpre_isel: return ARM64::STRWpre;
+ case ARM64::STRHHpre_isel: return ARM64::STRHHpre;
+ case ARM64::STRBBpre_isel: return ARM64::STRBBpre;
+ case ARM64::STRDpre_isel: return ARM64::STRDpre;
+ case ARM64::STRSpre_isel: return ARM64::STRSpre;
+ }
+ llvm_unreachable("Unexpected indexed load/store opcode!");
+}
+
+void ARM64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ // Do any auto-generated pseudo lowerings.
+ if (emitPseudoExpansionLowering(OutStreamer, MI))
+ return;
+
+ if (ARM64FI->getLOHRelated().count(MI)) {
+ // Generate a label for LOH related instruction
+ MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++);
+ // Associate the instruction with the label
+ LOHInstToLabel[MI] = LOHLabel;
+ OutStreamer.EmitLabel(LOHLabel);
+ }
+
+ // Do any manual lowerings.
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case ARM64::DBG_VALUE: {
+ if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+ SmallString<128> TmpStr;
+ raw_svector_ostream OS(TmpStr);
+ PrintDebugValueComment(MI, OS);
+ OutStreamer.EmitRawText(StringRef(OS.str()));
+ }
+ return;
+ }
+ // Indexed loads and stores use a pseudo to handle complex operand
+ // tricks and writeback to the base register. We strip off the writeback
+ // operand and switch the opcode here. Post-indexed stores were handled by the
+ // tablegen'erated pseudos above. (The complex operand <--> simple
+ // operand isel is beyond tablegen's ability, so we do these manually).
+ case ARM64::LDRHHpre_isel:
+ case ARM64::LDRBBpre_isel:
+ case ARM64::LDRXpre_isel:
+ case ARM64::LDRWpre_isel:
+ case ARM64::LDRDpre_isel:
+ case ARM64::LDRSpre_isel:
+ case ARM64::LDRSBWpre_isel:
+ case ARM64::LDRSBXpre_isel:
+ case ARM64::LDRSHWpre_isel:
+ case ARM64::LDRSHXpre_isel:
+ case ARM64::LDRSWpre_isel:
+ case ARM64::LDRDpost_isel:
+ case ARM64::LDRSpost_isel:
+ case ARM64::LDRXpost_isel:
+ case ARM64::LDRWpost_isel:
+ case ARM64::LDRHHpost_isel:
+ case ARM64::LDRBBpost_isel:
+ case ARM64::LDRSWpost_isel:
+ case ARM64::LDRSHWpost_isel:
+ case ARM64::LDRSHXpost_isel:
+ case ARM64::LDRSBWpost_isel:
+ case ARM64::LDRSBXpost_isel: {
+ MCInst TmpInst;
+ // For loads, the writeback operand to be skipped is the second.
+ TmpInst.setOpcode(getRealIndexedOpcode(MI->getOpcode()));
+ TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+ TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(2).getReg()));
+ TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm()));
+ EmitToStreamer(OutStreamer, TmpInst);
+ return;
+ }
+ case ARM64::STRXpre_isel:
+ case ARM64::STRWpre_isel:
+ case ARM64::STRHHpre_isel:
+ case ARM64::STRBBpre_isel:
+ case ARM64::STRDpre_isel:
+ case ARM64::STRSpre_isel: {
+ MCInst TmpInst;
+ // For stores, the writeback operand to be skipped is the first.
+ TmpInst.setOpcode(getRealIndexedOpcode(MI->getOpcode()));
+ TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+ TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(2).getReg()));
+ TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm()));
+ EmitToStreamer(OutStreamer, TmpInst);
+ return;
+ }
+
+ // Tail calls use pseudo instructions so they have the proper code-gen
+ // attributes (isCall, isReturn, etc.). We lower them to the real
+ // instruction here.
+ case ARM64::TCRETURNri: {
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM64::BR);
+ TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+ EmitToStreamer(OutStreamer, TmpInst);
+ return;
+ }
+ case ARM64::TCRETURNdi: {
+ MCOperand Dest;
+ MCInstLowering.lowerOperand(MI->getOperand(0), Dest);
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM64::B);
+ TmpInst.addOperand(Dest);
+ EmitToStreamer(OutStreamer, TmpInst);
+ return;
+ }
+ case ARM64::TLSDESC_BLR: {
+ MCOperand Callee, Sym;
+ MCInstLowering.lowerOperand(MI->getOperand(0), Callee);
+ MCInstLowering.lowerOperand(MI->getOperand(1), Sym);
+
+ // First emit a relocation-annotation. This expands to no code, but requests
+ // that the following instruction get an R_AARCH64_TLSDESC_CALL relocation.
+ MCInst TLSDescCall;
+ TLSDescCall.setOpcode(ARM64::TLSDESCCALL);
+ TLSDescCall.addOperand(Sym);
+ EmitToStreamer(OutStreamer, TLSDescCall);
+
+ // Other than that it's just a normal indirect call to the function loaded
+ // from the descriptor.
+ MCInst BLR;
+ BLR.setOpcode(ARM64::BLR);
+ BLR.addOperand(Callee);
+ EmitToStreamer(OutStreamer, BLR);
+
+ return;
+ }
+
+ case TargetOpcode::STACKMAP:
+ return LowerSTACKMAP(OutStreamer, SM, *MI);
+
+ case TargetOpcode::PATCHPOINT:
+ return LowerPATCHPOINT(OutStreamer, SM, *MI);
+ }
+
+ // Finally, do the automated lowerings for everything else.
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeARM64AsmPrinter() {
+ RegisterAsmPrinter<ARM64AsmPrinter> X(TheARM64Target);
+}
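The patchpoint lowering above materializes the 48-bit call target with three 16-bit chunks (MOVZ at shift 32, then MOVK at shifts 16 and 0), counts 16 encoded bytes for the three moves plus the BLR, and fills the rest of the requested patchpoint size with 4-byte NOPs (HINT #0). The following is a minimal standalone sketch of that arithmetic; the example target and patchpoint size are assumptions for illustration, and the snippet is not part of the patch.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Sketch: split a 48-bit call target into the three 16-bit immediates used
    // by the MOVZ/MOVK sequence, then compute how many NOPs pad the patchpoint.
    int main() {
      uint64_t CallTarget = 0x0000123456789ABCULL; // assumed example target
      assert((CallTarget & 0xFFFFFFFFFFFFULL) == CallTarget && "must fit 48 bits");

      unsigned Hi  = (CallTarget >> 32) & 0xFFFF; // MOVZ chunk, shift 32
      unsigned Mid = (CallTarget >> 16) & 0xFFFF; // MOVK chunk, shift 16
      unsigned Lo  = CallTarget & 0xFFFF;         // MOVK chunk, shift 0
      unsigned EncodedBytes = 16;                 // three moves + BLR, 4 bytes each

      unsigned NumBytes = 32; // assumed <numBytes> operand of the patchpoint
      assert(NumBytes >= EncodedBytes && (NumBytes - EncodedBytes) % 4 == 0);
      unsigned NumNOPs = (NumBytes - EncodedBytes) / 4;

      std::printf("movz #%#x, movk #%#x, movk #%#x, blr, then %u nops\n",
                  Hi, Mid, Lo, NumNOPs);
      return 0;
    }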
diff --git a/lib/Target/ARM64/ARM64BranchRelaxation.cpp b/lib/Target/ARM64/ARM64BranchRelaxation.cpp
new file mode 100644
index 0000000..a9bbef5
--- /dev/null
+++ b/lib/Target/ARM64/ARM64BranchRelaxation.cpp
@@ -0,0 +1,505 @@
+//===-- ARM64BranchRelaxation.cpp - ARM64 branch relaxation ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm64-branch-relax"
+#include "ARM64.h"
+#include "ARM64InstrInfo.h"
+#include "ARM64MachineFunctionInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+static cl::opt<bool>
+BranchRelaxation("arm64-branch-relax", cl::Hidden, cl::init(true),
+ cl::desc("Relax out of range conditional branches"));
+
+static cl::opt<unsigned>
+TBZDisplacementBits("arm64-tbz-offset-bits", cl::Hidden, cl::init(14),
+ cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
+
+static cl::opt<unsigned>
+CBZDisplacementBits("arm64-cbz-offset-bits", cl::Hidden, cl::init(19),
+ cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
+
+static cl::opt<unsigned>
+BCCDisplacementBits("arm64-bcc-offset-bits", cl::Hidden, cl::init(19),
+ cl::desc("Restrict range of Bcc instructions (DEBUG)"));
+
+STATISTIC(NumSplit, "Number of basic blocks split");
+STATISTIC(NumRelaxed, "Number of conditional branches relaxed");
+
+namespace {
+class ARM64BranchRelaxation : public MachineFunctionPass {
+ /// BasicBlockInfo - Information about the offset and size of a single
+ /// basic block.
+ struct BasicBlockInfo {
+ /// Offset - Distance from the beginning of the function to the beginning
+ /// of this basic block.
+ ///
+ /// The offset is always aligned as required by the basic block.
+ unsigned Offset;
+
+ /// Size - Size of the basic block in bytes. If the block contains
+ /// inline assembly, this is a worst case estimate.
+ ///
+ /// The size does not include any alignment padding whether from the
+ /// beginning of the block, or from an aligned jump table at the end.
+ unsigned Size;
+
+ BasicBlockInfo() : Offset(0), Size(0) {}
+
+ /// Compute the offset immediately following this block. If LogAlign is
+ /// specified, return the offset the successor block will get if it has
+ /// this alignment.
+ unsigned postOffset(unsigned LogAlign = 0) const {
+ unsigned PO = Offset + Size;
+ unsigned Align = 1 << LogAlign;
+ return (PO + Align - 1) / Align * Align;
+ }
+ };
+
+ SmallVector<BasicBlockInfo, 16> BlockInfo;
+
+ MachineFunction *MF;
+ const ARM64InstrInfo *TII;
+
+ bool relaxBranchInstructions();
+ void scanFunction();
+ MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+ void adjustBlockOffsets(MachineBasicBlock *BB);
+ bool isBlockInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+ bool fixupConditionalBranch(MachineInstr *MI);
+ void computeBlockSize(MachineBasicBlock *MBB);
+ unsigned getInstrOffset(MachineInstr *MI) const;
+ void dumpBBs();
+ void verify();
+
+public:
+ static char ID;
+ ARM64BranchRelaxation() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "ARM64 branch relaxation pass";
+ }
+};
+char ARM64BranchRelaxation::ID = 0;
+}
+
+/// verify - check BBOffsets, BBSizes and block alignment
+void ARM64BranchRelaxation::verify() {
+#ifndef NDEBUG
+ unsigned PrevNum = MF->begin()->getNumber();
+ for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E;
+ ++MBBI) {
+ MachineBasicBlock *MBB = MBBI;
+ unsigned Align = MBB->getAlignment();
+ unsigned Num = MBB->getNumber();
+ assert(BlockInfo[Num].Offset % (1u << Align) == 0);
+ assert(!Num || BlockInfo[PrevNum].postOffset() <= BlockInfo[Num].Offset);
+ PrevNum = Num;
+ }
+#endif
+}
+
+/// print block size and offset information - debugging
+void ARM64BranchRelaxation::dumpBBs() {
+ for (auto &MBB: *MF) {
+ const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()];
+ dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset)
+ << format("size=%#x\n", BBI.Size);
+ }
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fall
+/// through into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+ // Get the next machine basic block in the function.
+ MachineFunction::iterator MBBI = MBB;
+ // Can't fall off end of function.
+ if (std::next(MBBI) == MBB->getParent()->end())
+ return false;
+
+ MachineBasicBlock *NextBB = std::next(MBBI);
+ for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+ E = MBB->succ_end();
+ I != E; ++I)
+ if (*I == NextBB)
+ return true;
+
+ return false;
+}
+
+/// scanFunction - Do the initial scan of the function, building up
+/// information about each block.
+void ARM64BranchRelaxation::scanFunction() {
+ BlockInfo.clear();
+ BlockInfo.resize(MF->getNumBlockIDs());
+
+ // First thing, compute the size of all basic blocks, and see if the function
+ // has any inline assembly in it. If so, we have to be conservative about
+ // alignment assumptions, as we don't know for sure the size of any
+ // instructions in the inline assembly.
+ for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
+ computeBlockSize(I);
+
+ // Compute block offsets.
+ adjustBlockOffsets(MF->begin());
+}
+
+/// computeBlockSize - Compute the size for MBB.
+/// This function updates BlockInfo directly.
+void ARM64BranchRelaxation::computeBlockSize(MachineBasicBlock *MBB) {
+ unsigned Size = 0;
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+ ++I)
+ Size += TII->GetInstSizeInBytes(I);
+ BlockInfo[MBB->getNumber()].Size = Size;
+}
+
+/// getInstrOffset - Return the current offset of the specified machine
+/// instruction from the start of the function. This offset changes as stuff is
+/// moved around inside the function.
+unsigned ARM64BranchRelaxation::getInstrOffset(MachineInstr *MI) const {
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // The offset is composed of two things: the sum of the sizes of all MBB's
+ // before this instruction's block, and the offset from the start of the block
+ // it is in.
+ unsigned Offset = BlockInfo[MBB->getNumber()].Offset;
+
+ // Sum instructions before MI in MBB.
+ for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+ assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+ Offset += TII->GetInstSizeInBytes(I);
+ }
+ return Offset;
+}
+
+void ARM64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock *Start) {
+ unsigned PrevNum = Start->getNumber();
+ MachineFunction::iterator MBBI = Start, E = MF->end();
+ for (++MBBI; MBBI != E; ++MBBI) {
+ MachineBasicBlock *MBB = MBBI;
+ unsigned Num = MBB->getNumber();
+ if (!Num) // block zero is never changed from offset zero.
+ continue;
+ // Get the offset at the end of the layout predecessor.
+ // Include the alignment of the current block.
+ unsigned LogAlign = MBBI->getAlignment();
+ BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(LogAlign);
+ PrevNum = Num;
+ }
+}
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch. Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+/// NOTE: Successor list of the original BB is out of date after this function,
+/// and must be updated by the caller! Other transforms follow using this
+/// utility function, so no point updating now rather than waiting.
+MachineBasicBlock *
+ARM64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) {
+ MachineBasicBlock *OrigBB = MI->getParent();
+
+ // Create a new MBB for the code after the OrigBB.
+ MachineBasicBlock *NewBB =
+ MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+ MachineFunction::iterator MBBI = OrigBB;
+ ++MBBI;
+ MF->insert(MBBI, NewBB);
+
+ // Splice the instructions starting with MI over to NewBB.
+ NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+ // Add an unconditional branch from OrigBB to NewBB.
+ // Note the new unconditional branch is not being recorded.
+ // There doesn't seem to be meaningful DebugInfo available; this doesn't
+ // correspond to anything in the source.
+ BuildMI(OrigBB, DebugLoc(), TII->get(ARM64::B)).addMBB(NewBB);
+
+ // Insert an entry into BlockInfo to align it properly with the block numbers.
+ BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+ // Figure out how large the OrigBB is. As the first half of the original
+ // block, it cannot contain a tablejump. The size includes
+ // the new jump we added. (It should be possible to do this without
+ // recounting everything, but it's very confusing, and this is rarely
+ // executed.)
+ computeBlockSize(OrigBB);
+
+ // Figure out how large the NewBB is. As the second half of the original
+ // block, it may contain a tablejump.
+ computeBlockSize(NewBB);
+
+ // All BBOffsets following these blocks must be modified.
+ adjustBlockOffsets(OrigBB);
+
+ ++NumSplit;
+
+ return NewBB;
+}
+
+/// isBlockInRange - Returns true if the distance between the specified MI
+/// and the specified BB can fit in MI's displacement field.
+bool ARM64BranchRelaxation::isBlockInRange(MachineInstr *MI,
+ MachineBasicBlock *DestBB,
+ unsigned Bits) {
+ unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2;
+ unsigned BrOffset = getInstrOffset(MI);
+ unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset;
+
+ DEBUG(dbgs() << "Branch to destination BB#" << DestBB->getNumber()
+ << " from BB#" << MI->getParent()->getNumber()
+ << " max delta=" << MaxOffs << " from " << getInstrOffset(MI)
+ << " to " << DestOffset << " offset "
+ << int(DestOffset - BrOffset) << "\t" << *MI);
+
+ // Branch before the Dest.
+ if (BrOffset <= DestOffset)
+ return (DestOffset - BrOffset <= MaxOffs);
+ return (BrOffset - DestOffset <= MaxOffs);
+}
+
+static bool isConditionalBranch(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return false;
+ case ARM64::TBZ:
+ case ARM64::TBNZ:
+ case ARM64::CBZW:
+ case ARM64::CBNZW:
+ case ARM64::CBZX:
+ case ARM64::CBNZX:
+ case ARM64::Bcc:
+ return true;
+ }
+}
+
+static MachineBasicBlock *getDestBlock(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default:
+ assert(0 && "unexpected opcode!");
+ case ARM64::TBZ:
+ case ARM64::TBNZ:
+ return MI->getOperand(2).getMBB();
+ case ARM64::CBZW:
+ case ARM64::CBNZW:
+ case ARM64::CBZX:
+ case ARM64::CBNZX:
+ case ARM64::Bcc:
+ return MI->getOperand(1).getMBB();
+ }
+}
+
+static unsigned getOppositeConditionOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ assert(0 && "unexpected opcode!");
+ case ARM64::TBNZ: return ARM64::TBZ;
+ case ARM64::TBZ: return ARM64::TBNZ;
+ case ARM64::CBNZW: return ARM64::CBZW;
+ case ARM64::CBNZX: return ARM64::CBZX;
+ case ARM64::CBZW: return ARM64::CBNZW;
+ case ARM64::CBZX: return ARM64::CBNZX;
+ case ARM64::Bcc: return ARM64::Bcc; // Condition is an operand for Bcc.
+ }
+}
+
+static unsigned getBranchDisplacementBits(unsigned Opc) {
+ switch (Opc) {
+ default:
+ assert(0 && "unexpected opcode!");
+ case ARM64::TBNZ:
+ case ARM64::TBZ:
+ return TBZDisplacementBits;
+ case ARM64::CBNZW:
+ case ARM64::CBZW:
+ case ARM64::CBNZX:
+ case ARM64::CBZX:
+ return CBZDisplacementBits;
+ case ARM64::Bcc:
+ return BCCDisplacementBits;
+ }
+}
+
+static inline void invertBccCondition(MachineInstr *MI) {
+ assert(MI->getOpcode() == ARM64::Bcc && "Unexpected opcode!");
+ ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(0).getImm();
+ CC = ARM64CC::getInvertedCondCode(CC);
+ MI->getOperand(0).setImm((int64_t)CC);
+}
+
+/// fixupConditionalBranch - Fix up a conditional branch whose destination is
+/// too far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
+ MachineBasicBlock *DestBB = getDestBlock(MI);
+
+ // Add an unconditional branch to the destination and invert the branch
+ // condition to jump over it:
+ // tbz L1
+ // =>
+ // tbnz L2
+ // b L1
+ // L2:
+
+ // If the branch is at the end of its MBB and that has a fall-through block,
+ // direct the updated conditional branch to the fall-through block. Otherwise,
+ // split the MBB before the next instruction.
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *BMI = &MBB->back();
+ bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+ if (BMI != MI) {
+ if (std::next(MachineBasicBlock::iterator(MI)) ==
+ std::prev(MBB->getLastNonDebugInstr()) &&
+ BMI->getOpcode() == ARM64::B) {
+ // Last MI in the BB is an unconditional branch. Can we simply invert the
+ // condition and swap destinations:
+ // beq L1
+ // b L2
+ // =>
+ // bne L2
+ // b L1
+ MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+ if (isBlockInRange(MI, NewDest,
+ getBranchDisplacementBits(MI->getOpcode()))) {
+ DEBUG(dbgs() << " Invert condition and swap its destination with "
+ << *BMI);
+ BMI->getOperand(0).setMBB(DestBB);
+ unsigned OpNum =
+ (MI->getOpcode() == ARM64::TBZ || MI->getOpcode() == ARM64::TBNZ)
+ ? 2
+ : 1;
+ MI->getOperand(OpNum).setMBB(NewDest);
+ MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode())));
+ if (MI->getOpcode() == ARM64::Bcc)
+ invertBccCondition(MI);
+ return true;
+ }
+ }
+ }
+
+ if (NeedSplit) {
+ // Analyze the branch so we know how to update the successor lists.
+ MachineBasicBlock *TBB, *FBB;
+ SmallVector<MachineOperand, 2> Cond;
+ TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false);
+
+ MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI);
+ // No need for the branch to the next block. We're adding an unconditional
+ // branch to the destination.
+ int delta = TII->GetInstSizeInBytes(&MBB->back());
+ BlockInfo[MBB->getNumber()].Size -= delta;
+ MBB->back().eraseFromParent();
+ // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below
+
+ // Update the successor lists according to the transformation to follow.
+ // Do it here since if there's no split, no update is needed.
+ MBB->replaceSuccessor(FBB, NewBB);
+ NewBB->addSuccessor(FBB);
+ }
+ MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
+
+ DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber()
+ << ", invert condition and change dest. to BB#"
+ << NextBB->getNumber() << "\n");
+
+ // Insert a new conditional branch and a new unconditional branch.
+ MachineInstrBuilder MIB = BuildMI(
+ MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode())))
+ .addOperand(MI->getOperand(0));
+ if (MI->getOpcode() == ARM64::TBZ || MI->getOpcode() == ARM64::TBNZ)
+ MIB.addOperand(MI->getOperand(1));
+ if (MI->getOpcode() == ARM64::Bcc)
+ invertBccCondition(MIB);
+ MIB.addMBB(NextBB);
+ BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BuildMI(MBB, DebugLoc(), TII->get(ARM64::B)).addMBB(DestBB);
+ BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+
+ // Remove the old conditional branch. It may or may not still be in MBB.
+ BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+ MI->eraseFromParent();
+
+ // Finally, keep the block offsets up to date.
+ adjustBlockOffsets(MBB);
+ return true;
+}
+
+bool ARM64BranchRelaxation::relaxBranchInstructions() {
+ bool Changed = false;
+ // Relaxing branches involves creating new basic blocks, so re-eval
+ // end() for termination.
+ for (auto &MBB : *MF) {
+ MachineInstr *MI = MBB.getFirstTerminator();
+ if (isConditionalBranch(MI->getOpcode()) &&
+ !isBlockInRange(MI, getDestBlock(MI),
+ getBranchDisplacementBits(MI->getOpcode()))) {
+ fixupConditionalBranch(MI);
+ ++NumRelaxed;
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+bool ARM64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+
+ // If the pass is disabled, just bail early.
+ if (!BranchRelaxation)
+ return false;
+
+ DEBUG(dbgs() << "***** ARM64BranchRelaxation *****\n");
+
+ TII = (const ARM64InstrInfo *)MF->getTarget().getInstrInfo();
+
+ // Renumber all of the machine basic blocks in the function, guaranteeing that
+ // the numbers agree with the position of the block in the function.
+ MF->RenumberBlocks();
+
+ // Do the initial scan of the function, building up information about the
+ // sizes of each block.
+ scanFunction();
+
+ DEBUG(dbgs() << " Basic blocks before relaxation\n");
+ DEBUG(dumpBBs());
+
+ bool MadeChange = false;
+ while (relaxBranchInstructions())
+ MadeChange = true;
+
+ // After a while, this might be made debug-only, but it is not expensive.
+ verify();
+
+ DEBUG(dbgs() << " Basic blocks after relaxation\n");
+ DEBUG(dbgs() << '\n'; dumpBBs());
+
+ BlockInfo.clear();
+
+ return MadeChange;
+}
+
+/// createARM64BranchRelaxation - returns an instance of the branch
+/// relaxation pass.
+FunctionPass *llvm::createARM64BranchRelaxation() {
+ return new ARM64BranchRelaxation();
+}
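The relaxation pass above considers a conditional branch in range when the byte distance to its target fits an N-bit signed, instruction-word-scaled displacement, i.e. at most ((1 << (N - 1)) - 1) << 2 bytes forward. Below is a small sketch evaluating that bound for the pass's default option values (14 bits for TB[N]Z, 19 bits for CB[N]Z and Bcc); the helper name is an illustration, not code from the patch.

    #include <cstdio>

    // Maximum forward reach, in bytes, of a branch with an N-bit signed,
    // word-scaled displacement field (mirrors isBlockInRange's MaxOffs).
    static unsigned maxForwardBytes(unsigned Bits) {
      return ((1u << (Bits - 1)) - 1) << 2;
    }

    int main() {
      std::printf("TB[N]Z (14 bits): %u bytes\n", maxForwardBytes(14)); // 32764
      std::printf("CB[N]Z (19 bits): %u bytes\n", maxForwardBytes(19)); // 1048572
      std::printf("Bcc    (19 bits): %u bytes\n", maxForwardBytes(19)); // 1048572
      return 0;
    }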
diff --git a/lib/Target/ARM64/ARM64CallingConv.h b/lib/Target/ARM64/ARM64CallingConv.h
new file mode 100644
index 0000000..0128236
--- /dev/null
+++ b/lib/Target/ARM64/ARM64CallingConv.h
@@ -0,0 +1,94 @@
+//=== ARM64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the ARM64 Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64CALLINGCONV_H
+#define ARM64CALLINGCONV_H
+
+#include "ARM64InstrInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+/// CC_ARM64_Custom_i1i8i16_Reg - customized handling of passing i1/i8/i16 via
+/// register. Here, ValVT can be i1/i8/i16 or i32 depending on whether the
+/// argument is already promoted and LocVT is i1/i8/i16. We only promote the
+/// argument to i32 if we are sure this argument will be passed in register.
+static bool CC_ARM64_Custom_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags,
+ CCState &State,
+ bool IsWebKitJS = false) {
+ static const uint16_t RegList1[] = { ARM64::W0, ARM64::W1, ARM64::W2,
+ ARM64::W3, ARM64::W4, ARM64::W5,
+ ARM64::W6, ARM64::W7 };
+ static const uint16_t RegList2[] = { ARM64::X0, ARM64::X1, ARM64::X2,
+ ARM64::X3, ARM64::X4, ARM64::X5,
+ ARM64::X6, ARM64::X7 };
+ static const uint16_t WebKitRegList1[] = { ARM64::W0 };
+ static const uint16_t WebKitRegList2[] = { ARM64::X0 };
+
+ const uint16_t *List1 = IsWebKitJS ? WebKitRegList1 : RegList1;
+ const uint16_t *List2 = IsWebKitJS ? WebKitRegList2 : RegList2;
+
+ if (unsigned Reg = State.AllocateReg(List1, List2, 8)) {
+ // Customized extra section for handling i1/i8/i16:
+ // We need to promote the argument to i32 if it is not done already.
+ if (ValVT != MVT::i32) {
+ if (ArgFlags.isSExt())
+ LocInfo = CCValAssign::SExt;
+ else if (ArgFlags.isZExt())
+ LocInfo = CCValAssign::ZExt;
+ else
+ LocInfo = CCValAssign::AExt;
+ ValVT = MVT::i32;
+ }
+ // Set LocVT to i32 as well if passing via register.
+ LocVT = MVT::i32;
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ return false;
+}
+
+/// CC_ARM64_WebKit_JS_i1i8i16_Reg - customized handling of passing i1/i8/i16
+/// via register. This behaves the same as CC_ARM64_Custom_i1i8i16_Reg, but only
+/// uses the first register.
+static bool CC_ARM64_WebKit_JS_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags,
+ CCState &State) {
+ return CC_ARM64_Custom_i1i8i16_Reg(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
+ State, true);
+}
+
+/// CC_ARM64_Custom_i1i8i16_Stack: customized handling of passing i1/i8/i16 on
+/// stack. Here, ValVT can be i1/i8/i16 or i32 depending on whether the argument
+/// is already promoted and LocVT is i1/i8/i16. If ValVT is already promoted,
+/// it will be truncated back to i1/i8/i16.
+static bool CC_ARM64_Custom_i1i8i16_Stack(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags,
+ CCState &State) {
+ unsigned Space = ((LocVT == MVT::i1 || LocVT == MVT::i8) ? 1 : 2);
+ unsigned Offset12 = State.AllocateStack(Space, Space);
+ ValVT = LocVT;
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset12, LocVT, LocInfo));
+ return true;
+}
+
+} // End llvm namespace
+
+#endif
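The custom register handler in this header promotes i1/i8/i16 arguments to i32 before assigning them a W register, picking the extension kind from the argument's sext/zext flags. A minimal sketch of just that decision follows; the enum and parameter names are placeholders standing in for CCValAssign::LocInfo and the argument flags, not LLVM's own declarations.

    #include <cstdio>

    // Placeholder for CCValAssign::LocInfo: how a small integer is widened to i32.
    enum Ext { SExt, ZExt, AExt };

    static Ext pickExtension(bool HasSExtAttr, bool HasZExtAttr) {
      if (HasSExtAttr) return SExt; // e.g. a 'signext i8' parameter
      if (HasZExtAttr) return ZExt; // e.g. a 'zeroext i16' parameter
      return AExt;                  // otherwise the upper bits are unspecified
    }

    int main() {
      std::printf("signext -> %d\n", pickExtension(true, false));
      std::printf("zeroext -> %d\n", pickExtension(false, true));
      std::printf("plain   -> %d\n", pickExtension(false, false));
      return 0;
    }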
diff --git a/lib/Target/ARM64/ARM64CallingConvention.td b/lib/Target/ARM64/ARM64CallingConvention.td
new file mode 100644
index 0000000..9ac888f
--- /dev/null
+++ b/lib/Target/ARM64/ARM64CallingConvention.td
@@ -0,0 +1,210 @@
+//===- ARM64CallingConv.td - Calling Conventions for ARM64 -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for ARM64 architecture.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfAlign - Match if the original alignment of the argument is Align
+class CCIfAlign<string Align, CCAction A> :
+ CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS64 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_ARM64_AAPCS : CallingConv<[
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+ // An SRet is passed in X8, not X0 like a normal pointer parameter.
+ CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+
+ // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
+ // up to eight each of GPR and FPR.
+ CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>,
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+ [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ // i128 is split to two i64s, we can't fit half to register X7.
+ CCIfType<[i64], CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6],
+ [X0, X1, X3, X5]>>>,
+
+ // i128 is split to two i64s, and its stack alignment is 16 bytes.
+ CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
+
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+ [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+ CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+ // If more than will fit in registers, pass them on the stack instead.
+ CCIfType<[i1, i8, i16], CCAssignToStack<8, 8>>,
+ CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+ CCAssignToStack<8, 8>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+]>;
+
+def RetCC_ARM64_AAPCS : CallingConv<[
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+ [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+ [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+ CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+]>;
+
+
+// Darwin uses a calling convention which differs in only two ways
+// from the standard one at this level:
+// + i128s (i.e. split i64s) don't need even registers.
+// + Stack slots are sized as needed rather than being at least 64-bit.
+def CC_ARM64_DarwinPCS : CallingConv<[
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+ // An SRet is passed in X8, not X0 like a normal pointer parameter.
+ CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+
+ // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
+ // up to eight each of GPR and FPR.
+ CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>,
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+ [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ // i128 is split to two i64s, we can't fit half to register X7.
+ CCIfType<[i64],
+ CCIfSplit<CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6],
+ [W0, W1, W2, W3, W4, W5, W6]>>>,
+ // i128 is split to two i64s, and its stack alignment is 16 bytes.
+ CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
+
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+ [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+ CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+ // If more than will fit in registers, pass them on the stack instead.
+ CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Stack">>,
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+ CCAssignToStack<8, 8>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+]>;
+
+def CC_ARM64_DarwinPCS_VarArg : CallingConv<[
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+ // Handle all scalar types as either i64 or f64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+ CCIfType<[f32], CCPromoteToType<f64>>,
+
+ // Everything is on the stack.
+ // i128 is split to two i64s, and its stack alignment is 16 bytes.
+ CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
+ CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+]>;
+
+// The WebKit_JS calling convention only passes the first argument (the callee)
+// in register and the remaining arguments on stack. We allow 32bit stack slots,
+// so that WebKit can write partial values in the stack and define the other
+// 32bit quantity as undef.
+def CC_ARM64_WebKit_JS : CallingConv<[
+ // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0).
+ CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_WebKit_JS_i1i8i16_Reg">>,
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>,
+
+ // Pass the remaining arguments on the stack instead.
+ CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>,
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def RetCC_ARM64_WebKit_JS : CallingConv<[
+ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+ [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+ [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+]>;
+
+// FIXME: LR is only callee-saved in the sense that *we* preserve it and are
+// presumably a callee to someone. External functions may not do so, but this
+// is currently safe since BL has LR as an implicit-def and what happens after a
+// tail call doesn't matter.
+//
+// It would be better to model its preservation semantics properly (create a
+// vreg on entry, use it in RET & tail call generation; make that vreg def if we
+// end up saving LR as part of a call frame). Watch this space...
+def CSR_ARM64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
+ X23, X24, X25, X26, X27, X28,
+ D8, D9, D10, D11,
+ D12, D13, D14, D15)>;
+
+// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
+// 'this' and the pointer return value are both passed in X0 in these cases,
+// this can be partially modelled by treating X0 as a callee-saved register;
+// only the resulting RegMask is used; the SaveList is ignored
+//
+// (For generic ARM 64-bit ABI code, clang will not generate constructors or
+// destructors with 'this' returns, so this RegMask will not be used in that
+// case)
+def CSR_ARM64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_ARM64_AAPCS, X0)>;
+
+// The function used by Darwin to obtain the address of a thread-local variable
+// guarantees more than a normal AAPCS function. x16 and x17 are used on the
+// fast path for calculation, but all other registers except X0 (argument/return)
+// and LR (it is a call, after all) are preserved.
+def CSR_ARM64_TLS_Darwin
+ : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17),
+ FP,
+ (sequence "Q%u", 0, 31))>;
+
+// The ELF stub used for TLS-descriptor access saves every feasible
+// register. Only X0 and LR are clobbered.
+def CSR_ARM64_TLS_ELF
+ : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP,
+ (sequence "Q%u", 0, 31))>;
+
+def CSR_ARM64_AllRegs
+ : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP,
+ (sequence "X%u", 0, 28), FP, LR, SP,
+ (sequence "B%u", 0, 31), (sequence "H%u", 0, 31),
+ (sequence "S%u", 0, 31), (sequence "D%u", 0, 31),
+ (sequence "Q%u", 0, 31))>;
+
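For arguments that overflow the eight GPR/FPR argument registers, the two conventions above differ in how they size stack slots: CC_ARM64_AAPCS gives every scalar an 8-byte, 8-aligned slot, while CC_ARM64_DarwinPCS sizes slots as needed (1 or 2 bytes for i1/i8/i16, 4 bytes for i32/f32). The sketch below walks an assumed list of stack-passed widths and prints the resulting offsets; the helper and the width table are illustrative, not part of the patch.

    #include <cstdio>

    // Stack-slot size for a scalar of the given bit width under the two
    // conventions, as read from the CallingConv definitions above.
    static unsigned slotSize(unsigned Bits, bool Darwin) {
      if (!Darwin)
        return Bits <= 64 ? 8 : 16;  // AAPCS: at least 64-bit slots
      if (Bits <= 8)  return 1;      // Darwin: i1/i8
      if (Bits == 16) return 2;      // Darwin: i16
      if (Bits == 32) return 4;      // Darwin: i32/f32
      return Bits <= 64 ? 8 : 16;    // Darwin: i64/f64 and wider
    }

    int main() {
      const unsigned Args[] = {8, 16, 32, 64}; // assumed stack-passed widths
      for (int Darwin = 0; Darwin <= 1; ++Darwin) {
        unsigned Offset = 0;
        for (unsigned Bits : Args) {
          unsigned Size = slotSize(Bits, Darwin);
          Offset = (Offset + Size - 1) / Size * Size; // align to the slot size
          std::printf("%s: i%-2u at offset %u\n",
                      Darwin ? "Darwin" : "AAPCS ", Bits, Offset);
          Offset += Size;
        }
      }
      return 0;
    }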
diff --git a/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp b/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp
new file mode 100644
index 0000000..e3f8248
--- /dev/null
+++ b/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp
@@ -0,0 +1,147 @@
+//===-- ARM64CleanupLocalDynamicTLSPass.cpp -----------------------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Local-dynamic access to thread-local variables proceeds in three stages.
+//
+// 1. The offset of this Module's thread-local area from TPIDR_EL0 is calculated
+// in much the same way as a general-dynamic TLS-descriptor access against
+// the special symbol _TLS_MODULE_BASE_.
+// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using
+// instructions with "dtprel" modifiers.
+// 3. These two are added, together with TPIDR_EL0, to obtain the variable's
+// true address.
+//
+// This is only better than general-dynamic access to the variable if two or
+// more of the first stage TLS-descriptor calculations can be combined. This
+// pass looks through a function and performs such combinations.
+//
+//===----------------------------------------------------------------------===//
+#include "ARM64.h"
+#include "ARM64InstrInfo.h"
+#include "ARM64MachineFunctionInfo.h"
+#include "ARM64TargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+namespace {
+struct LDTLSCleanup : public MachineFunctionPass {
+ static char ID;
+ LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
+ if (AFI->getNumLocalDynamicTLSAccesses() < 2) {
+ // No point folding accesses if there aren't at least two.
+ return false;
+ }
+
+ MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+ return VisitNode(DT->getRootNode(), 0);
+ }
+
+ // Visit the dominator subtree rooted at Node in pre-order.
+ // If TLSBaseAddrReg is non-null, then use that to replace any
+ // TLS_base_addr instructions. Otherwise, create the register
+ // when the first such instruction is seen, and then use it
+ // as we encounter more instructions.
+ bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+ MachineBasicBlock *BB = Node->getBlock();
+ bool Changed = false;
+
+ // Traverse the current block.
+ for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+ ++I) {
+ switch (I->getOpcode()) {
+ case ARM64::TLSDESC_BLR:
+ // Make sure it's a local dynamic access.
+ if (!I->getOperand(1).isSymbol() ||
+ strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_"))
+ break;
+
+ if (TLSBaseAddrReg)
+ I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+ else
+ I = setRegister(I, &TLSBaseAddrReg);
+ Changed = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Visit the children of this block in the dominator tree.
+ for (MachineDomTreeNode *N : *Node) {
+ Changed |= VisitNode(N, TLSBaseAddrReg);
+ }
+
+ return Changed;
+ }
+
+ // Replace the TLS_base_addr instruction I with a copy from
+ // TLSBaseAddrReg, returning the new instruction.
+ MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I,
+ unsigned TLSBaseAddrReg) {
+ MachineFunction *MF = I->getParent()->getParent();
+ const ARM64TargetMachine *TM =
+ static_cast<const ARM64TargetMachine *>(&MF->getTarget());
+ const ARM64InstrInfo *TII = TM->getInstrInfo();
+
+ // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
+ // code sequence assumes the address will be.
+ MachineInstr *Copy =
+ BuildMI(*I->getParent(), I, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), ARM64::X0).addReg(TLSBaseAddrReg);
+
+ // Erase the TLS_base_addr instruction.
+ I->eraseFromParent();
+
+ return Copy;
+ }
+
+ // Create a virtual register in *TLSBaseAddrReg, and populate it by
+ // inserting a copy instruction after I. Returns the new instruction.
+ MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+ MachineFunction *MF = I->getParent()->getParent();
+ const ARM64TargetMachine *TM =
+ static_cast<const ARM64TargetMachine *>(&MF->getTarget());
+ const ARM64InstrInfo *TII = TM->getInstrInfo();
+
+ // Create a virtual register for the TLS base address.
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ *TLSBaseAddrReg = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
+
+ // Insert a copy from X0 to TLSBaseAddrReg for later.
+ MachineInstr *Next = I->getNextNode();
+ MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY),
+ *TLSBaseAddrReg).addReg(ARM64::X0);
+
+ return Copy;
+ }
+
+ virtual const char *getPassName() const {
+ return "Local Dynamic TLS Access Clean-up";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass *llvm::createARM64CleanupLocalDynamicTLSPass() {
+ return new LDTLSCleanup();
+}
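The cleanup pass above only fires when a function performs at least two local-dynamic TLS accesses: the module base produced by the first TLSDESC_BLR is kept in a virtual register, and later TLSDESC_BLR calls become plain copies of it. A source-level sketch of code that would hit this path follows, assuming it is compiled with a local-dynamic TLS model (for instance via -ftls-model=local-dynamic; the flag usage and variable names are illustrative assumptions).

    // Two accesses to internal-linkage thread-local variables in one function:
    // both need the module's TLS base, so the second descriptor call is
    // replaceable by a copy of the base computed for the first.
    static thread_local int Counter;
    static thread_local int LastValue;

    int record(int V) {
      LastValue = V;    // first access: computes the TLS module base
      return ++Counter; // second access: can reuse the same base
    }

    int main() { return record(3) == 1 ? 0 : 1; }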
diff --git a/lib/Target/ARM64/ARM64CollectLOH.cpp b/lib/Target/ARM64/ARM64CollectLOH.cpp
new file mode 100644
index 0000000..f52778f
--- /dev/null
+++ b/lib/Target/ARM64/ARM64CollectLOH.cpp
@@ -0,0 +1,1157 @@
+//===-------------- ARM64CollectLOH.cpp - ARM64 collect LOH pass --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that collects the Linker Optimization Hints (LOH).
+// This pass should be run at the very end of the compilation flow, just before
+// assembly printer.
+// To be useful for the linker, the LOH must be printed into the assembly file.
+//
+// A LOH describes a sequence of instructions that may be optimized by the
+// linker.
+// This same sequence cannot be optimized by the compiler because some of
+// the information will only be known at link time.
+// For instance, consider the following sequence:
+// L1: adrp xA, sym@PAGE
+// L2: add xB, xA, sym@PAGEOFF
+// L3: ldr xC, [xB, #imm]
+// This sequence can be turned into:
+// A literal load if sym@PAGE + sym@PAGEOFF + #imm - address(L3) is < 1MB:
+// L3: ldr xC, sym+#imm
+// It may also be turned into either the following more efficient
+// code sequences:
+// - If sym@PAGEOFF + #imm fits the encoding space of L3.
+// L1: adrp xA, sym@PAGE
+// L3: ldr xC, [xA, sym@PAGEOFF + #imm]
+// - If sym@PAGE + sym@PAGEOFF - address(L1) < 1MB:
+// L1: adr xA, sym
+// L3: ldr xC, [xA, #imm]
+//
+// To be valid a LOH must meet all the requirements needed by all the related
+// possible linker transformations.
+// For instance, using the running example, the constraints to emit
+// ".loh AdrpAddLdr" are:
+// - L1, L2, and L3 instructions are of the expected type, i.e.,
+// respectively ADRP, ADD (immediate), and LD.
+// - The result of L1 is used only by L2.
+// - The register argument (xA) used in the ADD instruction is defined
+// only by L1.
+// - The result of L2 is used only by L3.
+// - The base address (xB) in L3 is defined only by L2.
+// - The ADRP in L1 and the ADD in L2 must reference the same symbol using
+// @PAGE/@PAGEOFF with no additional constants
+//
+// Currently supported LOHs are:
+// * So called non-ADRP-related:
+// - .loh AdrpAddLdr L1, L2, L3:
+// L1: adrp xA, sym@PAGE
+// L2: add xB, xA, sym@PAGEOFF
+// L3: ldr xC, [xB, #imm]
+// - .loh AdrpLdrGotLdr L1, L2, L3:
+// L1: adrp xA, sym@GOTPAGE
+// L2: ldr xB, [xA, sym@GOTPAGEOFF]
+// L3: ldr xC, [xB, #imm]
+// - .loh AdrpLdr L1, L3:
+// L1: adrp xA, sym@PAGE
+// L3: ldr xC, [xA, sym@PAGEOFF]
+// - .loh AdrpAddStr L1, L2, L3:
+// L1: adrp xA, sym@PAGE
+// L2: add xB, xA, sym@PAGEOFF
+// L3: str xC, [xB, #imm]
+// - .loh AdrpLdrGotStr L1, L2, L3:
+// L1: adrp xA, sym@GOTPAGE
+// L2: ldr xB, [xA, sym@GOTPAGEOFF]
+// L3: str xC, [xB, #imm]
+// - .loh AdrpAdd L1, L2:
+// L1: adrp xA, sym@PAGE
+// L2: add xB, xA, sym@PAGEOFF
+// For all these LOHs, L1, L2, L3 form a simple chain:
+// L1 result is used only by L2 and L2 result by L3.
+// L3 LOH-related argument is defined only by L2 and L2 LOH-related argument
+// by L1.
+// All these LOHs aim at using more efficient load/store patterns by folding
+// some instructions used to compute the address directly into the load/store.
+//
+// * So called ADRP-related:
+// - .loh AdrpAdrp L2, L1:
+// L2: ADRP xA, sym1@PAGE
+// L1: ADRP xA, sym2@PAGE
+// L2 dominates L1 and xA is not redefined between L2 and L1
+// This LOH aims at getting rid of redundant ADRP instructions.
+//
+// The overall design for emitting the LOHs is:
+// 1. ARM64CollectLOH (this pass) records the LOHs in the ARM64FunctionInfo.
+// 2. ARM64AsmPrinter reads the LOHs from ARM64FunctionInfo and it:
+//     1. Associates a label with each of them.
+//     2. Emits them into an MCStreamer (EmitLOHDirective).
+// - The MCMachOStreamer records them into the MCAssembler.
+// - The MCAsmStreamer prints them.
+// - Other MCStreamers ignore them.
+// 3. Closes the MCStreamer:
+// - The MachObjectWriter gets them from the MCAssembler and writes
+// them in the object file.
+// - Other ObjectWriters ignore them.
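+//
+// As a purely illustrative sketch (not produced by this file; the label and
+// symbol names below are hypothetical), the assembly that ultimately reaches
+// the linker for an AdrpAddLdr hint looks roughly like:
+//     Lloh0:
+//       adrp x8, _sym@PAGE
+//     Lloh1:
+//       add  x8, x8, _sym@PAGEOFF
+//     Lloh2:
+//       ldr  x0, [x8]
+//       ...
+//       .loh AdrpAddLdr Lloh0, Lloh1, Lloh2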
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm64-collect-loh"
+#include "ARM64.h"
+#include "ARM64InstrInfo.h"
+#include "ARM64MachineFunctionInfo.h"
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+static cl::opt<bool>
+PreCollectRegister("arm64-collect-loh-pre-collect-register", cl::Hidden,
+                   cl::desc("Restrict analysis to registers involved"
+ " in LOHs"),
+ cl::init(true));
+
+static cl::opt<bool>
+BasicBlockScopeOnly("arm64-collect-loh-bb-only", cl::Hidden,
+ cl::desc("Restrict analysis at basic block scope"),
+ cl::init(true));
+
+STATISTIC(NumADRPSimpleCandidate,
+          "Number of simplifiable ADRP dominated by another");
+STATISTIC(NumADRPComplexCandidate2,
+ "Number of simplifiable ADRP reachable by 2 defs");
+STATISTIC(NumADRPComplexCandidate3,
+ "Number of simplifiable ADRP reachable by 3 defs");
+STATISTIC(NumADRPComplexCandidateOther,
+ "Number of simplifiable ADRP reachable by 4 or more defs");
+STATISTIC(NumADDToSTRWithImm,
+ "Number of simplifiable STR with imm reachable by ADD");
+STATISTIC(NumLDRToSTRWithImm,
+ "Number of simplifiable STR with imm reachable by LDR");
+STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD");
+STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR");
+STATISTIC(NumADDToLDRWithImm,
+ "Number of simplifiable LDR with imm reachable by ADD");
+STATISTIC(NumLDRToLDRWithImm,
+ "Number of simplifiable LDR with imm reachable by LDR");
+STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD");
+STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR");
+STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP");
+STATISTIC(NumCplxLvl1, "Number of complex case of level 1");
+STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1");
+STATISTIC(NumCplxLvl2, "Number of complex case of level 2");
+STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2");
+STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD");
+STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD");
+
+namespace llvm {
+void initializeARM64CollectLOHPass(PassRegistry &);
+}
+
+namespace {
+struct ARM64CollectLOH : public MachineFunctionPass {
+ static char ID;
+ ARM64CollectLOH() : MachineFunctionPass(ID) {
+ initializeARM64CollectLOHPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual const char *getPassName() const {
+ return "ARM64 Collect Linker Optimization Hint (LOH)";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineDominatorTree>();
+ }
+
+private:
+};
+
+/// A set of MachineInstruction.
+typedef SetVector<const MachineInstr *> SetOfMachineInstr;
+/// Map a basic block to a set of instructions per register.
+/// This is used to represent the exposed uses of a basic block
+/// per register.
+typedef MapVector<const MachineBasicBlock *, SetOfMachineInstr *>
+BlockToSetOfInstrsPerColor;
+/// Map a basic block to an instruction per register.
+/// This is used to represent the live-out definitions of a basic block
+/// per register.
+typedef MapVector<const MachineBasicBlock *, const MachineInstr **>
+BlockToInstrPerColor;
+/// Map an instruction to a set of instructions. Used to represent the
+/// mapping def to reachable uses or use to definitions.
+typedef MapVector<const MachineInstr *, SetOfMachineInstr> InstrToInstrs;
+/// Map a basic block to a BitVector.
+/// This is used to record the kill registers per basic block.
+typedef MapVector<const MachineBasicBlock *, BitVector> BlockToRegSet;
+
+/// Map a register to a dense id.
+typedef DenseMap<unsigned, unsigned> MapRegToId;
+/// Map a dense id to a register. Used for debug purposes.
+typedef SmallVector<unsigned, 32> MapIdToReg;
+} // end anonymous namespace.
+
+char ARM64CollectLOH::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ARM64CollectLOH, "arm64-collect-loh",
+ "ARM64 Collect Linker Optimization Hint (LOH)", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(ARM64CollectLOH, "arm64-collect-loh",
+ "ARM64 Collect Linker Optimization Hint (LOH)", false,
+ false)
+
+/// Given a couple (MBB, reg) get the corresponding set of instructions from
+/// the given "sets".
+/// If this couple does not reference any set, an empty set is added to "sets"
+/// for this couple and returned.
+/// \param nbRegs is used internally to allocate some memory. It must be
+/// consistent with the way sets is used.
+static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets,
+ const MachineBasicBlock *MBB, unsigned reg,
+ unsigned nbRegs) {
+ SetOfMachineInstr *result;
+ BlockToSetOfInstrsPerColor::iterator it = sets.find(MBB);
+ if (it != sets.end()) {
+ result = it->second;
+ } else {
+ result = sets[MBB] = new SetOfMachineInstr[nbRegs];
+ }
+
+ return result[reg];
+}
+
+/// Given a couple (reg, MI) get the corresponding set of instructions from
+/// the given "sets".
+/// This is used to get the uses recorded in sets for a definition identified
+/// by MI and reg, i.e., MI defines reg.
+/// If the couple does not reference anything, an empty set is added to
+/// "sets[reg]".
+/// \pre set[reg] is valid.
+static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg,
+ const MachineInstr *MI) {
+ return sets[reg][MI];
+}
+
+/// Same as getUses but does not modify the input map: sets.
+/// \return NULL if the couple (reg, MI) is not in sets.
+static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg,
+ const MachineInstr *MI) {
+ InstrToInstrs::const_iterator Res = sets[reg].find(MI);
+ if (Res != sets[reg].end())
+ return &(Res->second);
+ return NULL;
+}
+
+/// Initialize the reaching definition algorithm:
+/// For each basic block BB in MF, record:
+/// - its kill set.
+/// - its reachable uses (uses that are exposed to BB's predecessors).
+/// - its generated definitions.
+/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to
+/// the list of uses of exposed definitions.
+/// \param ADRPMode specifies to only consider ADRP instructions as generated
+/// definitions. It also considers definitions of ADRP instructions as uses and
+/// ignores other uses. ADRPMode is used to collect the information for LOHs
+/// that involve only ADRP operations.
+static void initReachingDef(MachineFunction *MF,
+ InstrToInstrs *ColorOpToReachedUses,
+ BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
+ BlockToSetOfInstrsPerColor &ReachableUses,
+ const MapRegToId &RegToId,
+ const MachineInstr *DummyOp, bool ADRPMode) {
+ const TargetMachine &TM = MF->getTarget();
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+
+ unsigned NbReg = RegToId.size();
+
+ for (MachineFunction::const_iterator IMBB = MF->begin(), IMBBEnd = MF->end();
+ IMBB != IMBBEnd; ++IMBB) {
+ const MachineBasicBlock *MBB = &(*IMBB);
+ const MachineInstr **&BBGen = Gen[MBB];
+ BBGen = new const MachineInstr *[NbReg];
+ memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg);
+
+ BitVector &BBKillSet = Kill[MBB];
+ BBKillSet.resize(NbReg);
+ for (MachineBasicBlock::const_iterator II = MBB->begin(), IEnd = MBB->end();
+ II != IEnd; ++II) {
+ bool IsADRP = II->getOpcode() == ARM64::ADRP;
+
+ // Process uses first.
+ if (IsADRP || !ADRPMode)
+ for (MachineInstr::const_mop_iterator IO = II->operands_begin(),
+ IOEnd = II->operands_end();
+ IO != IOEnd; ++IO) {
+ // Treat ADRP def as use, as the goal of the analysis is to find
+ // ADRP defs reached by other ADRP defs.
+ if (!IO->isReg() || (!ADRPMode && !IO->isUse()) ||
+ (ADRPMode && (!IsADRP || !IO->isDef())))
+ continue;
+ unsigned CurReg = IO->getReg();
+ MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
+ if (ItCurRegId == RegToId.end())
+ continue;
+ CurReg = ItCurRegId->second;
+
+ // if CurReg has not been defined, this use is reachable.
+ if (!BBGen[CurReg] && !BBKillSet.test(CurReg))
+ getSet(ReachableUses, MBB, CurReg, NbReg).insert(&(*II));
+ // current basic block definition for this color, if any, is in Gen.
+ if (BBGen[CurReg])
+ getUses(ColorOpToReachedUses, CurReg, BBGen[CurReg]).insert(&(*II));
+ }
+
+ // Process clobbers.
+ for (MachineInstr::const_mop_iterator IO = II->operands_begin(),
+ IOEnd = II->operands_end();
+ IO != IOEnd; ++IO) {
+ if (!IO->isRegMask())
+ continue;
+ // Clobbers kill the related colors.
+ const uint32_t *PreservedRegs = IO->getRegMask();
+
+ // Set generated regs.
+ for (const auto Entry : RegToId) {
+ unsigned Reg = Entry.second;
+ // Use the global register ID when querying APIs external to this
+ // pass.
+ if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) {
+            // Do not record the clobbered definition when not in ADRP mode.
+            // Such a definition is not used anyway (otherwise register
+            // allocation would be wrong).
+ BBGen[Reg] = ADRPMode ? II : NULL;
+ BBKillSet.set(Reg);
+ }
+ }
+ }
+
+ // Process defs
+ for (MachineInstr::const_mop_iterator IO = II->operands_begin(),
+ IOEnd = II->operands_end();
+ IO != IOEnd; ++IO) {
+ if (!IO->isReg() || !IO->isDef())
+ continue;
+ unsigned CurReg = IO->getReg();
+ MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
+ if (ItCurRegId == RegToId.end())
+ continue;
+
+ for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) {
+ MapRegToId::const_iterator ItRegId = RegToId.find(*AI);
+ assert(ItRegId != RegToId.end() &&
+ "Sub-register of an "
+ "involved register, not recorded as involved!");
+ BBKillSet.set(ItRegId->second);
+ BBGen[ItRegId->second] = &(*II);
+ }
+ BBGen[ItCurRegId->second] = &(*II);
+ }
+ }
+
+    // If we restrict our analysis to basic block scope, conservatively add a
+    // dummy use for each generated value.
+ if (!ADRPMode && DummyOp && !MBB->succ_empty())
+ for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg)
+ if (BBGen[CurReg])
+ getUses(ColorOpToReachedUses, CurReg, BBGen[CurReg]).insert(DummyOp);
+ }
+}
+
+/// Reaching def core algorithm:
+/// while an Out has changed
+/// for each bb
+/// for each color
+/// In[bb][color] = U Out[bb.predecessors][color]
+/// insert reachableUses[bb][color] in each in[bb][color]
+/// op.reachedUses
+///
+/// Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
+static void reachingDefAlgorithm(MachineFunction *MF,
+ InstrToInstrs *ColorOpToReachedUses,
+ BlockToSetOfInstrsPerColor &In,
+ BlockToSetOfInstrsPerColor &Out,
+ BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
+ BlockToSetOfInstrsPerColor &ReachableUses,
+ unsigned NbReg) {
+ bool HasChanged;
+ do {
+ HasChanged = false;
+ for (MachineFunction::const_iterator IMBB = MF->begin(),
+ IMBBEnd = MF->end();
+ IMBB != IMBBEnd; ++IMBB) {
+ const MachineBasicBlock *MBB = &(*IMBB);
+ unsigned CurReg;
+ for (CurReg = 0; CurReg < NbReg; ++CurReg) {
+ SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg);
+ SetOfMachineInstr &BBReachableUses =
+ getSet(ReachableUses, MBB, CurReg, NbReg);
+ SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg);
+ unsigned Size = BBOutSet.size();
+ // In[bb][color] = U Out[bb.predecessors][color]
+ for (MachineBasicBlock::const_pred_iterator
+ PredMBB = MBB->pred_begin(),
+ EndPredMBB = MBB->pred_end();
+ PredMBB != EndPredMBB; ++PredMBB) {
+ SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg);
+ BBInSet.insert(PredOutSet.begin(), PredOutSet.end());
+ }
+        // insert reachableUses[bb][color] in each in[bb][color] op.reachedUses
+ for (const MachineInstr *MI: BBInSet) {
+ SetOfMachineInstr &OpReachedUses =
+ getUses(ColorOpToReachedUses, CurReg, MI);
+ OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end());
+ }
+ // Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
+ if (!Kill[MBB].test(CurReg))
+ BBOutSet.insert(BBInSet.begin(), BBInSet.end());
+ if (Gen[MBB][CurReg])
+ BBOutSet.insert(Gen[MBB][CurReg]);
+ HasChanged |= BBOutSet.size() != Size;
+ }
+ }
+ } while (HasChanged);
+}
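+
+// As a small illustration of the equations above (a hypothetical example, not
+// derived from real code): for a straight-line CFG BB0 -> BB1 where BB0
+// defines color c with instruction D and BB1 uses c with instruction U without
+// redefining it, the fixed point gives Out[BB0][c] = {D} and In[BB1][c] = {D};
+// since U is an exposed use recorded in ReachableUses[BB1][c], the reached-uses
+// set of D ends up containing U.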
+
+/// Release all memory dynamically allocated during the reaching
+/// definition algorithm.
+static void finitReachingDef(BlockToSetOfInstrsPerColor &In,
+ BlockToSetOfInstrsPerColor &Out,
+ BlockToInstrPerColor &Gen,
+ BlockToSetOfInstrsPerColor &ReachableUses) {
+ for (BlockToSetOfInstrsPerColor::const_iterator IT = Out.begin(),
+ End = Out.end();
+ IT != End; ++IT)
+ delete[] IT->second;
+ for (BlockToSetOfInstrsPerColor::const_iterator IT = In.begin(),
+ End = In.end();
+ IT != End; ++IT)
+ delete[] IT->second;
+ for (BlockToSetOfInstrsPerColor::const_iterator IT = ReachableUses.begin(),
+ End = ReachableUses.end();
+ IT != End; ++IT)
+ delete[] IT->second;
+ for (BlockToInstrPerColor::const_iterator IT = Gen.begin(), End = Gen.end();
+ IT != End; ++IT)
+ delete[] IT->second;
+}
+
+/// Reaching definition algorithm.
+/// \param MF function on which the algorithm will operate.
+/// \param[out] ColorOpToReachedUses will contain the result of the reaching
+/// def algorithm.
+/// \param ADRPMode specifies whether the reaching def algorithm should be
+/// tuned for ADRP optimization. \see initReachingDef for more details.
+/// \param DummyOp if not NULL, the algorithm will work at
+/// basic block scope and will record, for every exposed definition, a use of
+/// @p DummyOp.
+/// \pre ColorOpToReachedUses is an array of at least number of registers of
+/// InstrToInstrs.
+static void reachingDef(MachineFunction *MF,
+ InstrToInstrs *ColorOpToReachedUses,
+ const MapRegToId &RegToId, bool ADRPMode = false,
+ const MachineInstr *DummyOp = NULL) {
+ // structures:
+ // For each basic block.
+ // Out: a set per color of definitions that reach the
+ // out boundary of this block.
+ // In: Same as Out but for in boundary.
+ // Gen: generated color in this block (one operation per color).
+ // Kill: register set of killed color in this block.
+ // ReachableUses: a set per color of uses (operation) reachable
+ // for "In" definitions.
+ BlockToSetOfInstrsPerColor Out, In, ReachableUses;
+ BlockToInstrPerColor Gen;
+ BlockToRegSet Kill;
+
+ // Initialize Gen, kill and reachableUses.
+ initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId,
+ DummyOp, ADRPMode);
+
+ // Algo.
+ if (!DummyOp)
+ reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill,
+ ReachableUses, RegToId.size());
+
+ // finit.
+ finitReachingDef(In, Out, Gen, ReachableUses);
+}
+
+#ifndef NDEBUG
+/// print the result of the reaching definition algorithm.
+static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses,
+ unsigned NbReg, const TargetRegisterInfo *TRI,
+ const MapIdToReg &IdToReg) {
+ unsigned CurReg;
+ for (CurReg = 0; CurReg < NbReg; ++CurReg) {
+ if (ColorOpToReachedUses[CurReg].empty())
+ continue;
+ DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n");
+
+ InstrToInstrs::const_iterator DefsIt = ColorOpToReachedUses[CurReg].begin();
+ InstrToInstrs::const_iterator DefsItEnd =
+ ColorOpToReachedUses[CurReg].end();
+ for (; DefsIt != DefsItEnd; ++DefsIt) {
+ DEBUG(dbgs() << "Def:\n");
+ DEBUG(DefsIt->first->print(dbgs()));
+ DEBUG(dbgs() << "Reachable uses:\n");
+ for (SetOfMachineInstr::const_iterator UsesIt = DefsIt->second.begin(),
+ UsesItEnd = DefsIt->second.end();
+ UsesIt != UsesItEnd; ++UsesIt) {
+ DEBUG((*UsesIt)->print(dbgs()));
+ }
+ }
+ }
+}
+#endif // NDEBUG
+
+/// Answer the following question: Can Def be one of the definitions
+/// involved in part of a LOH?
+static bool canDefBePartOfLOH(const MachineInstr *Def) {
+ unsigned Opc = Def->getOpcode();
+ // Accept ADRP, ADDLow and LOADGot.
+ switch (Opc) {
+ default:
+ return false;
+ case ARM64::ADRP:
+ return true;
+ case ARM64::ADDXri:
+ // Check immediate to see if the immediate is an address.
+ switch (Def->getOperand(2).getType()) {
+ default:
+ return false;
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_BlockAddress:
+ return true;
+ }
+ case ARM64::LDRXui:
+ // Check immediate to see if the immediate is an address.
+ switch (Def->getOperand(2).getType()) {
+ default:
+ return false;
+ case MachineOperand::MO_GlobalAddress:
+ return true;
+ }
+ }
+ // Unreachable.
+ return false;
+}
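+
+// For illustration (the operands here are hypothetical): the ADDXri case above
+// matches instructions such as "add xB, xA, sym@PAGEOFF" where operand 2 is a
+// global address, and the LDRXui case matches GOT loads such as
+// "ldr xB, [xA, sym@GOTPAGEOFF]".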
+
+/// Check whether the given instruction can be the end of a LOH chain involving
+/// a store.
+static bool isCandidateStore(const MachineInstr *Instr) {
+ switch (Instr->getOpcode()) {
+ default:
+ return false;
+ case ARM64::STRBui:
+ case ARM64::STRHui:
+ case ARM64::STRWui:
+ case ARM64::STRXui:
+ case ARM64::STRSui:
+ case ARM64::STRDui:
+ case ARM64::STRQui:
+ // In case we have str xA, [xA, #imm], this is two different uses
+ // of xA and we cannot fold, otherwise the xA stored may be wrong,
+ // even if #imm == 0.
+ if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg())
+ return true;
+ }
+ return false;
+}
+
+/// Given the result of a reaching definition algorithm in ColorOpToReachedUses,
+/// build the Use to Defs information and filter out obvious non-LOH candidates.
+/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions.
+/// In non-ADRPMode, non-LOH candidates are "uses" with several definitions,
+/// i.e., no simple chain.
+/// \param ADRPMode -- \see initReachingDef.
+static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs,
+ const InstrToInstrs *ColorOpToReachedUses,
+ const MapRegToId &RegToId,
+ bool ADRPMode = false) {
+
+ SetOfMachineInstr NotCandidate;
+ unsigned NbReg = RegToId.size();
+ MapRegToId::const_iterator EndIt = RegToId.end();
+ for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) {
+ // If this color is never defined, continue.
+ if (ColorOpToReachedUses[CurReg].empty())
+ continue;
+
+ InstrToInstrs::const_iterator DefsIt = ColorOpToReachedUses[CurReg].begin();
+ InstrToInstrs::const_iterator DefsItEnd =
+ ColorOpToReachedUses[CurReg].end();
+ for (; DefsIt != DefsItEnd; ++DefsIt) {
+ for (SetOfMachineInstr::const_iterator UsesIt = DefsIt->second.begin(),
+ UsesItEnd = DefsIt->second.end();
+ UsesIt != UsesItEnd; ++UsesIt) {
+ const MachineInstr *Def = DefsIt->first;
+ MapRegToId::const_iterator It;
+ // if all the reaching defs are not adrp, this use will not be
+ // simplifiable.
+ if ((ADRPMode && Def->getOpcode() != ARM64::ADRP) ||
+ (!ADRPMode && !canDefBePartOfLOH(Def)) ||
+ (!ADRPMode && isCandidateStore(*UsesIt) &&
+             // Stores are LOH candidates iff the end of the chain is used as
+             // the base register.
+ ((It = RegToId.find((*UsesIt)->getOperand(1).getReg())) == EndIt ||
+ It->second != CurReg))) {
+ NotCandidate.insert(*UsesIt);
+ continue;
+ }
+ // Do not consider self reaching as a simplifiable case for ADRP.
+ if (!ADRPMode || *UsesIt != DefsIt->first) {
+ UseToReachingDefs[*UsesIt].insert(DefsIt->first);
+          // If UsesIt has several reaching definitions, it is not a
+          // candidate for simplification in non-ADRPMode.
+ if (!ADRPMode && UseToReachingDefs[*UsesIt].size() > 1)
+ NotCandidate.insert(*UsesIt);
+ }
+ }
+ }
+ }
+ for (const MachineInstr *Elem : NotCandidate) {
+ DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n");
+ // It would have been better if we could just remove the entry
+ // from the map. Because of that, we have to filter the garbage
+    // (second.empty) in the subsequent analysis.
+ UseToReachingDefs[Elem].clear();
+ }
+}
+
+/// Based on the use to defs information (in ADRPMode), compute the
+/// ADRP-related LOH opportunities.
+static void computeADRP(const InstrToInstrs &UseToDefs,
+ ARM64FunctionInfo &ARM64FI,
+ const MachineDominatorTree *MDT) {
+ DEBUG(dbgs() << "*** Compute LOH for ADRP\n");
+ for (const auto &Entry: UseToDefs) {
+ unsigned Size = Entry.second.size();
+ if (Size == 0)
+ continue;
+ if (Size == 1) {
+ const MachineInstr *L2 = *Entry.second.begin();
+ const MachineInstr *L1 = Entry.first;
+ if (!MDT->dominates(L2, L1)) {
+ DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1
+ << '\n');
+ continue;
+ }
+ DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n');
+ SmallVector<const MachineInstr *, 2> Args;
+ Args.push_back(L2);
+ Args.push_back(L1);
+ ARM64FI.addLOHDirective(MCLOH_AdrpAdrp, Args);
+ ++NumADRPSimpleCandidate;
+ }
+#ifdef DEBUG
+ else if (Size == 2)
+ ++NumADRPComplexCandidate2;
+ else if (Size == 3)
+ ++NumADRPComplexCandidate3;
+ else
+ ++NumADRPComplexCandidateOther;
+#endif
+ // if Size < 1, the use should have been removed from the candidates
+ assert(Size >= 1 && "No reaching defs for that use!");
+ }
+}
+
+/// Check whether the given instruction can be the end of a LOH chain
+/// involving a load.
+static bool isCandidateLoad(const MachineInstr *Instr) {
+ switch (Instr->getOpcode()) {
+ default:
+ return false;
+ case ARM64::LDRSBWui:
+ case ARM64::LDRSBXui:
+ case ARM64::LDRSHWui:
+ case ARM64::LDRSHXui:
+ case ARM64::LDRSWui:
+ case ARM64::LDRBui:
+ case ARM64::LDRHui:
+ case ARM64::LDRWui:
+ case ARM64::LDRXui:
+ case ARM64::LDRSui:
+ case ARM64::LDRDui:
+ case ARM64::LDRQui:
+ if (Instr->getOperand(2).getTargetFlags() & ARM64II::MO_GOT)
+ return false;
+ return true;
+ }
+ // Unreachable.
+ return false;
+}
+
+/// Check whether the given instruction can load a literal.
+static bool supportLoadFromLiteral(const MachineInstr *Instr) {
+ switch (Instr->getOpcode()) {
+ default:
+ return false;
+ case ARM64::LDRSWui:
+ case ARM64::LDRWui:
+ case ARM64::LDRXui:
+ case ARM64::LDRSui:
+ case ARM64::LDRDui:
+ case ARM64::LDRQui:
+ return true;
+ }
+ // Unreachable.
+ return false;
+}
+
+/// Check whether the given instruction is a LOH candidate.
+/// \param UseToDefs is used to check that Instr is at the end of a supported
+/// LOH chain.
+/// \pre UseToDefs contains only one def per use, i.e., obvious non-candidates
+/// have already been filtered out.
+static bool isCandidate(const MachineInstr *Instr,
+ const InstrToInstrs &UseToDefs,
+ const MachineDominatorTree *MDT) {
+ if (!isCandidateLoad(Instr) && !isCandidateStore(Instr))
+ return false;
+
+ const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin();
+ if (Def->getOpcode() != ARM64::ADRP) {
+ // At this point, Def is ADDXri or LDRXui of the right type of
+ // symbol, because we filtered out the uses that were not defined
+ // by these kind of instructions (+ ADRP).
+
+ // Check if this forms a simple chain: each intermediate node must
+    // dominate the next one.
+ if (!MDT->dominates(Def, Instr))
+ return false;
+ // Move one node up in the simple chain.
+ if (UseToDefs.find(Def) == UseToDefs.end()
+ // The map may contain garbage we have to ignore.
+ ||
+ UseToDefs.find(Def)->second.empty())
+ return false;
+ Instr = Def;
+ Def = *UseToDefs.find(Def)->second.begin();
+ }
+ // Check if we reached the top of the simple chain:
+ // - top is ADRP.
+ // - check the simple chain property: each intermediate node must
+  //   dominate the next one.
+ if (Def->getOpcode() == ARM64::ADRP)
+ return MDT->dominates(Def, Instr);
+ return false;
+}
+
+static bool registerADRCandidate(const MachineInstr *Use,
+ const InstrToInstrs &UseToDefs,
+ const InstrToInstrs *DefsPerColorToUses,
+ ARM64FunctionInfo &ARM64FI,
+ SetOfMachineInstr *InvolvedInLOHs,
+ const MapRegToId &RegToId) {
+ // Look for opportunities to turn ADRP -> ADD or
+ // ADRP -> LDR GOTPAGEOFF into ADR.
+  // If the ADRP has more than one use, give up.
+ if (Use->getOpcode() != ARM64::ADDXri &&
+ (Use->getOpcode() != ARM64::LDRXui ||
+ !(Use->getOperand(2).getTargetFlags() & ARM64II::MO_GOT)))
+ return false;
+ InstrToInstrs::const_iterator It = UseToDefs.find(Use);
+ // The map may contain garbage that we need to ignore.
+ if (It == UseToDefs.end() || It->second.empty())
+ return false;
+ const MachineInstr *Def = *It->second.begin();
+ if (Def->getOpcode() != ARM64::ADRP)
+ return false;
+ // Check the number of users of ADRP.
+ const SetOfMachineInstr *Users =
+ getUses(DefsPerColorToUses,
+ RegToId.find(Def->getOperand(0).getReg())->second, Def);
+ if (Users->size() > 1) {
+ ++NumADRComplexCandidate;
+ return false;
+ }
+ ++NumADRSimpleCandidate;
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Def)) &&
+ "ADRP already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Use)) &&
+ "ADD already involved in LOH.");
+ DEBUG(dbgs() << "Record AdrpAdd\n" << *Def << '\n' << *Use << '\n');
+
+ SmallVector<const MachineInstr *, 2> Args;
+ Args.push_back(Def);
+ Args.push_back(Use);
+
+ ARM64FI.addLOHDirective(Use->getOpcode() == ARM64::ADDXri ? MCLOH_AdrpAdd
+ : MCLOH_AdrpLdrGot,
+ Args);
+ return true;
+}
+
+/// Based on the use to defs information (in non-ADRPMode), compute the
+/// non-ADRP-related LOH opportunities.
+static void computeOthers(const InstrToInstrs &UseToDefs,
+ const InstrToInstrs *DefsPerColorToUses,
+ ARM64FunctionInfo &ARM64FI, const MapRegToId &RegToId,
+ const MachineDominatorTree *MDT) {
+ SetOfMachineInstr *InvolvedInLOHs = NULL;
+#ifdef DEBUG
+ SetOfMachineInstr InvolvedInLOHsStorage;
+ InvolvedInLOHs = &InvolvedInLOHsStorage;
+#endif // DEBUG
+ DEBUG(dbgs() << "*** Compute LOH for Others\n");
+ // ADRP -> ADD/LDR -> LDR/STR pattern.
+ // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern.
+
+ // FIXME: When the statistics are not important,
+ // This initial filtering loop can be merged into the next loop.
+ // Currently, we didn't do it to have the same code for both DEBUG and
+ // NDEBUG builds. Indeed, the iterator of the second loop would need
+ // to be changed.
+ SetOfMachineInstr PotentialCandidates;
+ SetOfMachineInstr PotentialADROpportunities;
+ for (InstrToInstrs::const_iterator UseIt = UseToDefs.begin(),
+ EndUseIt = UseToDefs.end();
+ UseIt != EndUseIt; ++UseIt) {
+ // If no definition is available, this is a non candidate.
+ if (UseIt->second.empty())
+ continue;
+ // Keep only instructions that are load or store and at the end of
+ // a ADRP -> ADD/LDR/Nothing chain.
+ // We already filtered out the no-chain cases.
+ if (!isCandidate(UseIt->first, UseToDefs, MDT)) {
+ PotentialADROpportunities.insert(UseIt->first);
+ continue;
+ }
+ PotentialCandidates.insert(UseIt->first);
+ }
+
+ // Make the following distinctions for statistics as the linker does
+ // know how to decode instructions:
+  // - ADD/LDR/Nothing make three different patterns.
+  // - LDR/STR make two different patterns.
+  // Hence, 6 - 1 base patterns
+  // (because ADRP -> Nothing -> STR is not simplifiable).
+
+  // The linker is only able to apply simple semantics, i.e., if pattern A
+  // then do B.
+  // However, we want to see the opportunities we may miss if we were able to
+  // catch more complex cases.
+
+  // PotentialCandidates are the result of a chain ADRP -> ADD/LDR -> LDR/STR.
+  // A potential candidate becomes a candidate if its current immediate
+  // operand is zero and all nodes of the chain have only one user each.
+ SetOfMachineInstr::const_iterator CandidateIt, EndCandidateIt;
+#ifdef DEBUG
+ SetOfMachineInstr DefsOfPotentialCandidates;
+#endif
+ for (CandidateIt = PotentialCandidates.begin(),
+ EndCandidateIt = PotentialCandidates.end();
+ CandidateIt != EndCandidateIt; ++CandidateIt) {
+ const MachineInstr *Candidate = *CandidateIt;
+ // Get the definition of the candidate i.e., ADD or LDR.
+ const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin();
+ // Record the elements of the chain.
+ const MachineInstr *L1 = Def;
+ const MachineInstr *L2 = NULL;
+ unsigned ImmediateDefOpc = Def->getOpcode();
+ if (Def->getOpcode() != ARM64::ADRP) {
+ // Check the number of users of this node.
+ const SetOfMachineInstr *Users =
+ getUses(DefsPerColorToUses,
+ RegToId.find(Def->getOperand(0).getReg())->second, Def);
+ if (Users->size() > 1) {
+#ifdef DEBUG
+        // If all the uses of this def are in the potential candidates, this is
+        // a complex candidate of level 2.
+ SetOfMachineInstr::const_iterator UseIt = Users->begin();
+ SetOfMachineInstr::const_iterator EndUseIt = Users->end();
+ for (; UseIt != EndUseIt; ++UseIt) {
+ if (!PotentialCandidates.count(*UseIt)) {
+ ++NumTooCplxLvl2;
+ break;
+ }
+ }
+ if (UseIt == EndUseIt)
+ ++NumCplxLvl2;
+#endif // DEBUG
+ PotentialADROpportunities.insert(Def);
+ continue;
+ }
+ L2 = Def;
+ Def = *UseToDefs.find(Def)->second.begin();
+ L1 = Def;
+ } // else the element in the middle of the chain is nothing, thus
+ // Def already contains the first element of the chain.
+
+ // Check the number of users of the first node in the chain, i.e., ADRP
+ const SetOfMachineInstr *Users =
+ getUses(DefsPerColorToUses,
+ RegToId.find(Def->getOperand(0).getReg())->second, Def);
+ if (Users->size() > 1) {
+#ifdef DEBUG
+      // If all the uses of this def are in the defs of the potential
+      // candidates, this is a complex candidate of level 1.
+ if (DefsOfPotentialCandidates.empty()) {
+ // lazy init
+ DefsOfPotentialCandidates = PotentialCandidates;
+ for (const MachineInstr *Candidate : PotentialCandidates) {
+ if (!UseToDefs.find(Candidate)->second.empty())
+ DefsOfPotentialCandidates.insert(
+ *UseToDefs.find(Candidate)->second.begin());
+ }
+ }
+ bool Found = false;
+ for (auto &Use: *Users) {
+ if (!DefsOfPotentialCandidates.count(Use)) {
+ ++NumTooCplxLvl1;
+ Found = true;
+ break;
+ }
+ }
+ if (!Found)
+ ++NumCplxLvl1;
+#endif // DEBUG
+ continue;
+ }
+
+ bool IsL2Add = (ImmediateDefOpc == ARM64::ADDXri);
+    // If the chain is three instructions long and ldr is the second element,
+    // then this ldr must load from the GOT, otherwise this is not a correct
+    // chain.
+ if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != ARM64II::MO_GOT)
+ continue;
+ SmallVector<const MachineInstr *, 3> Args;
+ MCLOHType Kind;
+ if (isCandidateLoad(Candidate)) {
+ if (L2 == NULL) {
+ // At this point, the candidate LOH indicates that the ldr instruction
+        // may use a direct access to the symbol. There is no such encoding
+        // for byte and half-word loads.
+ if (!supportLoadFromLiteral(Candidate))
+ continue;
+
+ DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate
+ << '\n');
+ Kind = MCLOH_AdrpLdr;
+ Args.push_back(L1);
+ Args.push_back(Candidate);
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+ "L1 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+ "Candidate already involved in LOH.");
+ ++NumADRPToLDR;
+ } else {
+ DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot")
+ << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate
+ << '\n');
+
+ Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr;
+ Args.push_back(L1);
+ Args.push_back(L2);
+ Args.push_back(Candidate);
+
+ PotentialADROpportunities.remove(L2);
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+ "L1 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) &&
+ "L2 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+ "Candidate already involved in LOH.");
+#ifdef DEBUG
+ // get the immediate of the load
+ if (Candidate->getOperand(2).getImm() == 0)
+ if (ImmediateDefOpc == ARM64::ADDXri)
+ ++NumADDToLDR;
+ else
+ ++NumLDRToLDR;
+ else if (ImmediateDefOpc == ARM64::ADDXri)
+ ++NumADDToLDRWithImm;
+ else
+ ++NumLDRToLDRWithImm;
+#endif // DEBUG
+ }
+ } else {
+ if (ImmediateDefOpc == ARM64::ADRP)
+ continue;
+ else {
+
+ DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot")
+ << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate
+ << '\n');
+
+ Kind = IsL2Add ? MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr;
+ Args.push_back(L1);
+ Args.push_back(L2);
+ Args.push_back(Candidate);
+
+ PotentialADROpportunities.remove(L2);
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+ "L1 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) &&
+ "L2 already involved in LOH.");
+ assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+ "Candidate already involved in LOH.");
+#ifdef DEBUG
+ // get the immediate of the store
+ if (Candidate->getOperand(2).getImm() == 0)
+ if (ImmediateDefOpc == ARM64::ADDXri)
+ ++NumADDToSTR;
+ else
+ ++NumLDRToSTR;
+ else if (ImmediateDefOpc == ARM64::ADDXri)
+ ++NumADDToSTRWithImm;
+ else
+ ++NumLDRToSTRWithImm;
+#endif // DEBUG
+ }
+ }
+ ARM64FI.addLOHDirective(Kind, Args);
+ }
+
+  // Now that we have grabbed all the big patterns, check ADR opportunities.
+ for (const MachineInstr *Candidate: PotentialADROpportunities)
+ registerADRCandidate(Candidate, UseToDefs, DefsPerColorToUses, ARM64FI,
+ InvolvedInLOHs, RegToId);
+}
+
+/// Look for every register defined by potential LOH candidates.
+/// Map these registers to dense ids in @p RegToId and vice-versa in
+/// @p IdToReg. @p IdToReg is populated only in DEBUG mode.
+static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId,
+ MapIdToReg &IdToReg,
+ const TargetRegisterInfo *TRI) {
+ unsigned CurRegId = 0;
+ if (!PreCollectRegister) {
+ unsigned NbReg = TRI->getNumRegs();
+ for (; CurRegId < NbReg; ++CurRegId) {
+ RegToId[CurRegId] = CurRegId;
+ DEBUG(IdToReg.push_back(CurRegId));
+ DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches"));
+ }
+ return;
+ }
+
+ DEBUG(dbgs() << "** Collect Involved Register\n");
+ for (MachineFunction::const_iterator IMBB = MF.begin(), IMBBEnd = MF.end();
+ IMBB != IMBBEnd; ++IMBB)
+ for (MachineBasicBlock::const_iterator II = IMBB->begin(),
+ IEnd = IMBB->end();
+ II != IEnd; ++II) {
+
+ if (!canDefBePartOfLOH(II))
+ continue;
+
+ // Process defs
+ for (MachineInstr::const_mop_iterator IO = II->operands_begin(),
+ IOEnd = II->operands_end();
+ IO != IOEnd; ++IO) {
+ if (!IO->isReg() || !IO->isDef())
+ continue;
+ unsigned CurReg = IO->getReg();
+ for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI)
+ if (RegToId.find(*AI) == RegToId.end()) {
+ DEBUG(IdToReg.push_back(*AI);
+ assert(IdToReg[CurRegId] == *AI &&
+ "Reg index mismatches insertion index."));
+ RegToId[*AI] = CurRegId++;
+ DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n');
+ }
+ }
+ }
+}
+
+bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &Fn) {
+ const TargetMachine &TM = Fn.getTarget();
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+
+ MapRegToId RegToId;
+ MapIdToReg IdToReg;
+ ARM64FunctionInfo *ARM64FI = Fn.getInfo<ARM64FunctionInfo>();
+ assert(ARM64FI && "No MachineFunctionInfo for this function!");
+
+ DEBUG(dbgs() << "Looking for LOH in " << Fn.getName() << '\n');
+
+ collectInvolvedReg(Fn, RegToId, IdToReg, TRI);
+ if (RegToId.empty())
+ return false;
+
+ MachineInstr *DummyOp = NULL;
+ if (BasicBlockScopeOnly) {
+ const ARM64InstrInfo *TII =
+ static_cast<const ARM64InstrInfo *>(TM.getInstrInfo());
+ // For local analysis, create a dummy operation to record uses that are not
+ // local.
+ DummyOp = Fn.CreateMachineInstr(TII->get(ARM64::COPY), DebugLoc());
+ }
+
+ unsigned NbReg = RegToId.size();
+ bool Modified = false;
+
+ // Start with ADRP.
+ InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg];
+
+ // Compute the reaching def in ADRP mode, meaning ADRP definitions
+ // are first considered as uses.
+ reachingDef(&Fn, ColorOpToReachedUses, RegToId, true, DummyOp);
+ DEBUG(dbgs() << "ADRP reaching defs\n");
+ DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg));
+
+ // Translate the definition to uses map into a use to definitions map to ease
+ // statistic computation.
+ InstrToInstrs ADRPToReachingDefs;
+ reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true);
+
+ // Compute LOH for ADRP.
+ computeADRP(ADRPToReachingDefs, *ARM64FI, MDT);
+ delete[] ColorOpToReachedUses;
+
+ // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern.
+ ColorOpToReachedUses = new InstrToInstrs[NbReg];
+
+ // first perform a regular reaching def analysis.
+ reachingDef(&Fn, ColorOpToReachedUses, RegToId, false, DummyOp);
+ DEBUG(dbgs() << "All reaching defs\n");
+ DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg));
+
+ // Turn that into a use to defs to ease statistic computation.
+ InstrToInstrs UsesToReachingDefs;
+ reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false);
+
+ // Compute other than AdrpAdrp LOH.
+ computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *ARM64FI, RegToId,
+ MDT);
+ delete[] ColorOpToReachedUses;
+
+ if (BasicBlockScopeOnly)
+ Fn.DeleteMachineInstr(DummyOp);
+
+ return Modified;
+}
+
+/// createARM64CollectLOHPass - returns an instance of the pass that collects
+/// Linker Optimization Hints.
+FunctionPass *llvm::createARM64CollectLOHPass() {
+ return new ARM64CollectLOH();
+}
diff --git a/lib/Target/ARM64/ARM64ConditionalCompares.cpp b/lib/Target/ARM64/ARM64ConditionalCompares.cpp
new file mode 100644
index 0000000..b495afa
--- /dev/null
+++ b/lib/Target/ARM64/ARM64ConditionalCompares.cpp
@@ -0,0 +1,918 @@
+//===-- ARM64ConditionalCompares.cpp --- CCMP formation for ARM64 ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM64ConditionalCompares pass which reduces
+// branching and code size by using the conditional compare instructions CCMP,
+// CCMN, and FCMP.
+//
+// The CFG transformations for forming conditional compares are very similar to
+// if-conversion, and this pass should run immediately before the early
+// if-conversion pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm64-ccmp"
+#include "ARM64.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned> BlockInstrLimit(
+ "arm64-ccmp-limit", cl::init(30), cl::Hidden,
+ cl::desc("Maximum number of instructions per speculated block."));
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("arm64-stress-ccmp", cl::Hidden,
+ cl::desc("Turn all knobs to 11"));
+
+STATISTIC(NumConsidered, "Number of ccmps considered");
+STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)");
+STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)");
+STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)");
+STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)");
+STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)");
+STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)");
+STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)");
+STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)");
+STATISTIC(NumMultCPSRUses, "Number of ccmps rejected (CPSR used)");
+STATISTIC(NumUnknCPSRDefs, "Number of ccmps rejected (CPSR def unknown)");
+
+STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)");
+
+STATISTIC(NumConverted, "Number of ccmp instructions created");
+STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted");
+
+//===----------------------------------------------------------------------===//
+// SSACCmpConv
+//===----------------------------------------------------------------------===//
+//
+// The SSACCmpConv class performs ccmp-conversion on SSA form machine code
+// after determining if it is possible. The class contains no heuristics;
+// external code should be used to determine when ccmp-conversion is a good
+// idea.
+//
+// CCmp-formation works on a CFG representing chained conditions, typically
+// from C's short-circuit || and && operators:
+//
+// From: Head To: Head
+// / | CmpBB
+// / | / |
+// | CmpBB / |
+// | / | Tail |
+// | / | | |
+// Tail | | |
+// | | | |
+// ... ... ... ...
+//
+// The Head block is terminated by a br.cond instruction, and the CmpBB block
+// contains compare + br.cond. Tail must be a successor of both.
+//
+// The cmp-conversion turns the compare instruction in CmpBB into a conditional
+// compare, and merges CmpBB into Head, speculatively executing its
+// instructions. The ARM64 conditional compare instructions have an immediate
+// operand that specifies the NZCV flag values when the condition is false and
+// the compare isn't executed. This makes it possible to chain compares with
+// different condition codes.
+//
+// Example:
+//
+// if (a == 5 || b == 17)
+// foo();
+//
+// Head:
+// cmp w0, #5
+// b.eq Tail
+// CmpBB:
+// cmp w1, #17
+// b.eq Tail
+// ...
+// Tail:
+// bl _foo
+//
+// Becomes:
+//
+// Head:
+// cmp w0, #5
+// ccmp w1, #17, 4, ne ; 4 = nZcv
+// b.eq Tail
+// ...
+// Tail:
+// bl _foo
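+//
+// Worked through for clarity: the immediate 4 in the ccmp is the NZCV value
+// 0b0100 (nZcv, i.e. only Z set). When the 'ne' condition from the first cmp
+// does not hold (a == 5), the ccmp skips the compare and sets the flags to
+// nZcv, so the following b.eq is taken; when 'ne' holds, the real compare of
+// w1 against #17 produces the flags that decide the branch.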
+//
+// The ccmp condition code is the one that would cause the Head terminator to
+// branch to CmpBB.
+//
+// FIXME: It should also be possible to speculate a block on the critical edge
+// between Head and Tail, just like if-converting a diamond.
+//
+// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion).
+
+namespace {
+class SSACCmpConv {
+ MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+public:
+ /// The first block containing a conditional branch, dominating everything
+ /// else.
+ MachineBasicBlock *Head;
+
+  /// The block containing cmp+br.cond with a successor shared with Head.
+ MachineBasicBlock *CmpBB;
+
+ /// The common successor for Head and CmpBB.
+ MachineBasicBlock *Tail;
+
+ /// The compare instruction in CmpBB that can be converted to a ccmp.
+ MachineInstr *CmpMI;
+
+private:
+ /// The branch condition in Head as determined by AnalyzeBranch.
+ SmallVector<MachineOperand, 4> HeadCond;
+
+ /// The condition code that makes Head branch to CmpBB.
+ ARM64CC::CondCode HeadCmpBBCC;
+
+ /// The branch condition in CmpBB.
+ SmallVector<MachineOperand, 4> CmpBBCond;
+
+ /// The condition code that makes CmpBB branch to Tail.
+ ARM64CC::CondCode CmpBBTailCC;
+
+ /// Check if the Tail PHIs are trivially convertible.
+ bool trivialTailPHIs();
+
+ /// Remove CmpBB from the Tail PHIs.
+ void updateTailPHIs();
+
+ /// Check if an operand defining DstReg is dead.
+ bool isDeadDef(unsigned DstReg);
+
+ /// Find the compare instruction in MBB that controls the conditional branch.
+ /// Return NULL if a convertible instruction can't be found.
+ MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB);
+
+ /// Return true if all non-terminator instructions in MBB can be safely
+ /// speculated.
+ bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI);
+
+public:
+ /// runOnMachineFunction - Initialize per-function data structures.
+ void runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ }
+
+ /// If the sub-CFG headed by MBB can be cmp-converted, initialize the
+ /// internal state, and return true.
+ bool canConvert(MachineBasicBlock *MBB);
+
+  /// Cmp-convert the last block passed to canConvert(), assuming
+  /// it is possible. Add any erased blocks to RemovedBlocks.
+ void convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks);
+
+ /// Return the expected code size delta if the conversion into a
+ /// conditional compare is performed.
+ int expectedCodeSizeDelta() const;
+};
+} // end anonymous namespace
+
+// Check that all PHIs in Tail are selecting the same value from Head and CmpBB.
+// This means that no if-conversion is required when merging CmpBB into Head.
+bool SSACCmpConv::trivialTailPHIs() {
+ for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end();
+ I != E && I->isPHI(); ++I) {
+ unsigned HeadReg = 0, CmpBBReg = 0;
+ // PHI operands come in (VReg, MBB) pairs.
+ for (unsigned oi = 1, oe = I->getNumOperands(); oi != oe; oi += 2) {
+ MachineBasicBlock *MBB = I->getOperand(oi + 1).getMBB();
+ unsigned Reg = I->getOperand(oi).getReg();
+ if (MBB == Head) {
+ assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands");
+ HeadReg = Reg;
+ }
+ if (MBB == CmpBB) {
+ assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands");
+ CmpBBReg = Reg;
+ }
+ }
+ if (HeadReg != CmpBBReg)
+ return false;
+ }
+ return true;
+}
+
+// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply
+// removing the CmpBB operands. The Head operands will be identical.
+void SSACCmpConv::updateTailPHIs() {
+ for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end();
+ I != E && I->isPHI(); ++I) {
+ // I is a PHI. It can have multiple entries for CmpBB.
+ for (unsigned oi = I->getNumOperands(); oi > 2; oi -= 2) {
+ // PHI operands are (Reg, MBB) at (oi-2, oi-1).
+ if (I->getOperand(oi - 1).getMBB() == CmpBB) {
+ I->RemoveOperand(oi - 1);
+ I->RemoveOperand(oi - 2);
+ }
+ }
+ }
+}
+
+// This pass runs before the ARM64DeadRegisterDefinitions pass, so compares are
+// still writing virtual registers without any uses.
+bool SSACCmpConv::isDeadDef(unsigned DstReg) {
+ // Writes to the zero register are dead.
+ if (DstReg == ARM64::WZR || DstReg == ARM64::XZR)
+ return true;
+ if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+ return false;
+ // A virtual register def without any uses will be marked dead later, and
+ // eventually replaced by the zero register.
+ return MRI->use_nodbg_empty(DstReg);
+}
+
+// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
+// corresponding to TBB.
+// Return true if a CondCode could be determined, false otherwise.
+static bool parseCond(ArrayRef<MachineOperand> Cond, ARM64CC::CondCode &CC) {
+ // A normal br.cond simply has the condition code.
+ if (Cond[0].getImm() != -1) {
+ assert(Cond.size() == 1 && "Unknown Cond array format");
+ CC = (ARM64CC::CondCode)(int)Cond[0].getImm();
+ return true;
+ }
+  // For tbz and cbz instructions, the opcode is next.
+ switch (Cond[1].getImm()) {
+ default:
+ // This includes tbz / tbnz branches which can't be converted to
+ // ccmp + br.cond.
+ return false;
+ case ARM64::CBZW:
+ case ARM64::CBZX:
+ assert(Cond.size() == 3 && "Unknown Cond array format");
+ CC = ARM64CC::EQ;
+ return true;
+ case ARM64::CBNZW:
+ case ARM64::CBNZX:
+ assert(Cond.size() == 3 && "Unknown Cond array format");
+ CC = ARM64CC::NE;
+ return true;
+ }
+}
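+
+// For reference, the Cond layouts assumed by parseCond (as implied by the
+// asserts above; the exact shapes are produced by AnalyzeBranch):
+//   b.cond:    Cond = { CondCode }                 (Cond[0] != -1, size 1)
+//   cbz/cbnz:  Cond = { -1, Opcode, Register }     (size 3)
+//   tbz/tbnz:  Cond starts with { -1, Opcode, ... } and is rejected here.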
+
+MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
+ MachineBasicBlock::iterator I = MBB->getFirstTerminator();
+ if (I == MBB->end())
+ return 0;
+ // The terminator must be controlled by the flags.
+ if (!I->readsRegister(ARM64::CPSR)) {
+ switch (I->getOpcode()) {
+ case ARM64::CBZW:
+ case ARM64::CBZX:
+ case ARM64::CBNZW:
+ case ARM64::CBNZX:
+ // These can be converted into a ccmp against #0.
+ return I;
+ }
+ ++NumCmpTermRejs;
+ DEBUG(dbgs() << "Flags not used by terminator: " << *I);
+ return 0;
+ }
+
+ // Now find the instruction controlling the terminator.
+ for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
+ --I;
+ assert(!I->isTerminator() && "Spurious terminator");
+ switch (I->getOpcode()) {
+ // cmp is an alias for subs with a dead destination register.
+ case ARM64::SUBSWri:
+ case ARM64::SUBSXri:
+ // cmn is an alias for adds with a dead destination register.
+ case ARM64::ADDSWri:
+ case ARM64::ADDSXri:
+ // Check that the immediate operand is within range, ccmp wants a uimm5.
+ // Rd = SUBSri Rn, imm, shift
+ if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) {
+ DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
+ ++NumImmRangeRejs;
+ return 0;
+ }
+ // Fall through.
+ case ARM64::SUBSWrr:
+ case ARM64::SUBSXrr:
+ case ARM64::ADDSWrr:
+ case ARM64::ADDSXrr:
+ if (isDeadDef(I->getOperand(0).getReg()))
+ return I;
+ DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
+ ++NumLiveDstRejs;
+ return 0;
+ case ARM64::FCMPSrr:
+ case ARM64::FCMPDrr:
+ case ARM64::FCMPESrr:
+ case ARM64::FCMPEDrr:
+ return I;
+ }
+
+ // Check for flag reads and clobbers.
+ MIOperands::PhysRegInfo PRI =
+ MIOperands(I).analyzePhysReg(ARM64::CPSR, TRI);
+
+ if (PRI.Reads) {
+ // The ccmp doesn't produce exactly the same flags as the original
+ // compare, so reject the transform if there are uses of the flags
+ // besides the terminators.
+ DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
+ ++NumMultCPSRUses;
+ return 0;
+ }
+
+ if (PRI.Clobbers) {
+ DEBUG(dbgs() << "Not convertible compare: " << *I);
+ ++NumUnknCPSRDefs;
+ return 0;
+ }
+ }
+ DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
+ return 0;
+}
+
+/// Determine if all the instructions in MBB can safely
+/// be speculated. The terminators are not considered.
+///
+/// Only CmpMI is allowed to clobber the flags.
+///
+bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
+ const MachineInstr *CmpMI) {
+ // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
+ // get right.
+ if (!MBB->livein_empty()) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
+ return false;
+ }
+
+ unsigned InstrCount = 0;
+
+ // Check all instructions, except the terminators. It is assumed that
+ // terminators never have side effects or define any used register values.
+ for (MachineBasicBlock::iterator I = MBB->begin(),
+ E = MBB->getFirstTerminator();
+ I != E; ++I) {
+ if (I->isDebugValue())
+ continue;
+
+ if (++InstrCount > BlockInstrLimit && !Stress) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than "
+ << BlockInstrLimit << " instructions.\n");
+ return false;
+ }
+
+ // There shouldn't normally be any phis in a single-predecessor block.
+ if (I->isPHI()) {
+ DEBUG(dbgs() << "Can't hoist: " << *I);
+ return false;
+ }
+
+ // Don't speculate loads. Note that it may be possible and desirable to
+ // speculate GOT or constant pool loads that are guaranteed not to trap,
+ // but we don't support that for now.
+ if (I->mayLoad()) {
+ DEBUG(dbgs() << "Won't speculate load: " << *I);
+ return false;
+ }
+
+ // We never speculate stores, so an AA pointer isn't necessary.
+ bool DontMoveAcrossStore = true;
+ if (!I->isSafeToMove(TII, 0, DontMoveAcrossStore)) {
+ DEBUG(dbgs() << "Can't speculate: " << *I);
+ return false;
+ }
+
+    // Only CmpMI is allowed to clobber the flags.
+ if (&*I != CmpMI && I->modifiesRegister(ARM64::CPSR, TRI)) {
+ DEBUG(dbgs() << "Clobbers flags: " << *I);
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential
+/// candidate for cmp-conversion. Fill out the internal state.
+///
+bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
+ Head = MBB;
+ Tail = CmpBB = 0;
+
+ if (Head->succ_size() != 2)
+ return false;
+ MachineBasicBlock *Succ0 = Head->succ_begin()[0];
+ MachineBasicBlock *Succ1 = Head->succ_begin()[1];
+
+ // CmpBB can only have a single predecessor. Tail is allowed many.
+ if (Succ0->pred_size() != 1)
+ std::swap(Succ0, Succ1);
+
+ // Succ0 is our candidate for CmpBB.
+ if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2)
+ return false;
+
+ CmpBB = Succ0;
+ Tail = Succ1;
+
+ if (!CmpBB->isSuccessor(Tail))
+ return false;
+
+ // The CFG topology checks out.
+ DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#"
+ << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n');
+ ++NumConsidered;
+
+ // Tail is allowed to have many predecessors, but we can't handle PHIs yet.
+ //
+ // FIXME: Real PHIs could be if-converted as long as the CmpBB values are
+  // defined before the CmpBB cmp clobbers the flags. Alternatively, it should
+ // always be safe to sink the ccmp down to immediately before the CmpBB
+ // terminators.
+ if (!trivialTailPHIs()) {
+ DEBUG(dbgs() << "Can't handle phis in Tail.\n");
+ ++NumPhiRejs;
+ return false;
+ }
+
+ if (!Tail->livein_empty()) {
+ DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
+ ++NumPhysRejs;
+ return false;
+ }
+
+ // CmpBB should never have PHIs since Head is its only predecessor.
+ // FIXME: Clean them up if it happens.
+ if (!CmpBB->empty() && CmpBB->front().isPHI()) {
+ DEBUG(dbgs() << "Can't handle phis in CmpBB.\n");
+ ++NumPhi2Rejs;
+ return false;
+ }
+
+ if (!CmpBB->livein_empty()) {
+ DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n");
+ ++NumPhysRejs;
+ return false;
+ }
+
+ // The branch we're looking to eliminate must be analyzable.
+ HeadCond.clear();
+ MachineBasicBlock *TBB = 0, *FBB = 0;
+ if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) {
+ DEBUG(dbgs() << "Head branch not analyzable.\n");
+ ++NumHeadBranchRejs;
+ return false;
+ }
+
+ // This is weird, probably some sort of degenerate CFG, or an edge to a
+ // landing pad.
+ if (!TBB || HeadCond.empty()) {
+ DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
+ ++NumHeadBranchRejs;
+ return false;
+ }
+
+ if (!parseCond(HeadCond, HeadCmpBBCC)) {
+ DEBUG(dbgs() << "Unsupported branch type on Head\n");
+ ++NumHeadBranchRejs;
+ return false;
+ }
+
+ // Make sure the branch direction is right.
+ if (TBB != CmpBB) {
+ assert(TBB == Tail && "Unexpected TBB");
+ HeadCmpBBCC = ARM64CC::getInvertedCondCode(HeadCmpBBCC);
+ }
+
+ CmpBBCond.clear();
+ TBB = FBB = 0;
+ if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
+ DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
+ ++NumCmpBranchRejs;
+ return false;
+ }
+
+ if (!TBB || CmpBBCond.empty()) {
+ DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
+ ++NumCmpBranchRejs;
+ return false;
+ }
+
+ if (!parseCond(CmpBBCond, CmpBBTailCC)) {
+ DEBUG(dbgs() << "Unsupported branch type on CmpBB\n");
+ ++NumCmpBranchRejs;
+ return false;
+ }
+
+ if (TBB != Tail)
+ CmpBBTailCC = ARM64CC::getInvertedCondCode(CmpBBTailCC);
+
+ DEBUG(dbgs() << "Head->CmpBB on " << ARM64CC::getCondCodeName(HeadCmpBBCC)
+ << ", CmpBB->Tail on " << ARM64CC::getCondCodeName(CmpBBTailCC)
+ << '\n');
+
+ CmpMI = findConvertibleCompare(CmpBB);
+ if (!CmpMI)
+ return false;
+
+ if (!canSpeculateInstrs(CmpBB, CmpMI)) {
+ ++NumSpeculateRejs;
+ return false;
+ }
+ return true;
+}
+
+void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
+ DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#"
+ << Head->getNumber() << ":\n" << *CmpBB);
+
+ // All CmpBB instructions are moved into Head, and CmpBB is deleted.
+ // Update the CFG first.
+ updateTailPHIs();
+ Head->removeSuccessor(CmpBB);
+ CmpBB->removeSuccessor(Tail);
+ Head->transferSuccessorsAndUpdatePHIs(CmpBB);
+ DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc();
+ TII->RemoveBranch(*Head);
+
+  // If the Head terminator was one of the cbz / cbnz branches with a built-in
+  // compare, we need to insert an explicit compare instruction in its place.
+ if (HeadCond[0].getImm() == -1) {
+ ++NumCompBranches;
+ unsigned Opc = 0;
+ switch (HeadCond[1].getImm()) {
+ case ARM64::CBZW:
+ case ARM64::CBNZW:
+ Opc = ARM64::SUBSWri;
+ break;
+ case ARM64::CBZX:
+ case ARM64::CBNZX:
+ Opc = ARM64::SUBSXri;
+ break;
+ default:
+ llvm_unreachable("Cannot convert Head branch");
+ }
+ const MCInstrDesc &MCID = TII->get(Opc);
+ // Create a dummy virtual register for the SUBS def.
+ unsigned DestReg =
+ MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF));
+ // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
+ BuildMI(*Head, Head->end(), TermDL, MCID)
+ .addReg(DestReg, RegState::Define | RegState::Dead)
+ .addOperand(HeadCond[2])
+ .addImm(0)
+ .addImm(0);
+ // SUBS uses the GPR*sp register classes.
+ MRI->constrainRegClass(HeadCond[2].getReg(),
+ TII->getRegClass(MCID, 1, TRI, *MF));
+ }
+
+ Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end());
+
+ // Now replace CmpMI with a ccmp instruction that also considers the incoming
+ // flags.
+ unsigned Opc = 0;
+ unsigned FirstOp = 1; // First CmpMI operand to copy.
+ bool isZBranch = false; // CmpMI is a cbz/cbnz instruction.
+ switch (CmpMI->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown compare opcode");
+ case ARM64::SUBSWri: Opc = ARM64::CCMPWi; break;
+ case ARM64::SUBSWrr: Opc = ARM64::CCMPWr; break;
+ case ARM64::SUBSXri: Opc = ARM64::CCMPXi; break;
+ case ARM64::SUBSXrr: Opc = ARM64::CCMPXr; break;
+ case ARM64::ADDSWri: Opc = ARM64::CCMNWi; break;
+ case ARM64::ADDSWrr: Opc = ARM64::CCMNWr; break;
+ case ARM64::ADDSXri: Opc = ARM64::CCMNXi; break;
+ case ARM64::ADDSXrr: Opc = ARM64::CCMNXr; break;
+ case ARM64::FCMPSrr: Opc = ARM64::FCCMPSrr; FirstOp = 0; break;
+ case ARM64::FCMPDrr: Opc = ARM64::FCCMPDrr; FirstOp = 0; break;
+ case ARM64::FCMPESrr: Opc = ARM64::FCCMPESrr; FirstOp = 0; break;
+ case ARM64::FCMPEDrr: Opc = ARM64::FCCMPEDrr; FirstOp = 0; break;
+ case ARM64::CBZW:
+ case ARM64::CBNZW:
+ Opc = ARM64::CCMPWi;
+ FirstOp = 0;
+ isZBranch = true;
+ break;
+ case ARM64::CBZX:
+ case ARM64::CBNZX:
+ Opc = ARM64::CCMPXi;
+ FirstOp = 0;
+ isZBranch = true;
+ break;
+ }
+
+ // The ccmp instruction should set the flags according to the comparison when
+ // Head would have branched to CmpBB.
+ // The NZCV immediate operand should provide flags for the case where Head
+ // would have branched to Tail. These flags should cause the new Head
+ // terminator to branch to tail.
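+ //
+ // For example (illustrative only), suppose Head ends in "cmp w0, #0; b.eq CmpBB"
+ // (falling through to Tail) and CmpBB ends in "cmp w1, #0; b.eq Tail". Then
+ // HeadCmpBBCC and CmpBBTailCC are both EQ, and the merged Head becomes:
+ //   cmp  w0, #0
+ //   ccmp w1, #0, #4, eq   ; #4 = Z set, the flags that satisfy EQ
+ //   b.eq Tail
+ // If w0 == 0 the ccmp performs the second compare; otherwise it sets Z so the
+ // branch to Tail is still taken, matching the original CFG.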
+ unsigned NZCV = ARM64CC::getNZCVToSatisfyCondCode(CmpBBTailCC);
+ const MCInstrDesc &MCID = TII->get(Opc);
+ MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(),
+ TII->getRegClass(MCID, 0, TRI, *MF));
+ if (CmpMI->getOperand(FirstOp + 1).isReg())
+ MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
+ TII->getRegClass(MCID, 1, TRI, *MF));
+ MachineInstrBuilder MIB =
+ BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
+ .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn
+ if (isZBranch)
+ MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0
+ else
+ MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
+ MIB.addImm(NZCV).addImm(HeadCmpBBCC);
+
+ // If CmpMI was a terminator, we need a new conditional branch to replace it.
+ // This now becomes a Head terminator.
+ if (isZBranch) {
+ bool isNZ = CmpMI->getOpcode() == ARM64::CBNZW ||
+ CmpMI->getOpcode() == ARM64::CBNZX;
+ BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(ARM64::Bcc))
+ .addImm(isNZ ? ARM64CC::NE : ARM64CC::EQ)
+ .addOperand(CmpMI->getOperand(1)); // Branch target.
+ }
+ CmpMI->eraseFromParent();
+ Head->updateTerminator();
+
+ RemovedBlocks.push_back(CmpBB);
+ CmpBB->eraseFromParent();
+ DEBUG(dbgs() << "Result:\n" << *Head);
+ ++NumConverted;
+}
+
+int SSACCmpConv::expectedCodeSizeDelta() const {
+ int delta = 0;
+ // If the Head terminator was one of the cbz / cbnz branches with a built-in
+ // compare, it must be replaced by an explicit compare instruction plus a
+ // branch instruction, so the code grows by one instruction.
+ if (HeadCond[0].getImm() == -1) {
+ switch (HeadCond[1].getImm()) {
+ case ARM64::CBZW:
+ case ARM64::CBNZW:
+ case ARM64::CBZX:
+ case ARM64::CBNZX:
+ // Therefore delta += 1
+ delta = 1;
+ break;
+ default:
+ llvm_unreachable("Cannot convert Head branch");
+ }
+ }
+ // If the CmpBB terminator was one of the cbz / cbnz branches with a
+ // built-in compare, it is turned into a conditional compare plus a branch
+ // in Head, so no instruction is saved.
+ // Otherwise, we save the branch instruction.
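+ // For example (illustrative): if Head ends in a cbz (delta = 1 for the added
+ // explicit compare) and CmpBB ends in a subs + b.cc pair (delta -= 1 for the
+ // removed branch), the net delta is 0; with a plain cmp + b.cc in both blocks
+ // the delta is -1.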
+ switch (CmpMI->getOpcode()) {
+ default:
+ --delta;
+ break;
+ case ARM64::CBZW:
+ case ARM64::CBNZW:
+ case ARM64::CBZX:
+ case ARM64::CBNZX:
+ break;
+ }
+ return delta;
+}
+
+//===----------------------------------------------------------------------===//
+// ARM64ConditionalCompares Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class ARM64ConditionalCompares : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const MCSchedModel *SchedModel;
+ // Whether the function being processed has the MinSize (Oz) attribute.
+ bool MinSize;
+ MachineRegisterInfo *MRI;
+ MachineDominatorTree *DomTree;
+ MachineLoopInfo *Loops;
+ MachineTraceMetrics *Traces;
+ MachineTraceMetrics::Ensemble *MinInstr;
+ SSACCmpConv CmpConv;
+
+public:
+ static char ID;
+ ARM64ConditionalCompares() : MachineFunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const;
+ bool runOnMachineFunction(MachineFunction &MF);
+ const char *getPassName() const { return "ARM64 Conditional Compares"; }
+
+private:
+ bool tryConvert(MachineBasicBlock *);
+ void updateDomTree(ArrayRef<MachineBasicBlock *> Removed);
+ void updateLoops(ArrayRef<MachineBasicBlock *> Removed);
+ void invalidateTraces();
+ bool shouldConvert();
+};
+} // end anonymous namespace
+
+char ARM64ConditionalCompares::ID = 0;
+
+namespace llvm {
+void initializeARM64ConditionalComparesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS_BEGIN(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass",
+ false, false)
+
+FunctionPass *llvm::createARM64ConditionalCompares() {
+ return new ARM64ConditionalCompares();
+}
+
+void ARM64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<MachineTraceMetrics>();
+ AU.addPreserved<MachineTraceMetrics>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Update the dominator tree after if-conversion erased some blocks.
+void
+ARM64ConditionalCompares::updateDomTree(ArrayRef<MachineBasicBlock *> Removed) {
+ // convert() removes CmpBB which was previously dominated by Head.
+ // CmpBB children should be transferred to Head.
+ MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head);
+ for (unsigned i = 0, e = Removed.size(); i != e; ++i) {
+ MachineDomTreeNode *Node = DomTree->getNode(Removed[i]);
+ assert(Node != HeadNode && "Cannot erase the head node");
+ assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head");
+ while (Node->getNumChildren())
+ DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
+ DomTree->eraseNode(Removed[i]);
+ }
+}
+
+/// Update LoopInfo after if-conversion.
+void
+ARM64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) {
+ if (!Loops)
+ return;
+ for (unsigned i = 0, e = Removed.size(); i != e; ++i)
+ Loops->removeBlock(Removed[i]);
+}
+
+/// Invalidate MachineTraceMetrics before if-conversion.
+void ARM64ConditionalCompares::invalidateTraces() {
+ Traces->invalidate(CmpConv.Head);
+ Traces->invalidate(CmpConv.CmpBB);
+}
+
+/// Apply cost model and heuristics to the conversion set up in CmpConv.
+/// Return true if the conversion is a good idea.
+///
+bool ARM64ConditionalCompares::shouldConvert() {
+ // Stress testing mode disables all cost considerations.
+ if (Stress)
+ return true;
+ if (!MinInstr)
+ MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+ // Head dominates CmpBB, so it is always included in its trace.
+ MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB);
+
+ // If code size is the main concern, apply the code size heuristics first.
+ if (MinSize) {
+ int CodeSizeDelta = CmpConv.expectedCodeSizeDelta();
+ DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n');
+ // When minimizing code size, do the conversion whenever it shrinks the
+ // code, whatever the speed cost is.
+ if (CodeSizeDelta < 0)
+ return true;
+ if (CodeSizeDelta > 0) {
+ DEBUG(dbgs() << "Code size is increasing, give up on this one.\n");
+ return false;
+ }
+ // CodeSizeDelta == 0, continue with the regular heuristics
+ }
+
+ // Heuristic: The compare conversion delays the execution of the branch
+ // instruction because we must wait for the inputs to the second compare as
+ // well. The branch has no dependent instructions, but delaying it increases
+ // the cost of a misprediction.
+ //
+ // Set a limit on the delay we will accept.
+ unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4;
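+ // For example, with a (hypothetical) MispredictPenalty of 12 cycles, the
+ // accepted delay limit would be 9 cycles.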
+
+ // Instruction depths can be computed for all trace instructions above CmpBB.
+ unsigned HeadDepth =
+ Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth;
+ unsigned CmpBBDepth =
+ Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth;
+ DEBUG(dbgs() << "Head depth: " << HeadDepth
+ << "\nCmpBB depth: " << CmpBBDepth << '\n');
+ if (CmpBBDepth > HeadDepth + DelayLimit) {
+ DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit
+ << " cycles.\n");
+ return false;
+ }
+
+ // Check the resource depth at the bottom of CmpBB - these instructions will
+ // be speculated.
+ unsigned ResDepth = Trace.getResourceDepth(true);
+ DEBUG(dbgs() << "Resources: " << ResDepth << '\n');
+
+ // Heuristic: The speculatively executed instructions must all be able to
+ // merge into the Head block. The Head critical path should dominate the
+ // resource cost of the speculated instructions.
+ if (ResDepth > HeadDepth) {
+ DEBUG(dbgs() << "Too many instructions to speculate.\n");
+ return false;
+ }
+ return true;
+}
+
+bool ARM64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ while (CmpConv.canConvert(MBB) && shouldConvert()) {
+ invalidateTraces();
+ SmallVector<MachineBasicBlock *, 4> RemovedBlocks;
+ CmpConv.convert(RemovedBlocks);
+ Changed = true;
+ updateDomTree(RemovedBlocks);
+ updateLoops(RemovedBlocks);
+ }
+ return Changed;
+}
+
+bool ARM64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** ARM64 Conditional Compares **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ SchedModel =
+ MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
+ MRI = &MF.getRegInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+ Traces = &getAnalysis<MachineTraceMetrics>();
+ MinInstr = 0;
+ MinSize = MF.getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::MinSize);
+
+ bool Changed = false;
+ CmpConv.runOnMachineFunction(MF);
+
+ // Visit blocks in dominator tree pre-order. The pre-order enables multiple
+ // cmp-conversions from the same head block.
+ // Note that updateDomTree() modifies the children of the DomTree node
+ // currently being visited. The df_iterator supports that; it doesn't look at
+ // child_begin() / child_end() until after a node has been visited.
+ for (df_iterator<MachineDominatorTree *> I = df_begin(DomTree),
+ E = df_end(DomTree);
+ I != E; ++I)
+ if (tryConvert(I->getBlock()))
+ Changed = true;
+
+ return Changed;
+}
diff --git a/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp b/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp
new file mode 100644
index 0000000..3e410e5
--- /dev/null
+++ b/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp
@@ -0,0 +1,104 @@
+//===-- ARM64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// When allowed by the instruction, replace a dead definition of a GPR with
+// the zero register. This makes the code a bit friendlier towards the
+// hardware's register renamer.
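+//
+// For example (illustrative): in "subs w8, w0, w1" where w8 is dead, the dead
+// def can be rewritten to use wzr, i.e. "subs wzr, w0, w1" (a plain cmp).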
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm64-dead-defs"
+#include "ARM64.h"
+#include "ARM64RegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced");
+
+namespace {
+class ARM64DeadRegisterDefinitions : public MachineFunctionPass {
+private:
+ bool processMachineBasicBlock(MachineBasicBlock *MBB);
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ explicit ARM64DeadRegisterDefinitions() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &F);
+
+ const char *getPassName() const { return "Dead register definitions"; }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+char ARM64DeadRegisterDefinitions::ID = 0;
+} // end anonymous namespace
+
+bool
+ARM64DeadRegisterDefinitions::processMachineBasicBlock(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+ ++I) {
+ MachineInstr *MI = I;
+ for (int i = 0, e = MI->getDesc().getNumDefs(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDead() && MO.isDef()) {
+ assert(!MO.isImplicit() && "Unexpected implicit def!");
+ DEBUG(dbgs() << " Dead def operand #" << i << " in:\n ";
+ MI->print(dbgs()));
+ // Be careful not to change the register if it's a tied operand.
+ if (MI->isRegTiedToUseOperand(i)) {
+ DEBUG(dbgs() << " Ignoring, def is tied operand.\n");
+ continue;
+ }
+ // Make sure the instruction takes a register class that contains
+ // the zero register, and replace the operand if so.
+ unsigned NewReg;
+ switch (MI->getDesc().OpInfo[i].RegClass) {
+ default:
+ DEBUG(dbgs() << " Ignoring, register is not a GPR.\n");
+ continue;
+ case ARM64::GPR32RegClassID:
+ NewReg = ARM64::WZR;
+ break;
+ case ARM64::GPR64RegClassID:
+ NewReg = ARM64::XZR;
+ break;
+ }
+ DEBUG(dbgs() << " Replacing with zero register. New:\n ");
+ MO.setReg(NewReg);
+ DEBUG(MI->print(dbgs()));
+ ++NumDeadDefsReplaced;
+ }
+ }
+ }
+ return Changed;
+}
+
+// Scan the function for instructions that have a dead definition of a
+// register. Replace that register with the zero register when possible.
+bool ARM64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &mf) {
+ MachineFunction *MF = &mf;
+ bool Changed = false;
+ DEBUG(dbgs() << "***** ARM64DeadRegisterDefinitions *****\n");
+
+ for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
+ if (processMachineBasicBlock(I))
+ Changed = true;
+ return Changed;
+}
+
+FunctionPass *llvm::createARM64DeadRegisterDefinitions() {
+ return new ARM64DeadRegisterDefinitions();
+}
diff --git a/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp b/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp
new file mode 100644
index 0000000..e082baf
--- /dev/null
+++ b/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp
@@ -0,0 +1,737 @@
+//===-- ARM64ExpandPseudoInsts.cpp - Expand pseudo instructions ---*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling and other late optimizations. This
+// pass should be run after register allocation but before the post-regalloc
+// scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "ARM64InstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+namespace {
+class ARM64ExpandPseudo : public MachineFunctionPass {
+public:
+ static char ID;
+ ARM64ExpandPseudo() : MachineFunctionPass(ID) {}
+
+ const ARM64InstrInfo *TII;
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual const char *getPassName() const {
+ return "ARM64 pseudo instruction expansion pass";
+ }
+
+private:
+ bool expandMBB(MachineBasicBlock &MBB);
+ bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+ bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ unsigned BitSize);
+};
+char ARM64ExpandPseudo::ID = 0;
+}
+
+/// \brief Transfer implicit operands on the pseudo instruction to the
+/// instructions created from the expansion.
+static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
+ MachineInstrBuilder &DefMI) {
+ const MCInstrDesc &Desc = OldMI.getDesc();
+ for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e;
+ ++i) {
+ const MachineOperand &MO = OldMI.getOperand(i);
+ assert(MO.isReg() && MO.getReg());
+ if (MO.isUse())
+ UseMI.addOperand(MO);
+ else
+ DefMI.addOperand(MO);
+ }
+}
+
+/// \brief Helper function which extracts the specified 16-bit chunk from a
+/// 64-bit value.
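+/// For example, getChunk(0x123456789ABCDEF0, 2) == 0x5678.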
+static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
+ assert(ChunkIdx < 4 && "Out of range chunk index specified!");
+
+ return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
+}
+
+/// \brief Helper function which replicates a 16-bit chunk within a 64-bit
+/// value. Indices correspond to element numbers in a v4i16.
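+/// For example, replicateChunk(0x1111222233334444, 3, 1) == 0x1111222211114444.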
+static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) {
+ assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!");
+ const unsigned ShiftAmt = ToIdx * 16;
+
+ // Replicate the source chunk to the destination position.
+ const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt;
+ // Clear the destination chunk.
+ Imm &= ~(0xFFFFLL << ShiftAmt);
+ // Insert the replicated chunk.
+ return Imm | Chunk;
+}
+
+/// \brief Helper function which tries to materialize a 64-bit value with an
+/// ORR + MOVK instruction sequence.
+static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const ARM64InstrInfo *TII, unsigned ChunkIdx) {
+ assert(ChunkIdx < 4 && "Out of range chunk index specified!");
+ const unsigned ShiftAmt = ChunkIdx * 16;
+
+ uint64_t Encoding;
+ if (ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) {
+ // Create the ORR-immediate instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri))
+ .addOperand(MI.getOperand(0))
+ .addReg(ARM64::XZR)
+ .addImm(Encoding);
+
+ // Create the MOVK instruction.
+ const unsigned Imm16 = getChunk(UImm, ChunkIdx);
+ const unsigned DstReg = MI.getOperand(0).getReg();
+ const bool DstIsDead = MI.getOperand(0).isDead();
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt));
+
+ transferImpOps(MI, MIB, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width
+/// can be materialized with an ORR instruction.
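+/// For example, the chunk 0x00FF replicates to 0x00FF00FF00FF00FF, which is a
+/// valid logical immediate, while 0x1234 is not.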
+static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
+ Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
+
+ return ARM64_AM::processLogicalImmediate(Chunk, 64, Encoding);
+}
+
+/// \brief Check for identical 16-bit chunks within the constant and if so
+/// materialize them with a single ORR instruction. The remaining one or two
+/// 16-bit chunks will be materialized with MOVK instructions.
+///
+/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
+/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
+/// an ORR instruction.
+///
+static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const ARM64InstrInfo *TII) {
+ typedef DenseMap<uint64_t, unsigned> CountMap;
+ CountMap Counts;
+
+ // Scan the constant and count how often every chunk occurs.
+ for (unsigned Idx = 0; Idx < 4; ++Idx)
+ ++Counts[getChunk(UImm, Idx)];
+
+ // Traverse the chunks to find one which occurs more than once.
+ for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end();
+ Chunk != End; ++Chunk) {
+ const uint64_t ChunkVal = Chunk->first;
+ const unsigned Count = Chunk->second;
+
+ uint64_t Encoding = 0;
+
+ // We are looking for chunks which have two or three instances and can be
+ // materialized with an ORR instruction.
+ if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
+ continue;
+
+ const bool CountThree = Count == 3;
+ // Create the ORR-immediate instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri))
+ .addOperand(MI.getOperand(0))
+ .addReg(ARM64::XZR)
+ .addImm(Encoding);
+
+ const unsigned DstReg = MI.getOperand(0).getReg();
+ const bool DstIsDead = MI.getOperand(0).isDead();
+
+ unsigned ShiftAmt = 0;
+ uint64_t Imm16 = 0;
+ // Find the first chunk not materialized with the ORR instruction.
+ for (; ShiftAmt < 64; ShiftAmt += 16) {
+ Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
+
+ if (Imm16 != ChunkVal)
+ break;
+ }
+
+ // Create the first MOVK instruction.
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
+ .addReg(DstReg,
+ RegState::Define | getDeadRegState(DstIsDead && CountThree))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt));
+
+ // In case we have three instances the whole constant is now materialized
+ // and we can exit.
+ if (CountThree) {
+ transferImpOps(MI, MIB, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Find the remaining chunk which needs to be materialized.
+ for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
+ Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
+
+ if (Imm16 != ChunkVal)
+ break;
+ }
+
+ // Create the second MOVK instruction.
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt));
+
+ transferImpOps(MI, MIB, MIB2);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern
+/// starts a contiguous sequence of ones if we look at the bits from the LSB
+/// towards the MSB.
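+/// For example, 0xFFFFFFFFFFFFFF00 (a 16-bit chunk 0xFF00 after the caller's
+/// sign extension) is a start chunk.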
+static bool isStartChunk(uint64_t Chunk) {
+ if (Chunk == 0 || Chunk == UINT64_MAX)
+ return false;
+
+ return (CountLeadingOnes_64(Chunk) + countTrailingZeros(Chunk)) == 64;
+}
+
+/// \brief Check whether this chunk matches the pattern '0...1...'. This pattern
+/// ends a contiguous sequence of ones if we look at the bits from the LSB
+/// towards the MSB.
+static bool isEndChunk(uint64_t Chunk) {
+ if (Chunk == 0 || Chunk == UINT64_MAX)
+ return false;
+
+ return (countLeadingZeros(Chunk) + CountTrailingOnes_64(Chunk)) == 64;
+}
+
+/// \brief Clear or set all bits in the chunk at the given index.
+static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
+ const uint64_t Mask = 0xFFFF;
+
+ if (Clear)
+ // Clear chunk in the immediate.
+ Imm &= ~(Mask << (Idx * 16));
+ else
+ // Set all bits in the immediate for the particular chunk.
+ Imm |= Mask << (Idx * 16);
+
+ return Imm;
+}
+
+/// \brief Check whether the constant contains a sequence of contiguous ones,
+/// which might be interrupted by one or two chunks. If so, materialize the
+/// sequence of contiguous ones with an ORR instruction.
+/// Materialize the chunks which are either interrupting the sequence or outside
+/// of the sequence with a MOVK instruction.
+///
+/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk
+/// which ends the sequence (0...1...). Then we are looking for constants which
+/// contain at least one S and E chunk.
+/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|.
+///
+/// We are also looking for constants like |S|A|B|E| where the contiguous
+/// sequence of ones wraps around the MSB into the LSB.
+///
+static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const ARM64InstrInfo *TII) {
+ const int NotSet = -1;
+ const uint64_t Mask = 0xFFFF;
+
+ int StartIdx = NotSet;
+ int EndIdx = NotSet;
+ // Try to find the chunks which start/end a contiguous sequence of ones.
+ for (int Idx = 0; Idx < 4; ++Idx) {
+ int64_t Chunk = getChunk(UImm, Idx);
+ // Sign extend the 16-bit chunk to 64-bit.
+ Chunk = (Chunk << 48) >> 48;
+
+ if (isStartChunk(Chunk))
+ StartIdx = Idx;
+ else if (isEndChunk(Chunk))
+ EndIdx = Idx;
+ }
+
+ // Early exit in case we can't find a start/end chunk.
+ if (StartIdx == NotSet || EndIdx == NotSet)
+ return false;
+
+ // Outside of the contiguous sequence of ones everything needs to be zero.
+ uint64_t Outside = 0;
+ // Chunks between the start and end chunk need to have all their bits set.
+ uint64_t Inside = Mask;
+
+ // If our contiguous sequence of ones wraps around from the MSB into the LSB,
+ // just swap indices and pretend we are materializing a contiguous sequence
+ // of zeros surrounded by a contiguous sequence of ones.
+ if (StartIdx > EndIdx) {
+ std::swap(StartIdx, EndIdx);
+ std::swap(Outside, Inside);
+ }
+
+ uint64_t OrrImm = UImm;
+ int FirstMovkIdx = NotSet;
+ int SecondMovkIdx = NotSet;
+
+ // Find out which chunks we need to patch up to obtain a contiguous sequence
+ // of ones.
+ for (int Idx = 0; Idx < 4; ++Idx) {
+ const uint64_t Chunk = getChunk(UImm, Idx);
+
+ // Check whether we are looking at a chunk which is not part of the
+ // contiguous sequence of ones.
+ if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) {
+ OrrImm = updateImm(OrrImm, Idx, Outside == 0);
+
+ // Remember the index we need to patch.
+ if (FirstMovkIdx == NotSet)
+ FirstMovkIdx = Idx;
+ else
+ SecondMovkIdx = Idx;
+
+ // Check whether we are looking at a chunk which is part of the contiguous
+ // sequence of ones.
+ } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
+ OrrImm = updateImm(OrrImm, Idx, Inside != Mask);
+
+ // Remember the index we need to patch.
+ if (FirstMovkIdx == NotSet)
+ FirstMovkIdx = Idx;
+ else
+ SecondMovkIdx = Idx;
+ }
+ }
+ assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");
+
+ // Create the ORR-immediate instruction.
+ uint64_t Encoding = 0;
+ ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri))
+ .addOperand(MI.getOperand(0))
+ .addReg(ARM64::XZR)
+ .addImm(Encoding);
+
+ const unsigned DstReg = MI.getOperand(0).getReg();
+ const bool DstIsDead = MI.getOperand(0).isDead();
+
+ const bool SingleMovk = SecondMovkIdx == NotSet;
+ // Create the first MOVK instruction.
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
+ .addReg(DstReg,
+ RegState::Define | getDeadRegState(DstIsDead && SingleMovk))
+ .addReg(DstReg)
+ .addImm(getChunk(UImm, FirstMovkIdx))
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, FirstMovkIdx * 16));
+
+ // Early exit in case we only need to emit a single MOVK instruction.
+ if (SingleMovk) {
+ transferImpOps(MI, MIB, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Create the second MOVK instruction.
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addImm(getChunk(UImm, SecondMovkIdx))
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, SecondMovkIdx * 16));
+
+ transferImpOps(MI, MIB, MIB2);
+ MI.eraseFromParent();
+ return true;
+}
+
+/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
+/// real move-immediate instructions to synthesize the immediate.
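+/// For example (illustrative), the 64-bit immediate 0x0000123400005678 would be
+/// synthesized as "movz x0, #0x1234, lsl #32" followed by "movk x0, #0x5678".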
+bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned BitSize) {
+ MachineInstr &MI = *MBBI;
+ uint64_t Imm = MI.getOperand(1).getImm();
+ const unsigned Mask = 0xFFFF;
+
+ // Try a MOVI instruction (aka ORR-immediate with the zero register).
+ uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+ uint64_t Encoding;
+ if (ARM64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+ unsigned Opc = (BitSize == 32 ? ARM64::ORRWri : ARM64::ORRXri);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+ .addOperand(MI.getOperand(0))
+ .addReg(BitSize == 32 ? ARM64::WZR : ARM64::XZR)
+ .addImm(Encoding);
+ transferImpOps(MI, MIB, MIB);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Scan the immediate and count the number of 16-bit chunks which are either
+ // all ones or all zeros.
+ unsigned OneChunks = 0;
+ unsigned ZeroChunks = 0;
+ for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
+ const unsigned Chunk = (Imm >> Shift) & Mask;
+ if (Chunk == Mask)
+ OneChunks++;
+ else if (Chunk == 0)
+ ZeroChunks++;
+ }
+
+ // Since we can't materialize the constant with a single ORR instruction,
+ // let's see whether we can materialize 3/4 of the constant with an ORR
+ // instruction and use an additional MOVK instruction to materialize the
+ // remaining 1/4.
+ //
+ // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|.
+ //
+ // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR,
+ // we would create the following instruction sequence:
+ //
+ // ORR x0, xzr, |A|X|A|X|
+ // MOVK x0, |B|, LSL #16
+ //
+ // Only look at 64-bit constants which can't be materialized with a single
+ // instruction, i.e. which have fewer than three all-zero and fewer than
+ // three all-one chunks.
+ //
+ // Ignore 32-bit constants here; they can always be materialized with a
+ // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized
+ // with a single ORR, the best sequence we can achieve is an ORR + MOVK pair.
+ // Thus we fall back to the default code below, which in the best case creates
+ // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one).
+ //
+ if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) {
+ // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2
+ // identical?
+ if (getChunk(UImm, 0) == getChunk(UImm, 2)) {
+ // See if we can come up with a constant which can be materialized with
+ // ORR-immediate by replicating element 3 into element 1.
+ uint64_t OrrImm = replicateChunk(UImm, 3, 1);
+ if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1))
+ return true;
+
+ // See if we can come up with a constant which can be materialized with
+ // ORR-immediate by replicating element 1 into element 3.
+ OrrImm = replicateChunk(UImm, 1, 3);
+ if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3))
+ return true;
+
+ // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3
+ // identical?
+ } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) {
+ // See if we can come up with a constant which can be materialized with
+ // ORR-immediate by replicating element 2 into element 0.
+ uint64_t OrrImm = replicateChunk(UImm, 2, 0);
+ if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0))
+ return true;
+
+ // See if we can come up with a constant which can be materialized with
+ // ORR-immediate by replicating element 0 into element 2.
+ OrrImm = replicateChunk(UImm, 0, 2);
+ if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2))
+ return true;
+ }
+ }
+
+ // Check for identical 16-bit chunks within the constant and if so materialize
+ // them with a single ORR instruction. The remaining one or two 16-bit chunks
+ // will be materialized with MOVK instructions.
+ if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII))
+ return true;
+
+ // Check whether the constant contains a sequence of contiguous ones, which
+ // might be interrupted by one or two chunks. If so, materialize the sequence
+ // of contiguous ones with an ORR instruction. Materialize the chunks which
+ // are either interrupting the sequence or outside of the sequence with a
+ // MOVK instruction.
+ if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
+ return true;
+
+ // Use a MOVZ or MOVN instruction to set the high bits, followed by one or
+ // more MOVK instructions to insert additional 16-bit portions into the
+ // lower bits.
+ bool isNeg = false;
+
+ // Use MOVN to materialize the high bits if we have more all one chunks
+ // than all zero chunks.
+ if (OneChunks > ZeroChunks) {
+ isNeg = true;
+ Imm = ~Imm;
+ }
+
+ unsigned FirstOpc;
+ if (BitSize == 32) {
+ Imm &= (1LL << 32) - 1;
+ FirstOpc = (isNeg ? ARM64::MOVNWi : ARM64::MOVZWi);
+ } else {
+ FirstOpc = (isNeg ? ARM64::MOVNXi : ARM64::MOVZXi);
+ }
+ unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN
+ unsigned LastShift = 0; // LSL amount for last MOVK
+ if (Imm != 0) {
+ unsigned LZ = countLeadingZeros(Imm);
+ unsigned TZ = countTrailingZeros(Imm);
+ Shift = ((63 - LZ) / 16) * 16;
+ LastShift = (TZ / 16) * 16;
+ }
+ unsigned Imm16 = (Imm >> Shift) & Mask;
+ unsigned DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
+ .addReg(DstReg, RegState::Define |
+ getDeadRegState(DstIsDead && Shift == LastShift))
+ .addImm(Imm16)
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift));
+
+ // If a MOVN was used for the high bits of a negative value, flip the rest
+ // of the bits back for use with MOVK.
+ if (isNeg)
+ Imm = ~Imm;
+
+ if (Shift == LastShift) {
+ transferImpOps(MI, MIB1, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ MachineInstrBuilder MIB2;
+ unsigned Opc = (BitSize == 32 ? ARM64::MOVKWi : ARM64::MOVKXi);
+ while (Shift != LastShift) {
+ Shift -= 16;
+ Imm16 = (Imm >> Shift) & Mask;
+ if (Imm16 == (isNeg ? Mask : 0))
+ continue; // This 16-bit portion is already set correctly.
+ MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+ .addReg(DstReg,
+ RegState::Define |
+ getDeadRegState(DstIsDead && Shift == LastShift))
+ .addReg(DstReg)
+ .addImm(Imm16)
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift));
+ }
+
+ transferImpOps(MI, MIB1, MIB2);
+ MI.eraseFromParent();
+ return true;
+}
+
+/// \brief If MBBI references a pseudo instruction that should be expanded here,
+/// do the expansion and return true. Otherwise return false.
+bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ break;
+
+ case ARM64::ADDWrr:
+ case ARM64::SUBWrr:
+ case ARM64::ADDXrr:
+ case ARM64::SUBXrr:
+ case ARM64::ADDSWrr:
+ case ARM64::SUBSWrr:
+ case ARM64::ADDSXrr:
+ case ARM64::SUBSXrr:
+ case ARM64::ANDWrr:
+ case ARM64::ANDXrr:
+ case ARM64::BICWrr:
+ case ARM64::BICXrr:
+ case ARM64::EONWrr:
+ case ARM64::EONXrr:
+ case ARM64::EORWrr:
+ case ARM64::EORXrr:
+ case ARM64::ORNWrr:
+ case ARM64::ORNXrr:
+ case ARM64::ORRWrr:
+ case ARM64::ORRXrr: {
+ unsigned Opcode;
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case ARM64::ADDWrr: Opcode = ARM64::ADDWrs; break;
+ case ARM64::SUBWrr: Opcode = ARM64::SUBWrs; break;
+ case ARM64::ADDXrr: Opcode = ARM64::ADDXrs; break;
+ case ARM64::SUBXrr: Opcode = ARM64::SUBXrs; break;
+ case ARM64::ADDSWrr: Opcode = ARM64::ADDSWrs; break;
+ case ARM64::SUBSWrr: Opcode = ARM64::SUBSWrs; break;
+ case ARM64::ADDSXrr: Opcode = ARM64::ADDSXrs; break;
+ case ARM64::SUBSXrr: Opcode = ARM64::SUBSXrs; break;
+ case ARM64::ANDWrr: Opcode = ARM64::ANDWrs; break;
+ case ARM64::ANDXrr: Opcode = ARM64::ANDXrs; break;
+ case ARM64::BICWrr: Opcode = ARM64::BICWrs; break;
+ case ARM64::BICXrr: Opcode = ARM64::BICXrs; break;
+ case ARM64::EONWrr: Opcode = ARM64::EONWrs; break;
+ case ARM64::EONXrr: Opcode = ARM64::EONXrs; break;
+ case ARM64::EORWrr: Opcode = ARM64::EORWrs; break;
+ case ARM64::EORXrr: Opcode = ARM64::EORXrs; break;
+ case ARM64::ORNWrr: Opcode = ARM64::ORNWrs; break;
+ case ARM64::ORNXrr: Opcode = ARM64::ORNXrs; break;
+ case ARM64::ORRWrr: Opcode = ARM64::ORRWrs; break;
+ case ARM64::ORRXrr: Opcode = ARM64::ORRXrs; break;
+ }
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
+ MI.getOperand(0).getReg())
+ .addOperand(MI.getOperand(1))
+ .addOperand(MI.getOperand(2))
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0));
+ transferImpOps(MI, MIB1, MIB1);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case ARM64::FCVTSHpseudo: {
+ MachineOperand Src = MI.getOperand(1);
+ Src.setImplicit();
+ unsigned SrcH = TII->getRegisterInfo().getSubReg(Src.getReg(), ARM64::hsub);
+ auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::FCVTSHr))
+ .addOperand(MI.getOperand(0))
+ .addReg(SrcH, RegState::Undef)
+ .addOperand(Src);
+ transferImpOps(MI, MIB, MIB);
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM64::LOADgot: {
+ // Expand into ADRP + LDR.
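+ // For example, in ELF assembly syntax (illustrative):
+ //   adrp x0, :got:var
+ //   ldr  x0, [x0, :got_lo12:var]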
+ unsigned DstReg = MI.getOperand(0).getReg();
+ const MachineOperand &MO1 = MI.getOperand(1);
+ unsigned Flags = MO1.getTargetFlags();
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg);
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::LDRXui))
+ .addOperand(MI.getOperand(0))
+ .addReg(DstReg);
+
+ if (MO1.isGlobal()) {
+ MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | ARM64II::MO_PAGE);
+ MIB2.addGlobalAddress(MO1.getGlobal(), 0,
+ Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
+ } else if (MO1.isSymbol()) {
+ MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | ARM64II::MO_PAGE);
+ MIB2.addExternalSymbol(MO1.getSymbolName(),
+ Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
+ } else {
+ assert(MO1.isCPI() &&
+ "Only expect globals, externalsymbols, or constant pools");
+ MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
+ Flags | ARM64II::MO_PAGE);
+ MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
+ Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
+ }
+
+ transferImpOps(MI, MIB1, MIB2);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case ARM64::MOVaddr:
+ case ARM64::MOVaddrJT:
+ case ARM64::MOVaddrCP:
+ case ARM64::MOVaddrBA:
+ case ARM64::MOVaddrTLS:
+ case ARM64::MOVaddrEXT: {
+ // Expand into ADRP + ADD.
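+ // For example, in ELF assembly syntax (illustrative):
+ //   adrp x0, var
+ //   add  x0, x0, :lo12:var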
+ unsigned DstReg = MI.getOperand(0).getReg();
+ MachineInstrBuilder MIB1 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg)
+ .addOperand(MI.getOperand(1));
+
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADDXri))
+ .addOperand(MI.getOperand(0))
+ .addReg(DstReg)
+ .addOperand(MI.getOperand(2))
+ .addImm(0);
+
+ transferImpOps(MI, MIB1, MIB2);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case ARM64::MOVi32imm:
+ return expandMOVImm(MBB, MBBI, 32);
+ case ARM64::MOVi64imm:
+ return expandMOVImm(MBB, MBBI, 64);
+ case ARM64::RET_ReallyLR:
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::RET))
+ .addReg(ARM64::LR);
+ MI.eraseFromParent();
+ return true;
+ }
+ return false;
+}
+
+/// \brief Iterate over the instructions in basic block MBB and expand any
+/// pseudo instructions. Return true if anything was modified.
+bool ARM64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ Modified |= expandMI(MBB, MBBI);
+ MBBI = NMBBI;
+ }
+
+ return Modified;
+}
+
+bool ARM64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+ TII = static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
+
+ bool Modified = false;
+ for (auto &MBB : MF)
+ Modified |= expandMBB(MBB);
+ return Modified;
+}
+
+/// \brief Returns an instance of the pseudo instruction expansion pass.
+FunctionPass *llvm::createARM64ExpandPseudoPass() {
+ return new ARM64ExpandPseudo();
+}
diff --git a/lib/Target/ARM64/ARM64FastISel.cpp b/lib/Target/ARM64/ARM64FastISel.cpp
new file mode 100644
index 0000000..51b0f76
--- /dev/null
+++ b/lib/Target/ARM64/ARM64FastISel.cpp
@@ -0,0 +1,1929 @@
+//===-- ARM64FastISel.cpp - ARM64 FastISel implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM64-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// ARM64GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM64.h"
+#include "ARM64TargetMachine.h"
+#include "ARM64Subtarget.h"
+#include "ARM64CallingConv.h"
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+namespace {
+
+class ARM64FastISel : public FastISel {
+
+ class Address {
+ public:
+ typedef enum {
+ RegBase,
+ FrameIndexBase
+ } BaseKind;
+
+ private:
+ BaseKind Kind;
+ union {
+ unsigned Reg;
+ int FI;
+ } Base;
+ int64_t Offset;
+
+ public:
+ Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; }
+ void setKind(BaseKind K) { Kind = K; }
+ BaseKind getKind() const { return Kind; }
+ bool isRegBase() const { return Kind == RegBase; }
+ bool isFIBase() const { return Kind == FrameIndexBase; }
+ void setReg(unsigned Reg) {
+ assert(isRegBase() && "Invalid base register access!");
+ Base.Reg = Reg;
+ }
+ unsigned getReg() const {
+ assert(isRegBase() && "Invalid base register access!");
+ return Base.Reg;
+ }
+ void setFI(unsigned FI) {
+ assert(isFIBase() && "Invalid base frame index access!");
+ Base.FI = FI;
+ }
+ unsigned getFI() const {
+ assert(isFIBase() && "Invalid base frame index access!");
+ return Base.FI;
+ }
+ void setOffset(int64_t O) { Offset = O; }
+ int64_t getOffset() { return Offset; }
+
+ bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); }
+ };
+
+ /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const ARM64Subtarget *Subtarget;
+ LLVMContext *Context;
+
+private:
+ // Selection routines.
+ bool SelectLoad(const Instruction *I);
+ bool SelectStore(const Instruction *I);
+ bool SelectBranch(const Instruction *I);
+ bool SelectIndirectBr(const Instruction *I);
+ bool SelectCmp(const Instruction *I);
+ bool SelectSelect(const Instruction *I);
+ bool SelectFPExt(const Instruction *I);
+ bool SelectFPTrunc(const Instruction *I);
+ bool SelectFPToInt(const Instruction *I, bool Signed);
+ bool SelectIntToFP(const Instruction *I, bool Signed);
+ bool SelectRem(const Instruction *I, unsigned ISDOpcode);
+ bool SelectCall(const Instruction *I, const char *IntrMemName);
+ bool SelectIntrinsicCall(const IntrinsicInst &I);
+ bool SelectRet(const Instruction *I);
+ bool SelectTrunc(const Instruction *I);
+ bool SelectIntExt(const Instruction *I);
+ bool SelectMul(const Instruction *I);
+
+ // Utility helper routines.
+ bool isTypeLegal(Type *Ty, MVT &VT);
+ bool isLoadStoreTypeLegal(Type *Ty, MVT &VT);
+ bool ComputeAddress(const Value *Obj, Address &Addr);
+ bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor,
+ bool UseUnscaled);
+ void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
+ unsigned Flags, bool UseUnscaled);
+ bool IsMemCpySmall(uint64_t Len, unsigned Alignment);
+ bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+ unsigned Alignment);
+ // Emit functions.
+ bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt);
+ bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
+ bool UseUnscaled = false);
+ bool EmitStore(MVT VT, unsigned SrcReg, Address Addr,
+ bool UseUnscaled = false);
+ unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+ unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt);
+
+ unsigned ARM64MaterializeFP(const ConstantFP *CFP, MVT VT);
+ unsigned ARM64MaterializeGV(const GlobalValue *GV);
+
+ // Call handling routines.
+private:
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const;
+ bool ProcessCallArgs(SmallVectorImpl<Value *> &Args,
+ SmallVectorImpl<unsigned> &ArgRegs,
+ SmallVectorImpl<MVT> &ArgVTs,
+ SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+ SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC,
+ unsigned &NumBytes);
+ bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+ const Instruction *I, CallingConv::ID CC, unsigned &NumBytes);
+
+public:
+ // Backend specific FastISel code.
+ virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI);
+ virtual unsigned TargetMaterializeConstant(const Constant *C);
+
+ explicit ARM64FastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo)
+ : FastISel(funcInfo, libInfo) {
+ Subtarget = &TM.getSubtarget<ARM64Subtarget>();
+ Context = &funcInfo.Fn->getContext();
+ }
+
+ virtual bool TargetSelectInstruction(const Instruction *I);
+
+#include "ARM64GenFastISel.inc"
+};
+
+} // end anonymous namespace
+
+#include "ARM64GenCallingConv.inc"
+
+CCAssignFn *ARM64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
+ if (CC == CallingConv::WebKit_JS)
+ return CC_ARM64_WebKit_JS;
+ return Subtarget->isTargetDarwin() ? CC_ARM64_DarwinPCS : CC_ARM64_AAPCS;
+}
+
+unsigned ARM64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+ assert(TLI.getValueType(AI->getType(), true) == MVT::i64 &&
+ "Alloca should always return a pointer.");
+
+ // Don't handle dynamic allocas.
+ if (!FuncInfo.StaticAllocaMap.count(AI))
+ return 0;
+
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ unsigned ResultReg = createResultReg(&ARM64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADDXri),
+ ResultReg)
+ .addFrameIndex(SI->second)
+ .addImm(0)
+ .addImm(0);
+ return ResultReg;
+ }
+
+ return 0;
+}
+
+unsigned ARM64FastISel::ARM64MaterializeFP(const ConstantFP *CFP, MVT VT) {
+ const APFloat Val = CFP->getValueAPF();
+ bool is64bit = (VT == MVT::f64);
+
+ // This checks to see if we can use FMOV instructions to materialize
+ // a constant; otherwise we have to materialize via the constant pool.
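+ // For example (illustrative), 1.0 and 2.5 fit the 8-bit FMOV immediate
+ // encoding, while 0.1 does not and must be loaded from the constant pool.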
+ if (TLI.isFPImmLegal(Val, VT)) {
+ int Imm;
+ unsigned Opc;
+ if (is64bit) {
+ Imm = ARM64_AM::getFP64Imm(Val);
+ Opc = ARM64::FMOVDi;
+ } else {
+ Imm = ARM64_AM::getFP32Imm(Val);
+ Opc = ARM64::FMOVSi;
+ }
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addImm(Imm);
+ return ResultReg;
+ }
+
+ // Materialize via constant pool. MachineConstantPool wants an explicit
+ // alignment.
+ unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+ if (Align == 0)
+ Align = DL.getTypeAllocSize(CFP->getType());
+
+ unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+ unsigned ADRPReg = createResultReg(&ARM64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP),
+ ADRPReg).addConstantPoolIndex(Idx, 0, ARM64II::MO_PAGE);
+
+ unsigned Opc = is64bit ? ARM64::LDRDui : ARM64::LDRSui;
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(ADRPReg)
+ .addConstantPoolIndex(Idx, 0, ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
+ return ResultReg;
+}
+
+unsigned ARM64FastISel::ARM64MaterializeGV(const GlobalValue *GV) {
+ // We can't handle thread-local variables quickly yet. Unfortunately we have
+ // to peer through any aliases to find out if that rule applies.
+ const GlobalValue *TLSGV = GV;
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ TLSGV = GA->getAliasedGlobal();
+
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(TLSGV))
+ if (GVar->isThreadLocal())
+ return 0;
+
+ unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+ EVT DestEVT = TLI.getValueType(GV->getType(), true);
+ if (!DestEVT.isSimple())
+ return 0;
+ MVT DestVT = DestEVT.getSimpleVT();
+
+ unsigned ADRPReg = createResultReg(&ARM64::GPR64RegClass);
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+
+ if (OpFlags & ARM64II::MO_GOT) {
+ // ADRP + LDRX
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP),
+ ADRPReg)
+ .addGlobalAddress(GV, 0, ARM64II::MO_GOT | ARM64II::MO_PAGE);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::LDRXui),
+ ResultReg)
+ .addReg(ADRPReg)
+ .addGlobalAddress(GV, 0, ARM64II::MO_GOT | ARM64II::MO_PAGEOFF |
+ ARM64II::MO_NC);
+ } else {
+ // ADRP + ADDX
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP),
+ ADRPReg).addGlobalAddress(GV, 0, ARM64II::MO_PAGE);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADDXri),
+ ResultReg)
+ .addReg(ADRPReg)
+ .addGlobalAddress(GV, 0, ARM64II::MO_PAGEOFF | ARM64II::MO_NC)
+ .addImm(0);
+ }
+ return ResultReg;
+}
+
+unsigned ARM64FastISel::TargetMaterializeConstant(const Constant *C) {
+ EVT CEVT = TLI.getValueType(C->getType(), true);
+
+ // Only handle simple types.
+ if (!CEVT.isSimple())
+ return 0;
+ MVT VT = CEVT.getSimpleVT();
+
+ // FIXME: Handle ConstantInt.
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+ return ARM64MaterializeFP(CFP, VT);
+ else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ return ARM64MaterializeGV(GV);
+
+ return 0;
+}
+
+// Computes the address to get to an object.
+bool ARM64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
+ const User *U = NULL;
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+ // Don't walk into other basic blocks unless the object is an alloca from
+ // another block; otherwise it may not have a virtual register assigned.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
+ if (Ty->getAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ switch (Opcode) {
+ default:
+ break;
+ case Instruction::BitCast: {
+ // Look through bitcasts.
+ return ComputeAddress(U->getOperand(0), Addr);
+ }
+ case Instruction::IntToPtr: {
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ return ComputeAddress(U->getOperand(0), Addr);
+ break;
+ }
+ case Instruction::PtrToInt: {
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ return ComputeAddress(U->getOperand(0), Addr);
+ break;
+ }
+ case Instruction::GetElementPtr: {
+ Address SavedAddr = Addr;
+ uint64_t TmpOffset = Addr.getOffset();
+
+ // Iterate through the GEP folding the constants into offsets where
+ // we can.
+ gep_type_iterator GTI = gep_type_begin(U);
+ for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e;
+ ++i, ++GTI) {
+ const Value *Op = *i;
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+ TmpOffset += SL->getElementOffset(Idx);
+ } else {
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ TmpOffset += CI->getSExtValue() * S;
+ break;
+ }
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ TmpOffset += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ // Unsupported
+ goto unsupported_gep;
+ }
+ }
+ }
+
+ // Try to grab the base operand now.
+ Addr.setOffset(TmpOffset);
+ if (ComputeAddress(U->getOperand(0), Addr))
+ return true;
+
+ // We failed, restore everything and try the other options.
+ Addr = SavedAddr;
+
+ unsupported_gep:
+ break;
+ }
+ case Instruction::Alloca: {
+ const AllocaInst *AI = cast<AllocaInst>(Obj);
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ Addr.setKind(Address::FrameIndexBase);
+ Addr.setFI(SI->second);
+ return true;
+ }
+ break;
+ }
+ }
+
+ // Try to get this in a register if nothing else has worked.
+ if (!Addr.isValid())
+ Addr.setReg(getRegForValue(Obj));
+ return Addr.isValid();
+}
+
+bool ARM64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
+ EVT evt = TLI.getValueType(Ty, true);
+
+ // Only handle simple types.
+ if (evt == MVT::Other || !evt.isSimple())
+ return false;
+ VT = evt.getSimpleVT();
+
+ // Handle all legal types, i.e. a register that will directly hold this
+ // value.
+ return TLI.isTypeLegal(VT);
+}
+
+bool ARM64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) {
+ if (isTypeLegal(Ty, VT))
+ return true;
+
+ // If this is a type that can be sign- or zero-extended to a basic operation,
+ // go ahead and accept it now. For stores, this reflects truncation.
+ if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+ return true;
+
+ return false;
+}
+
+bool ARM64FastISel::SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor,
+ bool UseUnscaled) {
+ bool needsLowering = false;
+ int64_t Offset = Addr.getOffset();
+ switch (VT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f32:
+ case MVT::f64:
+ if (!UseUnscaled)
+ // Using scaled, 12-bit, unsigned immediate offsets.
+ needsLowering = ((Offset & 0xfff) != Offset);
+ else
+ // Using unscaled, 9-bit, signed immediate offsets.
+ needsLowering = (Offset > 256 || Offset < -256);
+ break;
+ }
+
+ // FIXME: If this is a stack pointer and the offset needs to be simplified
+ // then put the alloca address into a register, set the base type back to
+ // register and continue. This should almost never happen.
+ if (needsLowering && Addr.getKind() == Address::FrameIndexBase) {
+ return false;
+ }
+
+ // Since the offset is too large for the load/store instruction, get the
+ // reg+offset into a register.
+ if (needsLowering) {
+ uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor;
+ unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false,
+ UnscaledOffset, MVT::i64);
+ if (ResultReg == 0)
+ return false;
+ Addr.setReg(ResultReg);
+ Addr.setOffset(0);
+ }
+ return true;
+}
+
+void ARM64FastISel::AddLoadStoreOperands(Address &Addr,
+ const MachineInstrBuilder &MIB,
+ unsigned Flags, bool UseUnscaled) {
+ int64_t Offset = Addr.getOffset();
+ // Frame base works a bit differently. Handle it separately.
+ if (Addr.getKind() == Address::FrameIndexBase) {
+ int FI = Addr.getFI();
+ // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size
+ // and alignment should be based on the VT.
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ // Now add the rest of the operands.
+ MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
+ } else {
+ // Now add the rest of the operands.
+ MIB.addReg(Addr.getReg());
+ MIB.addImm(Offset);
+ }
+}
+
+bool ARM64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
+ bool UseUnscaled) {
+ // Negative offsets require unscaled, 9-bit, signed immediate offsets.
+ // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
+ if (!UseUnscaled && Addr.getOffset() < 0)
+ UseUnscaled = true;
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ bool VTIsi1 = false;
+ int64_t ScaleFactor = 0;
+ switch (VT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i1:
+ VTIsi1 = true;
+ // Intentional fall-through.
+ case MVT::i8:
+ Opc = UseUnscaled ? ARM64::LDURBBi : ARM64::LDRBBui;
+ RC = &ARM64::GPR32RegClass;
+ ScaleFactor = 1;
+ break;
+ case MVT::i16:
+ Opc = UseUnscaled ? ARM64::LDURHHi : ARM64::LDRHHui;
+ RC = &ARM64::GPR32RegClass;
+ ScaleFactor = 2;
+ break;
+ case MVT::i32:
+ Opc = UseUnscaled ? ARM64::LDURWi : ARM64::LDRWui;
+ RC = &ARM64::GPR32RegClass;
+ ScaleFactor = 4;
+ break;
+ case MVT::i64:
+ Opc = UseUnscaled ? ARM64::LDURXi : ARM64::LDRXui;
+ RC = &ARM64::GPR64RegClass;
+ ScaleFactor = 8;
+ break;
+ case MVT::f32:
+ Opc = UseUnscaled ? ARM64::LDURSi : ARM64::LDRSui;
+ RC = TLI.getRegClassFor(VT);
+ ScaleFactor = 4;
+ break;
+ case MVT::f64:
+ Opc = UseUnscaled ? ARM64::LDURDi : ARM64::LDRDui;
+ RC = TLI.getRegClassFor(VT);
+ ScaleFactor = 8;
+ break;
+ }
+ // Scale the offset.
+ if (!UseUnscaled) {
+ int64_t Offset = Addr.getOffset();
+ if (Offset & (ScaleFactor - 1))
+ // Retry using an unscaled, 9-bit, signed immediate offset.
+ return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true);
+
+ Addr.setOffset(Offset / ScaleFactor);
+ }
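+  // For example, an i32 load at byte offset 6 is not 4-byte aligned and falls
+  // back to the unscaled (LDUR) form, while byte offset 8 becomes a scaled
+  // immediate of 2.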
+
+ // Simplify this down to something we can handle.
+ if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled))
+ return false;
+
+ // Create the base instruction, then add the operands.
+ ResultReg = createResultReg(RC);
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg);
+ AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled);
+
+ // Loading an i1 requires special handling.
+ if (VTIsi1) {
+ unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
+ ANDReg)
+ .addReg(ResultReg)
+ .addImm(ARM64_AM::encodeLogicalImmediate(1, 32));
+ ResultReg = ANDReg;
+ }
+ return true;
+}
+
+bool ARM64FastISel::SelectLoad(const Instruction *I) {
+ MVT VT;
+ // Verify we have a legal type before going any further. Currently, we handle
+ // simple types that will directly fit in a register (i32/f32/i64/f64) or
+ // those that can be sign or zero-extended to a basic operation (i1/i8/i16).
+ if (!isLoadStoreTypeLegal(I->getType(), VT) || cast<LoadInst>(I)->isAtomic())
+ return false;
+
+ // See if we can handle this address.
+ Address Addr;
+ if (!ComputeAddress(I->getOperand(0), Addr))
+ return false;
+
+ unsigned ResultReg;
+ if (!EmitLoad(VT, ResultReg, Addr))
+ return false;
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool ARM64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr,
+ bool UseUnscaled) {
+ // Negative offsets require unscaled, 9-bit, signed immediate offsets.
+ // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
+ if (!UseUnscaled && Addr.getOffset() < 0)
+ UseUnscaled = true;
+
+ unsigned StrOpc;
+ bool VTIsi1 = false;
+ int64_t ScaleFactor = 0;
+ // Using scaled, 12-bit, unsigned immediate offsets.
+ switch (VT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i1:
+ VTIsi1 = true;
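+    // Intentional fall-through.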
+ case MVT::i8:
+ StrOpc = UseUnscaled ? ARM64::STURBBi : ARM64::STRBBui;
+ ScaleFactor = 1;
+ break;
+ case MVT::i16:
+ StrOpc = UseUnscaled ? ARM64::STURHHi : ARM64::STRHHui;
+ ScaleFactor = 2;
+ break;
+ case MVT::i32:
+ StrOpc = UseUnscaled ? ARM64::STURWi : ARM64::STRWui;
+ ScaleFactor = 4;
+ break;
+ case MVT::i64:
+ StrOpc = UseUnscaled ? ARM64::STURXi : ARM64::STRXui;
+ ScaleFactor = 8;
+ break;
+ case MVT::f32:
+ StrOpc = UseUnscaled ? ARM64::STURSi : ARM64::STRSui;
+ ScaleFactor = 4;
+ break;
+ case MVT::f64:
+ StrOpc = UseUnscaled ? ARM64::STURDi : ARM64::STRDui;
+ ScaleFactor = 8;
+ break;
+ }
+ // Scale the offset.
+ if (!UseUnscaled) {
+ int64_t Offset = Addr.getOffset();
+ if (Offset & (ScaleFactor - 1))
+ // Retry using an unscaled, 9-bit, signed immediate offset.
+ return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true);
+
+ Addr.setOffset(Offset / ScaleFactor);
+ }
+
+ // Simplify this down to something we can handle.
+ if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled))
+ return false;
+
+ // Storing an i1 requires special handling.
+ if (VTIsi1) {
+ unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
+ ANDReg)
+ .addReg(SrcReg)
+ .addImm(ARM64_AM::encodeLogicalImmediate(1, 32));
+ SrcReg = ANDReg;
+ }
+ // Create the base instruction, then add the operands.
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(StrOpc)).addReg(SrcReg);
+ AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled);
+ return true;
+}
+
+bool ARM64FastISel::SelectStore(const Instruction *I) {
+ MVT VT;
+ Value *Op0 = I->getOperand(0);
+ // Verify we have a legal type before going any further. Currently, we handle
+ // simple types that will directly fit in a register (i32/f32/i64/f64) or
+ // those that can be sign or zero-extended to a basic operation (i1/i8/i16).
+ if (!isLoadStoreTypeLegal(Op0->getType(), VT) ||
+ cast<StoreInst>(I)->isAtomic())
+ return false;
+
+ // Get the value to be stored into a register.
+ unsigned SrcReg = getRegForValue(Op0);
+ if (SrcReg == 0)
+ return false;
+
+ // See if we can handle this address.
+ Address Addr;
+ if (!ComputeAddress(I->getOperand(1), Addr))
+ return false;
+
+ if (!EmitStore(VT, SrcReg, Addr))
+ return false;
+ return true;
+}
+
+static ARM64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {
+ switch (Pred) {
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UEQ:
+ default:
+ // AL is our "false" for now. The other two need more compares.
+ return ARM64CC::AL;
+ case CmpInst::ICMP_EQ:
+ case CmpInst::FCMP_OEQ:
+ return ARM64CC::EQ;
+ case CmpInst::ICMP_SGT:
+ case CmpInst::FCMP_OGT:
+ return ARM64CC::GT;
+ case CmpInst::ICMP_SGE:
+ case CmpInst::FCMP_OGE:
+ return ARM64CC::GE;
+ case CmpInst::ICMP_UGT:
+ case CmpInst::FCMP_UGT:
+ return ARM64CC::HI;
+ case CmpInst::FCMP_OLT:
+ return ARM64CC::MI;
+ case CmpInst::ICMP_ULE:
+ case CmpInst::FCMP_OLE:
+ return ARM64CC::LS;
+ case CmpInst::FCMP_ORD:
+ return ARM64CC::VC;
+ case CmpInst::FCMP_UNO:
+ return ARM64CC::VS;
+ case CmpInst::FCMP_UGE:
+ return ARM64CC::PL;
+ case CmpInst::ICMP_SLT:
+ case CmpInst::FCMP_ULT:
+ return ARM64CC::LT;
+ case CmpInst::ICMP_SLE:
+ case CmpInst::FCMP_ULE:
+ return ARM64CC::LE;
+ case CmpInst::FCMP_UNE:
+ case CmpInst::ICMP_NE:
+ return ARM64CC::NE;
+ case CmpInst::ICMP_UGE:
+ return ARM64CC::CS;
+ case CmpInst::ICMP_ULT:
+ return ARM64CC::CC;
+ }
+}
+
+bool ARM64FastISel::SelectBranch(const Instruction *I) {
+ const BranchInst *BI = cast<BranchInst>(I);
+ MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+ if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+ if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
+ // We may not handle every CC for now.
+ ARM64CC::CondCode CC = getCompareCC(CI->getPredicate());
+ if (CC == ARM64CC::AL)
+ return false;
+
+ // Emit the cmp.
+ if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ return false;
+
+ // Emit the branch.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc))
+ .addImm(CC)
+ .addMBB(TBB);
+ FuncInfo.MBB->addSuccessor(TBB);
+
+ FastEmitBranch(FBB, DbgLoc);
+ return true;
+ }
+ } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+ MVT SrcVT;
+ if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+ (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) {
+ unsigned CondReg = getRegForValue(TI->getOperand(0));
+ if (CondReg == 0)
+ return false;
+
+ // Issue an extract_subreg to get the lower 32-bits.
+ if (SrcVT == MVT::i64)
+ CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true,
+ ARM64::sub_32);
+
+ unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
+ ANDReg)
+ .addReg(CondReg)
+ .addImm(ARM64_AM::encodeLogicalImmediate(1, 32));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri))
+ .addReg(ANDReg)
+ .addReg(ANDReg)
+ .addImm(0)
+ .addImm(0);
+
+ unsigned CC = ARM64CC::NE;
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+ std::swap(TBB, FBB);
+ CC = ARM64CC::EQ;
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc))
+ .addImm(CC)
+ .addMBB(TBB);
+ FuncInfo.MBB->addSuccessor(TBB);
+ FastEmitBranch(FBB, DbgLoc);
+ return true;
+ }
+ } else if (const ConstantInt *CI =
+ dyn_cast<ConstantInt>(BI->getCondition())) {
+ uint64_t Imm = CI->getZExtValue();
+ MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::B))
+ .addMBB(Target);
+ FuncInfo.MBB->addSuccessor(Target);
+ return true;
+ }
+
+ unsigned CondReg = getRegForValue(BI->getCondition());
+ if (CondReg == 0)
+ return false;
+
+ // We've been divorced from our compare! Our block was split, and
+  // now our compare lives in a predecessor block. We mustn't
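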
+ // re-compare here, as the children of the compare aren't guaranteed
+ // live across the block boundary (we *could* check for this).
+ // Regardless, the compare has been done in the predecessor block,
+ // and it left a value for us in a virtual register. Ergo, we test
+ // the one-bit value left in the virtual register.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri),
+ ARM64::WZR)
+ .addReg(CondReg)
+ .addImm(0)
+ .addImm(0);
+
+ unsigned CC = ARM64CC::NE;
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+ std::swap(TBB, FBB);
+ CC = ARM64CC::EQ;
+ }
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc))
+ .addImm(CC)
+ .addMBB(TBB);
+ FuncInfo.MBB->addSuccessor(TBB);
+ FastEmitBranch(FBB, DbgLoc);
+ return true;
+}
+
+bool ARM64FastISel::SelectIndirectBr(const Instruction *I) {
+ const IndirectBrInst *BI = cast<IndirectBrInst>(I);
+ unsigned AddrReg = getRegForValue(BI->getOperand(0));
+ if (AddrReg == 0)
+ return false;
+
+ // Emit the indirect branch.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BR))
+ .addReg(AddrReg);
+
+ // Make sure the CFG is up-to-date.
+ for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i)
+ FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]);
+
+ return true;
+}
+
+bool ARM64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) {
+ Type *Ty = Src1Value->getType();
+ EVT SrcEVT = TLI.getValueType(Ty, true);
+ if (!SrcEVT.isSimple())
+ return false;
+ MVT SrcVT = SrcEVT.getSimpleVT();
+
+ // Check to see if the 2nd operand is a constant that we can encode directly
+ // in the compare.
+ uint64_t Imm;
+ bool UseImm = false;
+ bool isNegativeImm = false;
+ if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
+ if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
+ SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+ const APInt &CIVal = ConstInt->getValue();
+
+ Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue();
+ if (CIVal.isNegative()) {
+ isNegativeImm = true;
+ Imm = -Imm;
+ }
+ // FIXME: We can handle more immediates using shifts.
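+      // For now, only unsigned immediates that fit in 12 bits (0..4095) are
+      // encoded directly in the compare.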
+ UseImm = ((Imm & 0xfff) == Imm);
+ }
+ } else if (const ConstantFP *ConstFP = dyn_cast<ConstantFP>(Src2Value)) {
+ if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
+ if (ConstFP->isZero() && !ConstFP->isNegative())
+ UseImm = true;
+ }
+
+ unsigned ZReg;
+ unsigned CmpOpc;
+ bool isICmp = true;
+ bool needsExt = false;
+ switch (SrcVT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ needsExt = true;
+ // Intentional fall-through.
+ case MVT::i32:
+ ZReg = ARM64::WZR;
+ if (UseImm)
+ CmpOpc = isNegativeImm ? ARM64::ADDSWri : ARM64::SUBSWri;
+ else
+ CmpOpc = ARM64::SUBSWrr;
+ break;
+ case MVT::i64:
+ ZReg = ARM64::XZR;
+ if (UseImm)
+ CmpOpc = isNegativeImm ? ARM64::ADDSXri : ARM64::SUBSXri;
+ else
+ CmpOpc = ARM64::SUBSXrr;
+ break;
+ case MVT::f32:
+ isICmp = false;
+ CmpOpc = UseImm ? ARM64::FCMPSri : ARM64::FCMPSrr;
+ break;
+ case MVT::f64:
+ isICmp = false;
+ CmpOpc = UseImm ? ARM64::FCMPDri : ARM64::FCMPDrr;
+ break;
+ }
+
+ unsigned SrcReg1 = getRegForValue(Src1Value);
+ if (SrcReg1 == 0)
+ return false;
+
+ unsigned SrcReg2;
+ if (!UseImm) {
+ SrcReg2 = getRegForValue(Src2Value);
+ if (SrcReg2 == 0)
+ return false;
+ }
+
+  // We have an i1, i8, or i16; we need to either zero-extend or sign-extend.
+ if (needsExt) {
+ SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt);
+ if (SrcReg1 == 0)
+ return false;
+ if (!UseImm) {
+ SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt);
+ if (SrcReg2 == 0)
+ return false;
+ }
+ }
+
+ if (isICmp) {
+ if (UseImm)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+ .addReg(ZReg)
+ .addReg(SrcReg1)
+ .addImm(Imm)
+ .addImm(0);
+ else
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+ .addReg(ZReg)
+ .addReg(SrcReg1)
+ .addReg(SrcReg2);
+ } else {
+ if (UseImm)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+ .addReg(SrcReg1);
+ else
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+ .addReg(SrcReg1)
+ .addReg(SrcReg2);
+ }
+ return true;
+}
+
+bool ARM64FastISel::SelectCmp(const Instruction *I) {
+ const CmpInst *CI = cast<CmpInst>(I);
+
+ // We may not handle every CC for now.
+ ARM64CC::CondCode CC = getCompareCC(CI->getPredicate());
+ if (CC == ARM64CC::AL)
+ return false;
+
+ // Emit the cmp.
+ if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ return false;
+
+ // Now set a register based on the comparison.
+ ARM64CC::CondCode invertedCC = getInvertedCondCode(CC);
+ unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::CSINCWr),
+ ResultReg)
+ .addReg(ARM64::WZR)
+ .addReg(ARM64::WZR)
+ .addImm(invertedCC);
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool ARM64FastISel::SelectSelect(const Instruction *I) {
+ const SelectInst *SI = cast<SelectInst>(I);
+
+ EVT DestEVT = TLI.getValueType(SI->getType(), true);
+ if (!DestEVT.isSimple())
+ return false;
+
+ MVT DestVT = DestEVT.getSimpleVT();
+ if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 &&
+ DestVT != MVT::f64)
+ return false;
+
+ unsigned CondReg = getRegForValue(SI->getCondition());
+ if (CondReg == 0)
+ return false;
+ unsigned TrueReg = getRegForValue(SI->getTrueValue());
+ if (TrueReg == 0)
+ return false;
+ unsigned FalseReg = getRegForValue(SI->getFalseValue());
+ if (FalseReg == 0)
+ return false;
+
+ unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
+ ANDReg)
+ .addReg(CondReg)
+ .addImm(ARM64_AM::encodeLogicalImmediate(1, 32));
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri))
+ .addReg(ANDReg)
+ .addReg(ANDReg)
+ .addImm(0)
+ .addImm(0);
+
+ unsigned SelectOpc;
+ switch (DestVT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i32:
+ SelectOpc = ARM64::CSELWr;
+ break;
+ case MVT::i64:
+ SelectOpc = ARM64::CSELXr;
+ break;
+ case MVT::f32:
+ SelectOpc = ARM64::FCSELSrrr;
+ break;
+ case MVT::f64:
+ SelectOpc = ARM64::FCSELDrrr;
+ break;
+ }
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc),
+ ResultReg)
+ .addReg(TrueReg)
+ .addReg(FalseReg)
+ .addImm(ARM64CC::NE);
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool ARM64FastISel::SelectFPExt(const Instruction *I) {
+ Value *V = I->getOperand(0);
+ if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy())
+ return false;
+
+ unsigned Op = getRegForValue(V);
+ if (Op == 0)
+ return false;
+
+ unsigned ResultReg = createResultReg(&ARM64::FPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::FCVTDSr),
+ ResultReg).addReg(Op);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool ARM64FastISel::SelectFPTrunc(const Instruction *I) {
+ Value *V = I->getOperand(0);
+ if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy())
+ return false;
+
+ unsigned Op = getRegForValue(V);
+ if (Op == 0)
+ return false;
+
+ unsigned ResultReg = createResultReg(&ARM64::FPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::FCVTSDr),
+ ResultReg).addReg(Op);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+// FPToUI and FPToSI
+bool ARM64FastISel::SelectFPToInt(const Instruction *I, bool Signed) {
+ MVT DestVT;
+ if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
+ return false;
+
+ unsigned SrcReg = getRegForValue(I->getOperand(0));
+ if (SrcReg == 0)
+ return false;
+
+ EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true);
+
+ unsigned Opc;
+ if (SrcVT == MVT::f64) {
+ if (Signed)
+ Opc = (DestVT == MVT::i32) ? ARM64::FCVTZSUWDr : ARM64::FCVTZSUXDr;
+ else
+ Opc = (DestVT == MVT::i32) ? ARM64::FCVTZUUWDr : ARM64::FCVTZUUXDr;
+ } else {
+ if (Signed)
+ Opc = (DestVT == MVT::i32) ? ARM64::FCVTZSUWSr : ARM64::FCVTZSUXSr;
+ else
+ Opc = (DestVT == MVT::i32) ? ARM64::FCVTZUUWSr : ARM64::FCVTZUUXSr;
+ }
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(SrcReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool ARM64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
+ MVT DestVT;
+ if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
+ return false;
+
+ unsigned SrcReg = getRegForValue(I->getOperand(0));
+ if (SrcReg == 0)
+ return false;
+
+ EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true);
+
+ // Handle sign-extension.
+ if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+ SrcReg =
+ EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed);
+ if (SrcReg == 0)
+ return false;
+ }
+
+ unsigned Opc;
+ if (SrcVT == MVT::i64) {
+ if (Signed)
+ Opc = (DestVT == MVT::f32) ? ARM64::SCVTFUXSri : ARM64::SCVTFUXDri;
+ else
+ Opc = (DestVT == MVT::f32) ? ARM64::UCVTFUXSri : ARM64::UCVTFUXDri;
+ } else {
+ if (Signed)
+ Opc = (DestVT == MVT::f32) ? ARM64::SCVTFUWSri : ARM64::SCVTFUWDri;
+ else
+ Opc = (DestVT == MVT::f32) ? ARM64::UCVTFUWSri : ARM64::UCVTFUWDri;
+ }
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(SrcReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl<Value *> &Args,
+ SmallVectorImpl<unsigned> &ArgRegs,
+ SmallVectorImpl<MVT> &ArgVTs,
+ SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+ SmallVectorImpl<unsigned> &RegArgs,
+ CallingConv::ID CC, unsigned &NumBytes) {
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context);
+ CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC));
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ NumBytes = CCInfo.getNextStackOffset();
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
+ .addImm(NumBytes);
+
+ // Process the args.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ unsigned Arg = ArgRegs[VA.getValNo()];
+ MVT ArgVT = ArgVTs[VA.getValNo()];
+
+ // Handle arg promotion: SExt, ZExt, AExt.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt: {
+ MVT DestVT = VA.getLocVT();
+ MVT SrcVT = ArgVT;
+ Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false);
+ if (Arg == 0)
+ return false;
+ ArgVT = DestVT;
+ break;
+ }
+ case CCValAssign::AExt:
+ // Intentional fall-through.
+ case CCValAssign::ZExt: {
+ MVT DestVT = VA.getLocVT();
+ MVT SrcVT = ArgVT;
+ Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true);
+ if (Arg == 0)
+ return false;
+ ArgVT = DestVT;
+ break;
+ }
+ default:
+ llvm_unreachable("Unknown arg promotion!");
+ }
+
+ // Now copy/store arg to correct locations.
+ if (VA.isRegLoc() && !VA.needsCustom()) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
+ RegArgs.push_back(VA.getLocReg());
+ } else if (VA.needsCustom()) {
+ // FIXME: Handle custom args.
+ return false;
+ } else {
+ assert(VA.isMemLoc() && "Assuming store on stack.");
+
+ // Need to store on the stack.
+ Address Addr;
+ Addr.setKind(Address::RegBase);
+ Addr.setReg(ARM64::SP);
+ Addr.setOffset(VA.getLocMemOffset());
+
+ if (!EmitStore(ArgVT, Arg, Addr))
+ return false;
+ }
+ }
+ return true;
+}
+
+bool ARM64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+ const Instruction *I, CallingConv::ID CC,
+ unsigned &NumBytes) {
+ // Issue CALLSEQ_END
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+ .addImm(NumBytes)
+ .addImm(0);
+
+ // Now the return value.
+ if (RetVT != MVT::isVoid) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+ CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC));
+
+ // Only handle a single return value.
+ if (RVLocs.size() != 1)
+ return false;
+
+ // Copy all of the result registers out of their specified physreg.
+ MVT CopyVT = RVLocs[0].getValVT();
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY),
+ ResultReg).addReg(RVLocs[0].getLocReg());
+ UsedRegs.push_back(RVLocs[0].getLocReg());
+
+ // Finally update the result.
+ UpdateValueMap(I, ResultReg);
+ }
+
+ return true;
+}
+
+bool ARM64FastISel::SelectCall(const Instruction *I,
+ const char *IntrMemName = 0) {
+ const CallInst *CI = cast<CallInst>(I);
+ const Value *Callee = CI->getCalledValue();
+
+ // Don't handle inline asm or intrinsics.
+ if (isa<InlineAsm>(Callee))
+ return false;
+
+ // Only handle global variable Callees.
+ const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+ if (!GV)
+ return false;
+
+ // Check the calling convention.
+ ImmutableCallSite CS(CI);
+ CallingConv::ID CC = CS.getCallingConv();
+
+ // Let SDISel handle vararg functions.
+ PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+ FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ if (FTy->isVarArg())
+ return false;
+
+ // Handle *simple* calls for now.
+ MVT RetVT;
+ Type *RetTy = I->getType();
+ if (RetTy->isVoidTy())
+ RetVT = MVT::isVoid;
+ else if (!isTypeLegal(RetTy, RetVT))
+ return false;
+
+ // Set up the argument vectors.
+ SmallVector<Value *, 8> Args;
+ SmallVector<unsigned, 8> ArgRegs;
+ SmallVector<MVT, 8> ArgVTs;
+ SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+ Args.reserve(CS.arg_size());
+ ArgRegs.reserve(CS.arg_size());
+ ArgVTs.reserve(CS.arg_size());
+ ArgFlags.reserve(CS.arg_size());
+
+ for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ i != e; ++i) {
+ // If we're lowering a memory intrinsic instead of a regular call, skip the
+ // last two arguments, which shouldn't be passed to the underlying function.
+ if (IntrMemName && e - i <= 2)
+ break;
+
+ unsigned Arg = getRegForValue(*i);
+ if (Arg == 0)
+ return false;
+
+ ISD::ArgFlagsTy Flags;
+ unsigned AttrInd = i - CS.arg_begin() + 1;
+ if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+ Flags.setSExt();
+ if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+ Flags.setZExt();
+
+ // FIXME: Only handle *easy* calls for now.
+ if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
+ CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+ CS.paramHasAttr(AttrInd, Attribute::Nest) ||
+ CS.paramHasAttr(AttrInd, Attribute::ByVal))
+ return false;
+
+ MVT ArgVT;
+ Type *ArgTy = (*i)->getType();
+ if (!isTypeLegal(ArgTy, ArgVT) &&
+ !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16))
+ return false;
+
+ // We don't handle vector parameters yet.
+ if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+ return false;
+
+ unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+ Flags.setOrigAlign(OriginalAlignment);
+
+ Args.push_back(*i);
+ ArgRegs.push_back(Arg);
+ ArgVTs.push_back(ArgVT);
+ ArgFlags.push_back(Flags);
+ }
+
+ // Handle the arguments now that we've gotten them.
+ SmallVector<unsigned, 4> RegArgs;
+ unsigned NumBytes;
+ if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
+ return false;
+
+ // Issue the call.
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BL));
+ if (!IntrMemName)
+ MIB.addGlobalAddress(GV, 0, 0);
+ else
+ MIB.addExternalSymbol(IntrMemName, 0);
+
+ // Add implicit physical register uses to the call.
+ for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+ MIB.addReg(RegArgs[i], RegState::Implicit);
+
+ // Add a register mask with the call-preserved registers.
+ // Proper defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+
+ // Finish off the call including any return values.
+ SmallVector<unsigned, 4> UsedRegs;
+ if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes))
+ return false;
+
+ // Set all unused physreg defs as dead.
+ static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+ return true;
+}
+
+bool ARM64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) {
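+  // A copy is considered small when it can be done in at most four
+  // alignment-sized chunks, e.g. a 16-byte copy with 8-byte alignment
+  // (16 / 8 == 2 chunks), or when it is shorter than 32 bytes if the
+  // alignment is unknown.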
+ if (Alignment)
+ return Len / Alignment <= 4;
+ else
+ return Len < 32;
+}
+
+bool ARM64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+ unsigned Alignment) {
+ // Make sure we don't bloat code by inlining very large memcpy's.
+ if (!IsMemCpySmall(Len, Alignment))
+ return false;
+
+ int64_t UnscaledOffset = 0;
+ Address OrigDest = Dest;
+ Address OrigSrc = Src;
+
+ while (Len) {
+ MVT VT;
+ if (!Alignment || Alignment >= 8) {
+ if (Len >= 8)
+ VT = MVT::i64;
+ else if (Len >= 4)
+ VT = MVT::i32;
+ else if (Len >= 2)
+ VT = MVT::i16;
+ else {
+ VT = MVT::i8;
+ }
+ } else {
+ // Bound based on alignment.
+ if (Len >= 4 && Alignment == 4)
+ VT = MVT::i32;
+ else if (Len >= 2 && Alignment == 2)
+ VT = MVT::i16;
+ else {
+ VT = MVT::i8;
+ }
+ }
+
+ bool RV;
+ unsigned ResultReg;
+ RV = EmitLoad(VT, ResultReg, Src);
+ assert(RV == true && "Should be able to handle this load.");
+ RV = EmitStore(VT, ResultReg, Dest);
+ assert(RV == true && "Should be able to handle this store.");
+ (void)RV;
+
+ int64_t Size = VT.getSizeInBits() / 8;
+ Len -= Size;
+ UnscaledOffset += Size;
+
+ // We need to recompute the unscaled offset for each iteration.
+ Dest.setOffset(OrigDest.getOffset() + UnscaledOffset);
+ Src.setOffset(OrigSrc.getOffset() + UnscaledOffset);
+ }
+
+ return true;
+}
+
+bool ARM64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
+ // FIXME: Handle more intrinsics.
+ switch (I.getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove: {
+ const MemTransferInst &MTI = cast<MemTransferInst>(I);
+ // Don't handle volatile.
+ if (MTI.isVolatile())
+ return false;
+
+ // Disable inlining for memmove before calls to ComputeAddress. Otherwise,
+ // we would emit dead code because we don't currently handle memmoves.
+ bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
+ if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+ // Small memcpy's are common enough that we want to do them without a call
+ // if possible.
+ uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
+ unsigned Alignment = MTI.getAlignment();
+ if (IsMemCpySmall(Len, Alignment)) {
+ Address Dest, Src;
+ if (!ComputeAddress(MTI.getRawDest(), Dest) ||
+ !ComputeAddress(MTI.getRawSource(), Src))
+ return false;
+ if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+ return true;
+ }
+ }
+
+ if (!MTI.getLength()->getType()->isIntegerTy(64))
+ return false;
+
+ if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
+ return SelectCall(&I, IntrMemName);
+ }
+ case Intrinsic::memset: {
+ const MemSetInst &MSI = cast<MemSetInst>(I);
+ // Don't handle volatile.
+ if (MSI.isVolatile())
+ return false;
+
+ if (!MSI.getLength()->getType()->isIntegerTy(64))
+ return false;
+
+ if (MSI.getDestAddressSpace() > 255)
+ // Fast instruction selection doesn't support the special
+ // address spaces.
+ return false;
+
+ return SelectCall(&I, "memset");
+ }
+ case Intrinsic::trap: {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BRK))
+ .addImm(1);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool ARM64FastISel::SelectRet(const Instruction *I) {
+ const ReturnInst *Ret = cast<ReturnInst>(I);
+ const Function &F = *I->getParent()->getParent();
+
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ if (F.isVarArg())
+ return false;
+
+ // Build a list of return value registers.
+ SmallVector<unsigned, 4> RetRegs;
+
+ if (Ret->getNumOperands() > 0) {
+ CallingConv::ID CC = F.getCallingConv();
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ValLocs;
+ CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
+ I->getContext());
+ CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS
+ : RetCC_ARM64_AAPCS;
+ CCInfo.AnalyzeReturn(Outs, RetCC);
+
+ // Only handle a single return value for now.
+ if (ValLocs.size() != 1)
+ return false;
+
+ CCValAssign &VA = ValLocs[0];
+ const Value *RV = Ret->getOperand(0);
+
+ // Don't bother handling odd stuff for now.
+ if (VA.getLocInfo() != CCValAssign::Full)
+ return false;
+ // Only handle register returns for now.
+ if (!VA.isRegLoc())
+ return false;
+ unsigned Reg = getRegForValue(RV);
+ if (Reg == 0)
+ return false;
+
+ unsigned SrcReg = Reg + VA.getValNo();
+ unsigned DestReg = VA.getLocReg();
+ // Avoid a cross-class copy. This is very unlikely.
+ if (!MRI.getRegClass(SrcReg)->contains(DestReg))
+ return false;
+
+ EVT RVEVT = TLI.getValueType(RV->getType());
+ if (!RVEVT.isSimple())
+ return false;
+ MVT RVVT = RVEVT.getSimpleVT();
+ MVT DestVT = VA.getValVT();
+ // Special handling for extended integers.
+ if (RVVT != DestVT) {
+ if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16)
+ return false;
+
+ if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+ return false;
+
+ bool isZExt = Outs[0].Flags.isZExt();
+ SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt);
+ if (SrcReg == 0)
+ return false;
+ }
+
+ // Make the copy.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg);
+
+ // Add register to return instruction.
+ RetRegs.push_back(VA.getLocReg());
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(ARM64::RET_ReallyLR));
+ for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+ MIB.addReg(RetRegs[i], RegState::Implicit);
+ return true;
+}
+
+bool ARM64FastISel::SelectTrunc(const Instruction *I) {
+ Type *DestTy = I->getType();
+ Value *Op = I->getOperand(0);
+ Type *SrcTy = Op->getType();
+
+ EVT SrcEVT = TLI.getValueType(SrcTy, true);
+ EVT DestEVT = TLI.getValueType(DestTy, true);
+ if (!SrcEVT.isSimple())
+ return false;
+ if (!DestEVT.isSimple())
+ return false;
+
+ MVT SrcVT = SrcEVT.getSimpleVT();
+ MVT DestVT = DestEVT.getSimpleVT();
+
+ if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
+ SrcVT != MVT::i8)
+ return false;
+ if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8 &&
+ DestVT != MVT::i1)
+ return false;
+
+ unsigned SrcReg = getRegForValue(Op);
+ if (!SrcReg)
+ return false;
+
+ // If we're truncating from i64 to a smaller non-legal type then generate an
+ // AND. Otherwise, we know the high bits are undefined and a truncate doesn't
+ // generate any code.
+ if (SrcVT == MVT::i64) {
+ uint64_t Mask = 0;
+ switch (DestVT.SimpleTy) {
+ default:
+ // Trunc i64 to i32 is handled by the target-independent fast-isel.
+ return false;
+ case MVT::i1:
+ Mask = 0x1;
+ break;
+ case MVT::i8:
+ Mask = 0xff;
+ break;
+ case MVT::i16:
+ Mask = 0xffff;
+ break;
+ }
+ // Issue an extract_subreg to get the lower 32-bits.
+ unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true,
+ ARM64::sub_32);
+ // Create the AND instruction which performs the actual truncation.
+ unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
+ ANDReg)
+ .addReg(Reg32)
+ .addImm(ARM64_AM::encodeLogicalImmediate(Mask, 32));
+ SrcReg = ANDReg;
+ }
+
+ UpdateValueMap(I, SrcReg);
+ return true;
+}
+
+unsigned ARM64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
+ assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 ||
+ DestVT == MVT::i64) &&
+ "Unexpected value type.");
+ // Handle i8 and i16 as i32.
+ if (DestVT == MVT::i8 || DestVT == MVT::i16)
+ DestVT = MVT::i32;
+
+ if (isZExt) {
+ unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
+ ResultReg)
+ .addReg(SrcReg)
+ .addImm(ARM64_AM::encodeLogicalImmediate(1, 32));
+
+ if (DestVT == MVT::i64) {
+      // We're zero-extending an i1 to i64. The ANDWri Wd, Ws, #1 implicitly
+      // clears the upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd.
+ unsigned Reg64 = MRI.createVirtualRegister(&ARM64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(ARM64::SUBREG_TO_REG), Reg64)
+ .addImm(0)
+ .addReg(ResultReg)
+ .addImm(ARM64::sub_32);
+ ResultReg = Reg64;
+ }
+ return ResultReg;
+ } else {
+ if (DestVT == MVT::i64) {
+      // FIXME: Handle sign-extending an i1 to i64.
+ return 0;
+ }
+ unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SBFMWri),
+ ResultReg)
+ .addReg(SrcReg)
+ .addImm(0)
+ .addImm(0);
+ return ResultReg;
+ }
+}
+
+unsigned ARM64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+ bool isZExt) {
+ assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?");
+ unsigned Opc;
+ unsigned Imm = 0;
+
+ switch (SrcVT.SimpleTy) {
+ default:
+ return 0;
+ case MVT::i1:
+ return Emiti1Ext(SrcReg, DestVT, isZExt);
+ case MVT::i8:
+ if (DestVT == MVT::i64)
+ Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri;
+ else
+ Opc = isZExt ? ARM64::UBFMWri : ARM64::SBFMWri;
+ Imm = 7;
+ break;
+ case MVT::i16:
+ if (DestVT == MVT::i64)
+ Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri;
+ else
+ Opc = isZExt ? ARM64::UBFMWri : ARM64::SBFMWri;
+ Imm = 15;
+ break;
+ case MVT::i32:
+ assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?");
+ Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri;
+ Imm = 31;
+ break;
+ }
+
+ // Handle i8 and i16 as i32.
+ if (DestVT == MVT::i8 || DestVT == MVT::i16)
+ DestVT = MVT::i32;
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(SrcReg)
+ .addImm(0)
+ .addImm(Imm);
+
+ return ResultReg;
+}
+
+bool ARM64FastISel::SelectIntExt(const Instruction *I) {
+  // On ARM64, in general, integer casts don't involve legal types; this code
+ // handles promotable integers. The high bits for a type smaller than
+ // the register size are assumed to be undefined.
+ Type *DestTy = I->getType();
+ Value *Src = I->getOperand(0);
+ Type *SrcTy = Src->getType();
+
+ bool isZExt = isa<ZExtInst>(I);
+ unsigned SrcReg = getRegForValue(Src);
+ if (!SrcReg)
+ return false;
+
+ EVT SrcEVT = TLI.getValueType(SrcTy, true);
+ EVT DestEVT = TLI.getValueType(DestTy, true);
+ if (!SrcEVT.isSimple())
+ return false;
+ if (!DestEVT.isSimple())
+ return false;
+
+ MVT SrcVT = SrcEVT.getSimpleVT();
+ MVT DestVT = DestEVT.getSimpleVT();
+ unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt);
+ if (ResultReg == 0)
+ return false;
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool ARM64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
+ EVT DestEVT = TLI.getValueType(I->getType(), true);
+ if (!DestEVT.isSimple())
+ return false;
+
+ MVT DestVT = DestEVT.getSimpleVT();
+ if (DestVT != MVT::i64 && DestVT != MVT::i32)
+ return false;
+
+ unsigned DivOpc;
+ bool is64bit = (DestVT == MVT::i64);
+ switch (ISDOpcode) {
+ default:
+ return false;
+ case ISD::SREM:
+ DivOpc = is64bit ? ARM64::SDIVXr : ARM64::SDIVWr;
+ break;
+ case ISD::UREM:
+ DivOpc = is64bit ? ARM64::UDIVXr : ARM64::UDIVWr;
+ break;
+ }
+ unsigned MSubOpc = is64bit ? ARM64::MSUBXrrr : ARM64::MSUBWrrr;
+ unsigned Src0Reg = getRegForValue(I->getOperand(0));
+ if (!Src0Reg)
+ return false;
+
+ unsigned Src1Reg = getRegForValue(I->getOperand(1));
+ if (!Src1Reg)
+ return false;
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), ResultReg)
+ .addReg(Src0Reg)
+ .addReg(Src1Reg);
+  // The remainder is computed as numerator - (quotient * denominator) using the
+ // MSUB instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg)
+ .addReg(ResultReg)
+ .addReg(Src1Reg)
+ .addReg(Src0Reg);
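+  // For example, for srem 7, 3 the sdiv produces 2 and the msub yields
+  // 7 - (2 * 3) = 1.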
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool ARM64FastISel::SelectMul(const Instruction *I) {
+ EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true);
+ if (!SrcEVT.isSimple())
+ return false;
+ MVT SrcVT = SrcEVT.getSimpleVT();
+
+  // Must be a simple value type. Don't handle vectors.
+ if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
+ SrcVT != MVT::i8)
+ return false;
+
+ unsigned Opc;
+ unsigned ZReg;
+ switch (SrcVT.SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ ZReg = ARM64::WZR;
+ Opc = ARM64::MADDWrrr;
+ break;
+ case MVT::i64:
+ ZReg = ARM64::XZR;
+ Opc = ARM64::MADDXrrr;
+ break;
+ }
+
+ unsigned Src0Reg = getRegForValue(I->getOperand(0));
+ if (!Src0Reg)
+ return false;
+
+ unsigned Src1Reg = getRegForValue(I->getOperand(1));
+ if (!Src1Reg)
+ return false;
+
+ // Create the base instruction, then add the operands.
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(Src0Reg)
+ .addReg(Src1Reg)
+ .addReg(ZReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool ARM64FastISel::TargetSelectInstruction(const Instruction *I) {
+ switch (I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Load:
+ return SelectLoad(I);
+ case Instruction::Store:
+ return SelectStore(I);
+ case Instruction::Br:
+ return SelectBranch(I);
+ case Instruction::IndirectBr:
+ return SelectIndirectBr(I);
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ return SelectCmp(I);
+ case Instruction::Select:
+ return SelectSelect(I);
+ case Instruction::FPExt:
+ return SelectFPExt(I);
+ case Instruction::FPTrunc:
+ return SelectFPTrunc(I);
+ case Instruction::FPToSI:
+ return SelectFPToInt(I, /*Signed=*/true);
+ case Instruction::FPToUI:
+ return SelectFPToInt(I, /*Signed=*/false);
+ case Instruction::SIToFP:
+ return SelectIntToFP(I, /*Signed=*/true);
+ case Instruction::UIToFP:
+ return SelectIntToFP(I, /*Signed=*/false);
+ case Instruction::SRem:
+ return SelectRem(I, ISD::SREM);
+ case Instruction::URem:
+ return SelectRem(I, ISD::UREM);
+ case Instruction::Call:
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ return SelectIntrinsicCall(*II);
+ return SelectCall(I);
+ case Instruction::Ret:
+ return SelectRet(I);
+ case Instruction::Trunc:
+ return SelectTrunc(I);
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ return SelectIntExt(I);
+ case Instruction::Mul:
+ // FIXME: This really should be handled by the target-independent selector.
+ return SelectMul(I);
+ }
+ return false;
+ // Silence warnings.
+ (void)&CC_ARM64_DarwinPCS_VarArg;
+}
+
+namespace llvm {
+llvm::FastISel *ARM64::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) {
+ return new ARM64FastISel(funcInfo, libInfo);
+}
+}
diff --git a/lib/Target/ARM64/ARM64FrameLowering.cpp b/lib/Target/ARM64/ARM64FrameLowering.cpp
new file mode 100644
index 0000000..798986c
--- /dev/null
+++ b/lib/Target/ARM64/ARM64FrameLowering.cpp
@@ -0,0 +1,816 @@
+//===- ARM64FrameLowering.cpp - ARM64 Frame Lowering -----------*- C++ -*-====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM64 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "frame-info"
+#include "ARM64FrameLowering.h"
+#include "ARM64InstrInfo.h"
+#include "ARM64MachineFunctionInfo.h"
+#include "ARM64Subtarget.h"
+#include "ARM64TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableRedZone("arm64-redzone",
+ cl::desc("enable use of redzone on ARM64"),
+ cl::init(false), cl::Hidden);
+
+STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+
+static unsigned estimateStackSize(MachineFunction &MF) {
+ const MachineFrameInfo *FFI = MF.getFrameInfo();
+ int Offset = 0;
+ for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
+ int FixedOff = -FFI->getObjectOffset(i);
+ if (FixedOff > Offset)
+ Offset = FixedOff;
+ }
+ for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
+ if (FFI->isDeadObjectIndex(i))
+ continue;
+ Offset += FFI->getObjectSize(i);
+ unsigned Align = FFI->getObjectAlignment(i);
+ // Adjust to alignment boundary
+ Offset = (Offset + Align - 1) / Align * Align;
+ }
+ // This does not include the 16 bytes used for fp and lr.
+ return (unsigned)Offset;
+}
+
+bool ARM64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
+ if (!EnableRedZone)
+ return false;
+ // Don't use the red zone if the function explicitly asks us not to.
+ // This is typically used for kernel code.
+ if (MF.getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::NoRedZone))
+ return false;
+
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
+ unsigned NumBytes = AFI->getLocalStackSize();
+
+ // Note: currently hasFP() is always true for hasCalls(), but that's an
+ // implementation detail of the current code, not a strict requirement,
+ // so stay safe here and check both.
+ if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128)
+ return false;
+ return true;
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.
+bool ARM64FrameLowering::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+#ifndef NDEBUG
+ const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+ assert(!RegInfo->needsStackRealignment(MF) &&
+ "No stack realignment on ARM64!");
+#endif
+
+ return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
+ MFI->isFrameAddressTaken());
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function. This eliminates the need for
+/// add/sub sp brackets around call sites. Returns true if the call frame is
+/// included as part of the stack frame.
+bool ARM64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+void ARM64FrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const ARM64InstrInfo *TII =
+ static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
+ if (!TFI->hasReservedCallFrame(MF)) {
+ // If we have alloca, convert as follows:
+    // ADJCALLSTACKDOWN -> sub sp, sp, amount
+    // ADJCALLSTACKUP -> add sp, sp, amount
+ MachineInstr *Old = I;
+ DebugLoc DL = Old->getDebugLoc();
+ unsigned Amount = Old->getOperand(0).getImm();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned Align = TFI->getStackAlignment();
+ Amount = (Amount + Align - 1) / Align * Align;
+
+ // Replace the pseudo instruction with a new instruction...
+ unsigned Opc = Old->getOpcode();
+ if (Opc == ARM64::ADJCALLSTACKDOWN) {
+ emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -Amount, TII);
+ } else {
+ assert(Opc == ARM64::ADJCALLSTACKUP && "expected ADJCALLSTACKUP");
+ emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
+ }
+ }
+ }
+ MBB.erase(I);
+}
+
+void
+ARM64FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned FramePtr) const {
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+ const ARM64InstrInfo *TII = TM.getInstrInfo();
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ if (CSI.empty())
+ return;
+
+ const DataLayout *TD = MF.getTarget().getDataLayout();
+ bool HasFP = hasFP(MF);
+
+  // Calculate the number of bytes used for storing the return address.
+ int stackGrowth = -TD->getPointerSize(0);
+
+ // Calculate offsets.
+ int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth;
+ unsigned TotalSkipped = 0;
+ for (const auto &Info : CSI) {
+ unsigned Reg = Info.getReg();
+ int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) -
+ getOffsetOfLocalArea() + saveAreaOffset;
+
+ // Don't output a new CFI directive if we're re-saving the frame pointer or
+ // link register. This happens when the PrologEpilogInserter has inserted an
+ // extra "STP" of the frame pointer and link register -- the "emitPrologue"
+ // method automatically generates the directives when frame pointers are
+ // used. If we generate CFI directives for the extra "STP"s, the linker will
+ // lose track of the correct values for the frame pointer and link register.
+ if (HasFP && (FramePtr == Reg || Reg == ARM64::LR)) {
+ TotalSkipped += stackGrowth;
+ continue;
+ }
+
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, DwarfReg, Offset - TotalSkipped));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+}
+
+void ARM64FrameLowering::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const Function *Fn = MF.getFunction();
+ const ARM64RegisterInfo *RegInfo = TM.getRegisterInfo();
+ const ARM64InstrInfo *TII = TM.getInstrInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
+ bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
+ bool HasFP = hasFP(MF);
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+ int NumBytes = (int)MFI->getStackSize();
+ if (!AFI->hasStackFrame()) {
+ assert(!HasFP && "unexpected function without stack frame but with FP");
+
+ // All of the stack allocation is for locals.
+ AFI->setLocalStackSize(NumBytes);
+
+ // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+ MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
+
+ // REDZONE: If the stack size is less than 128 bytes, we don't need
+ // to actually allocate.
+ if (NumBytes && !canUseRedZone(MF)) {
+ emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
+
+ // Encode the stack size of the leaf function.
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ } else if (NumBytes) {
+ ++NumRedZoneFunctions;
+ }
+
+ return;
+ }
+
+ // Only set up FP if we actually need to.
+ int FPOffset = 0;
+ if (HasFP) {
+    // The first instruction must a) allocate the stack and b) have an
+    // immediate that is negative and a multiple of 2.
+ assert((MBBI->getOpcode() == ARM64::STPXpre ||
+ MBBI->getOpcode() == ARM64::STPDpre) &&
+ MBBI->getOperand(2).getReg() == ARM64::SP &&
+ MBBI->getOperand(3).getImm() < 0 &&
+ (MBBI->getOperand(3).getImm() & 1) == 0);
+
+ // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space
+ // required for the callee saved register area we get the frame pointer
+    // by adding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8.
+ FPOffset = -(MBBI->getOperand(3).getImm() + 2) * 8;
+ assert(FPOffset >= 0 && "Bad Framepointer Offset");
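+    // For example, a prologue that starts with "stp x28, x27, [sp, #-32]!" has
+    // getImm() == -4, so FPOffset = -(-4 + 2) * 8 = 16, i.e. the fp/lr pair
+    // sits 16 bytes above the adjusted sp.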
+ }
+
+ // Move past the saves of the callee-saved registers.
+ while (MBBI->getOpcode() == ARM64::STPXi ||
+ MBBI->getOpcode() == ARM64::STPDi ||
+ MBBI->getOpcode() == ARM64::STPXpre ||
+ MBBI->getOpcode() == ARM64::STPDpre) {
+ ++MBBI;
+ NumBytes -= 16;
+ }
+ assert(NumBytes >= 0 && "Negative stack allocation size!?");
+ if (HasFP) {
+ // Issue sub fp, sp, FPOffset or
+ // mov fp,sp when FPOffset is zero.
+ // Note: All stores of callee-saved registers are marked as "FrameSetup".
+ // This code marks the instruction(s) that set the FP also.
+ emitFrameOffset(MBB, MBBI, DL, ARM64::FP, ARM64::SP, FPOffset, TII,
+ MachineInstr::FrameSetup);
+ }
+
+ // All of the remaining stack allocations are for locals.
+ AFI->setLocalStackSize(NumBytes);
+
+ // Allocate space for the rest of the frame.
+ if (NumBytes) {
+ // If we're a leaf function, try using the red zone.
+ if (!canUseRedZone(MF))
+ emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII,
+ MachineInstr::FrameSetup);
+ }
+
+ // If we need a base pointer, set it up here. It's whatever the value of the
+ // stack pointer is at this point. Any variable size objects will be allocated
+ // after this, so we can still use the base pointer to reference locals.
+ //
+ // FIXME: Clarify FrameSetup flags here.
+ // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
+ // needed.
+ //
+ if (RegInfo->hasBasePointer(MF))
+ TII->copyPhysReg(MBB, MBBI, DL, ARM64::X19, ARM64::SP, false);
+
+ if (needsFrameMoves) {
+ const DataLayout *TD = MF.getTarget().getDataLayout();
+ const int StackGrowth = -TD->getPointerSize(0);
+ unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+ // An example of the prologue:
+ //
+ // .globl __foo
+ // .align 2
+ // __foo:
+ // Ltmp0:
+ // .cfi_startproc
+ // .cfi_personality 155, ___gxx_personality_v0
+ // Leh_func_begin:
+ // .cfi_lsda 16, Lexception33
+ //
+ // stp xa,bx, [sp, -#offset]!
+ // ...
+ // stp x28, x27, [sp, #offset-32]
+ // stp fp, lr, [sp, #offset-16]
+ // add fp, sp, #offset - 16
+ // sub sp, sp, #1360
+ //
+ // The Stack:
+ // +-------------------------------------------+
+ // 10000 | ........ | ........ | ........ | ........ |
+ // 10004 | ........ | ........ | ........ | ........ |
+ // +-------------------------------------------+
+ // 10008 | ........ | ........ | ........ | ........ |
+ // 1000c | ........ | ........ | ........ | ........ |
+ // +===========================================+
+ // 10010 | X28 Register |
+ // 10014 | X28 Register |
+ // +-------------------------------------------+
+ // 10018 | X27 Register |
+ // 1001c | X27 Register |
+ // +===========================================+
+ // 10020 | Frame Pointer |
+ // 10024 | Frame Pointer |
+ // +-------------------------------------------+
+ // 10028 | Link Register |
+ // 1002c | Link Register |
+ // +===========================================+
+ // 10030 | ........ | ........ | ........ | ........ |
+ // 10034 | ........ | ........ | ........ | ........ |
+ // +-------------------------------------------+
+ // 10038 | ........ | ........ | ........ | ........ |
+ // 1003c | ........ | ........ | ........ | ........ |
+ // +-------------------------------------------+
+ //
+ // [sp] = 10030 :: >>initial value<<
+ // sp = 10020 :: stp fp, lr, [sp, #-16]!
+ // fp = sp == 10020 :: mov fp, sp
+ // [sp] == 10020 :: stp x28, x27, [sp, #-16]!
+ // sp == 10010 :: >>final value<<
+ //
+ // The frame pointer (w29) points to address 10020. If we use an offset of
+ // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
+ // for w27, and -32 for w28:
+ //
+ // Ltmp1:
+ // .cfi_def_cfa w29, 16
+ // Ltmp2:
+ // .cfi_offset w30, -8
+ // Ltmp3:
+ // .cfi_offset w29, -16
+ // Ltmp4:
+ // .cfi_offset w27, -24
+ // Ltmp5:
+ // .cfi_offset w28, -32
+
+ if (HasFP) {
+ // Define the current CFA rule to use the provided FP.
+ unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Record the location of the stored LR
+ unsigned LR = RegInfo->getDwarfRegNum(ARM64::LR, true);
+ CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Record the location of the stored FP
+ CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ } else {
+ // Encode the stack size of the leaf function.
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize()));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+
+ // Now emit the moves for whatever callee saved regs we have.
+ emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr);
+ }
+}
+
+static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) {
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ if (Reg == CSRegs[i])
+ return true;
+ return false;
+}
+
+static bool isCSRestore(MachineInstr *MI, const uint16_t *CSRegs) {
+ if (MI->getOpcode() == ARM64::LDPXpost ||
+ MI->getOpcode() == ARM64::LDPDpost || MI->getOpcode() == ARM64::LDPXi ||
+ MI->getOpcode() == ARM64::LDPDi) {
+ if (!isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) ||
+ !isCalleeSavedRegister(MI->getOperand(1).getReg(), CSRegs) ||
+ MI->getOperand(2).getReg() != ARM64::SP)
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+void ARM64FrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const ARM64InstrInfo *TII =
+ static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
+ const ARM64RegisterInfo *RegInfo =
+ static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ int NumBytes = MFI->getStackSize();
+ unsigned NumRestores = 0;
+ // Move past the restores of the callee-saved registers.
+ MachineBasicBlock::iterator LastPopI = MBBI;
+ const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+ if (LastPopI != MBB.begin()) {
+ do {
+ ++NumRestores;
+ --LastPopI;
+ } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs));
+ if (!isCSRestore(LastPopI, CSRegs)) {
+ ++LastPopI;
+ --NumRestores;
+ }
+ }
+ NumBytes -= NumRestores * 16;
+ assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+ if (!hasFP(MF)) {
+ // If this was a redzone leaf function, we don't need to restore the
+ // stack pointer.
+ if (!canUseRedZone(MF))
+ emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::SP, NumBytes, TII);
+ return;
+ }
+
+ // Restore the original stack pointer.
+ // FIXME: Rather than doing the math here, we should instead just use
+ // non-post-indexed loads for the restores if we aren't actually going to
+ // be able to save any instructions.
+ if (NumBytes || MFI->hasVarSizedObjects())
+ emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::FP,
+ -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags);
+}
+
+/// getFrameIndexOffset - Returns the displacement from the frame register to
+/// the stack frame of the specified index.
+int ARM64FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+ int FI) const {
+ unsigned FrameReg;
+ return getFrameIndexReference(MF, FI, FrameReg);
+}
+
+/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
+/// debug info. It's the same as what we use for resolving the code-gen
+/// references for now. FIXME: This can go wrong when references are
+/// SP-relative and simple call frames aren't used.
+int ARM64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
+ return resolveFrameIndexReference(MF, FI, FrameReg);
+}
+
+int ARM64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
+ int FI, unsigned &FrameReg,
+ bool PreferFP) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const ARM64RegisterInfo *RegInfo =
+ static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
+ const ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
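+  // Object offsets are relative to the SP at function entry; the FP is set
+  // 16 bytes below that (it points at the saved FP/LR pair), so the
+  // FP-relative offset is the object offset plus 16. See the stack layout
+  // diagram in emitPrologue above.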
+ int FPOffset = MFI->getObjectOffset(FI) + 16;
+ int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
+ bool isFixed = MFI->isFixedObjectIndex(FI);
+
+ // Use frame pointer to reference fixed objects. Use it for locals if
+ // there are VLAs (and thus the SP isn't reliable as a base).
+ // Make sure useFPForScavengingIndex() does the right thing for the emergency
+ // spill slot.
+ bool UseFP = false;
+ if (AFI->hasStackFrame()) {
+ // Note: Keeping the following as multiple 'if' statements rather than
+ // merging to a single expression for readability.
+ //
+ // Argument access should always use the FP.
+ if (isFixed) {
+ UseFP = hasFP(MF);
+ } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) {
+ // Use SP or FP, whichever gives us the best chance of the offset
+ // being in range for direct access. If the FPOffset is positive,
+ // that'll always be best, as the SP will be even further away.
+ // If the FPOffset is negative, we have to keep in mind that the
+ // available offset range for negative offsets is smaller than for
+ // positive ones. If we have variable sized objects, we're stuck with
+ // using the FP regardless, though, as the SP offset is unknown
+ // and we don't have a base pointer available. If an offset is
+ // available via the FP and the SP, use whichever is closest.
+ if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 ||
+ (FPOffset >= -256 && Offset > -FPOffset))
+ UseFP = true;
+ }
+ }
+
+ if (UseFP) {
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return FPOffset;
+ }
+
+ // Use the base pointer if we have one.
+ if (RegInfo->hasBasePointer(MF))
+ FrameReg = RegInfo->getBaseRegister();
+ else {
+ FrameReg = ARM64::SP;
+ // If we're using the red zone for this function, the SP won't actually
+ // be adjusted, so the offsets will be negative. They're also all
+ // within range of the signed 9-bit immediate instructions.
+ if (canUseRedZone(MF))
+ Offset -= AFI->getLocalStackSize();
+ }
+
+ return Offset;
+}
+
+static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
+ if (Reg != ARM64::LR)
+ return getKillRegState(true);
+
+  // LR may be referred to later by an @llvm.returnaddress intrinsic.
+ bool LRLiveIn = MF.getRegInfo().isLiveIn(ARM64::LR);
+ bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken());
+ return getKillRegState(LRKill);
+}
+
+bool ARM64FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ unsigned Count = CSI.size();
+ DebugLoc DL;
+ assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+
+ if (MI != MBB.end())
+ DL = MI->getDebugLoc();
+
+ for (unsigned i = 0; i < Count; i += 2) {
+ unsigned idx = Count - i - 2;
+ unsigned Reg1 = CSI[idx].getReg();
+ unsigned Reg2 = CSI[idx + 1].getReg();
+ // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
+ // list to come in sorted by frame index so that we can issue the store
+ // pair instructions directly. Assert if we see anything otherwise.
+ //
+ // The order of the registers in the list is controlled by
+ // getCalleeSavedRegs(), so they will always be in-order, as well.
+ assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() &&
+ "Out of order callee saved regs!");
+ unsigned StrOpc;
+ assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+ assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
+    // Issue a sequence of non-SP-incrementing stores plus one pre-indexed SP
+    // store for the CS regs. The first spill is the pre-increment that
+    // allocates the stack.
+ // For example:
+ // stp x22, x21, [sp, #-48]! // addImm(-6)
+ // stp x20, x19, [sp, #16] // addImm(+2)
+ // stp fp, lr, [sp, #32] // addImm(+4)
+ // Rationale: This sequence saves uop updates compared to a sequence of
+ // pre-increment spills like stp xi,xj,[sp,#-16]!
+    // Note: Similar rationale and sequence for the restores in the epilogue.
+ if (ARM64::GPR64RegClass.contains(Reg1)) {
+ assert(ARM64::GPR64RegClass.contains(Reg2) &&
+ "Expected GPR64 callee-saved register pair!");
+ // For first spill use pre-increment store.
+ if (i == 0)
+ StrOpc = ARM64::STPXpre;
+ else
+ StrOpc = ARM64::STPXi;
+ } else if (ARM64::FPR64RegClass.contains(Reg1)) {
+ assert(ARM64::FPR64RegClass.contains(Reg2) &&
+ "Expected FPR64 callee-saved register pair!");
+ // For first spill use pre-increment store.
+ if (i == 0)
+ StrOpc = ARM64::STPDpre;
+ else
+ StrOpc = ARM64::STPDi;
+ } else
+ llvm_unreachable("Unexpected callee saved register!");
+ DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", "
+ << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx()
+ << ", " << CSI[idx + 1].getFrameIdx() << ")\n");
+ // Compute offset: i = 0 => offset = -Count;
+ // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc.
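+    // E.g. for the sequence above with Count = 6: i = 0 gives Offset = -6,
+    // which the pre-indexed STP encodes as [sp, #-48]!; i = 2 gives
+    // Offset = 2, i.e. [sp, #16].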
+ const int Offset = (i == 0) ? -Count : i;
+ assert((Offset >= -64 && Offset <= 63) &&
+ "Offset out of bounds for STP immediate");
+ BuildMI(MBB, MI, DL, TII.get(StrOpc))
+ .addReg(Reg2, getPrologueDeath(MF, Reg2))
+ .addReg(Reg1, getPrologueDeath(MF, Reg1))
+ .addReg(ARM64::SP)
+ .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ return true;
+}
+
+bool ARM64FrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ unsigned Count = CSI.size();
+ DebugLoc DL;
+ assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+
+ if (MI != MBB.end())
+ DL = MI->getDebugLoc();
+
+ for (unsigned i = 0; i < Count; i += 2) {
+ unsigned Reg1 = CSI[i].getReg();
+ unsigned Reg2 = CSI[i + 1].getReg();
+ // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
+    // list to come in sorted by frame index so that we can issue the load
+    // pair instructions directly. Assert if we see anything otherwise.
+ assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() &&
+ "Out of order callee saved regs!");
+    // Issue a sequence of non-SP-incrementing loads plus one post-indexed SP
+    // restore for the CS regs. Only the last load is SP post-increment and
+    // deallocates the stack:
+ // For example:
+ // ldp fp, lr, [sp, #32] // addImm(+4)
+ // ldp x20, x19, [sp, #16] // addImm(+2)
+ // ldp x22, x21, [sp], #48 // addImm(+6)
+ // Note: see comment in spillCalleeSavedRegisters()
+ unsigned LdrOpc;
+
+ assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+ assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
+ if (ARM64::GPR64RegClass.contains(Reg1)) {
+ assert(ARM64::GPR64RegClass.contains(Reg2) &&
+ "Expected GPR64 callee-saved register pair!");
+ if (i == Count - 2)
+ LdrOpc = ARM64::LDPXpost;
+ else
+ LdrOpc = ARM64::LDPXi;
+ } else if (ARM64::FPR64RegClass.contains(Reg1)) {
+ assert(ARM64::FPR64RegClass.contains(Reg2) &&
+ "Expected FPR64 callee-saved register pair!");
+ if (i == Count - 2)
+ LdrOpc = ARM64::LDPDpost;
+ else
+ LdrOpc = ARM64::LDPDi;
+ } else
+ llvm_unreachable("Unexpected callee saved register!");
+ DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", "
+ << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx()
+ << ", " << CSI[i + 1].getFrameIdx() << ")\n");
+
+ // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4;
+ // etc.
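+    // E.g. for the sequence above with Count = 6: i = 0 gives Offset = 4
+    // ([sp, #32]); the final pair (i == Count - 2) gives Offset = Count = 6,
+    // which the post-indexed LDP encodes as [sp], #48.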
+ const int Offset = (i == Count - 2) ? Count : Count - i - 2;
+ assert((Offset >= -64 && Offset <= 63) &&
+ "Offset out of bounds for LDP immediate");
+ BuildMI(MBB, MI, DL, TII.get(LdrOpc))
+ .addReg(Reg2, getDefRegState(true))
+ .addReg(Reg1, getDefRegState(true))
+ .addReg(ARM64::SP)
+ .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8]
+ // where the factor * 8 is implicit
+ }
+ return true;
+}
+
+void ARM64FrameLowering::processFunctionBeforeCalleeSavedScan(
+ MachineFunction &MF, RegScavenger *RS) const {
+ const ARM64RegisterInfo *RegInfo =
+ static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
+ ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ SmallVector<unsigned, 4> UnspilledCSGPRs;
+ SmallVector<unsigned, 4> UnspilledCSFPRs;
+
+ // The frame record needs to be created by saving the appropriate registers
+ if (hasFP(MF)) {
+ MRI->setPhysRegUsed(ARM64::FP);
+ MRI->setPhysRegUsed(ARM64::LR);
+ }
+
+ // Spill the BasePtr if it's used. Do this first thing so that the
+ // getCalleeSavedRegs() below will get the right answer.
+ if (RegInfo->hasBasePointer(MF))
+ MRI->setPhysRegUsed(RegInfo->getBaseRegister());
+
+ // If any callee-saved registers are used, the frame cannot be eliminated.
+ unsigned NumGPRSpilled = 0;
+ unsigned NumFPRSpilled = 0;
+ bool ExtraCSSpill = false;
+ bool CanEliminateFrame = true;
+ DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:");
+ const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+
+ // Check pairs of consecutive callee-saved registers.
+ for (unsigned i = 0; CSRegs[i]; i += 2) {
+ assert(CSRegs[i + 1] && "Odd number of callee-saved registers!");
+
+ const unsigned OddReg = CSRegs[i];
+ const unsigned EvenReg = CSRegs[i + 1];
+ assert((ARM64::GPR64RegClass.contains(OddReg) &&
+ ARM64::GPR64RegClass.contains(EvenReg)) ^
+ (ARM64::FPR64RegClass.contains(OddReg) &&
+ ARM64::FPR64RegClass.contains(EvenReg)) &&
+ "Register class mismatch!");
+
+ const bool OddRegUsed = MRI->isPhysRegUsed(OddReg);
+ const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg);
+
+    // Skip the pair if neither of its registers is actually used.
+ if (!OddRegUsed && !EvenRegUsed) {
+ if (ARM64::GPR64RegClass.contains(OddReg)) {
+ UnspilledCSGPRs.push_back(OddReg);
+ UnspilledCSGPRs.push_back(EvenReg);
+ } else {
+ UnspilledCSFPRs.push_back(OddReg);
+ UnspilledCSFPRs.push_back(EvenReg);
+ }
+ continue;
+ }
+
+ unsigned Reg = ARM64::NoRegister;
+ // If only one of the registers of the register pair is used, make sure to
+ // mark the other one as used as well.
+ if (OddRegUsed ^ EvenRegUsed) {
+ // Find out which register is the additional spill.
+ Reg = OddRegUsed ? EvenReg : OddReg;
+ MRI->setPhysRegUsed(Reg);
+ }
+
+ DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo));
+ DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo));
+
+ assert(((OddReg == ARM64::LR && EvenReg == ARM64::FP) ||
+ (RegInfo->getEncodingValue(OddReg) + 1 ==
+ RegInfo->getEncodingValue(EvenReg))) &&
+ "Register pair of non-adjacent registers!");
+ if (ARM64::GPR64RegClass.contains(OddReg)) {
+ NumGPRSpilled += 2;
+ // If it's not a reserved register, we can use it in lieu of an
+ // emergency spill slot for the register scavenger.
+ // FIXME: It would be better to instead keep looking and choose another
+ // unspilled register that isn't reserved, if there is one.
+ if (Reg != ARM64::NoRegister && !RegInfo->isReservedReg(MF, Reg))
+ ExtraCSSpill = true;
+ } else
+ NumFPRSpilled += 2;
+
+ CanEliminateFrame = false;
+ }
+
+ // FIXME: Set BigStack if any stack slot references may be out of range.
+  // For now, just conservatively guesstimate based on the unscaled indexing
+ // range. We'll end up allocating an unnecessary spill slot a lot, but
+ // realistically that's not a big deal at this stage of the game.
+ // The CSR spill slots have not been allocated yet, so estimateStackSize
+ // won't include them.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled);
+ DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
+ bool BigStack = (CFSize >= 256);
+ if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
+ AFI->setHasStackFrame(true);
+
+ // Estimate if we might need to scavenge a register at some point in order
+ // to materialize a stack offset. If so, either spill one additional
+ // callee-saved register or reserve a special spill slot to facilitate
+ // register scavenging. If we already spilled an extra callee-saved register
+ // above to keep the number of spills even, we don't need to do anything else
+ // here.
+ if (BigStack && !ExtraCSSpill) {
+
+ // If we're adding a register to spill here, we have to add two of them
+ // to keep the number of regs to spill even.
+ assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!");
+ unsigned Count = 0;
+ while (!UnspilledCSGPRs.empty() && Count < 2) {
+ unsigned Reg = UnspilledCSGPRs.back();
+ UnspilledCSGPRs.pop_back();
+ DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo)
+ << " to get a scratch register.\n");
+ MRI->setPhysRegUsed(Reg);
+ ExtraCSSpill = true;
+ ++Count;
+ }
+
+ // If we didn't find an extra callee-saved register to spill, create
+ // an emergency spill slot.
+ if (!ExtraCSSpill) {
+ const TargetRegisterClass *RC = &ARM64::GPR64RegClass;
+ int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false);
+ RS->addScavengingFrameIndex(FI);
+ DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
+ << " as the emergency spill slot.\n");
+ }
+ }
+}
diff --git a/lib/Target/ARM64/ARM64FrameLowering.h b/lib/Target/ARM64/ARM64FrameLowering.h
new file mode 100644
index 0000000..02edcdb
--- /dev/null
+++ b/lib/Target/ARM64/ARM64FrameLowering.h
@@ -0,0 +1,75 @@
+//===-- ARM64FrameLowering.h - TargetFrameLowering for ARM64 ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64_FRAMELOWERING_H
+#define ARM64_FRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class ARM64Subtarget;
+class ARM64TargetMachine;
+
+class ARM64FrameLowering : public TargetFrameLowering {
+ const ARM64TargetMachine &TM;
+
+public:
+ explicit ARM64FrameLowering(const ARM64TargetMachine &TM,
+ const ARM64Subtarget &STI)
+ : TargetFrameLowering(StackGrowsDown, 16, 0, 16,
+ false /*StackRealignable*/),
+ TM(TM) {}
+
+ void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned FramePtr) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+ /// the function.
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const;
+ int resolveFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg,
+ bool PreferFP = false) const;
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
+ /// \brief Can this function use the red zone for local allocations.
+ bool canUseRedZone(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+ bool hasReservedCallFrame(const MachineFunction &MF) const;
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
new file mode 100644
index 0000000..2e234c9
--- /dev/null
+++ b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
@@ -0,0 +1,2381 @@
+//===-- ARM64ISelDAGToDAG.cpp - A dag to dag inst selector for ARM64 ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the ARM64 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm64-isel"
+#include "ARM64TargetMachine.h"
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h" // To access function attributes.
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+//===--------------------------------------------------------------------===//
+/// ARM64DAGToDAGISel - ARM64 specific code to select ARM64 machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+
+class ARM64DAGToDAGISel : public SelectionDAGISel {
+ ARM64TargetMachine &TM;
+
+ /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const ARM64Subtarget *Subtarget;
+
+ bool ForCodeSize;
+
+public:
+ explicit ARM64DAGToDAGISel(ARM64TargetMachine &tm, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel), TM(tm),
+ Subtarget(&TM.getSubtarget<ARM64Subtarget>()), ForCodeSize(false) {}
+
+ virtual const char *getPassName() const {
+ return "ARM64 Instruction Selection";
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ AttributeSet FnAttrs = MF.getFunction()->getAttributes();
+ ForCodeSize =
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize) ||
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ return SelectionDAGISel::runOnMachineFunction(MF);
+ }
+
+ SDNode *Select(SDNode *Node);
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps);
+
+ SDNode *SelectMLAV64LaneV128(SDNode *N);
+ SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N);
+ bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
+ bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+ bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+ bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+ return SelectShiftedRegister(N, false, Reg, Shift);
+ }
+ bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+ return SelectShiftedRegister(N, true, Reg, Shift);
+ }
+ bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 1, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 2, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 4, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 8, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed(N, 16, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 1, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 2, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 4, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 8, Base, OffImm);
+ }
+ bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeUnscaled(N, 16, Base, OffImm);
+ }
+
+ bool SelectAddrModeRO8(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &Imm) {
+ return SelectAddrModeRO(N, 1, Base, Offset, Imm);
+ }
+ bool SelectAddrModeRO16(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &Imm) {
+ return SelectAddrModeRO(N, 2, Base, Offset, Imm);
+ }
+ bool SelectAddrModeRO32(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &Imm) {
+ return SelectAddrModeRO(N, 4, Base, Offset, Imm);
+ }
+ bool SelectAddrModeRO64(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &Imm) {
+ return SelectAddrModeRO(N, 8, Base, Offset, Imm);
+ }
+ bool SelectAddrModeRO128(SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &Imm) {
+ return SelectAddrModeRO(N, 16, Base, Offset, Imm);
+ }
+ bool SelectAddrModeNoIndex(SDValue N, SDValue &Val);
+
+ /// Form sequences of consecutive 64/128-bit registers for use in NEON
+ /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
+  /// between 1 and 4 elements. If it contains a single element, that element
+  /// is returned unchanged; otherwise a REG_SEQUENCE value is returned.
+ SDValue createDTuple(ArrayRef<SDValue> Vecs);
+ SDValue createQTuple(ArrayRef<SDValue> Vecs);
+
+ /// Generic helper for the createDTuple/createQTuple
+ /// functions. Those should almost always be called instead.
+ SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
+ unsigned SubRegs[]);
+
+ SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
+
+ SDNode *SelectIndexedLoad(SDNode *N, bool &Done);
+
+ SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ unsigned SubRegIdx);
+ SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+ SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+ SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+ SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node);
+ SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node);
+
+ SDNode *SelectAtomic(SDNode *Node, unsigned Op8, unsigned Op16, unsigned Op32,
+ unsigned Op64);
+
+ SDNode *SelectBitfieldExtractOp(SDNode *N);
+ SDNode *SelectBitfieldInsertOp(SDNode *N);
+
+ SDNode *SelectLIBM(SDNode *N);
+
+// Include the pieces autogenerated from the target description.
+#include "ARM64GenDAGISel.inc"
+
+private:
+ bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
+ SDValue &Shift);
+ bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &OffImm);
+ bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &OffImm);
+ bool SelectAddrModeRO(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &Offset, SDValue &Imm);
+ bool isWorthFolding(SDValue V) const;
+ bool SelectExtendedSHL(SDValue N, unsigned Size, SDValue &Offset,
+ SDValue &Imm);
+};
+} // end anonymous namespace
+
+/// isIntImmediate - This method tests to see if the node is a constant
+/// operand. If so, Imm will receive the value.
+static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
+ if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
+ Imm = C->getZExtValue();
+ return true;
+ }
+ return false;
+}
+
+// isIntImmediate - This method tests to see if the operand is a constant.
+// If so, Imm will receive the value.
+static bool isIntImmediate(SDValue N, uint64_t &Imm) {
+ return isIntImmediate(N.getNode(), Imm);
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and has an immediate integer right operand.
+// If so, Imm will receive the value.
+static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
+ uint64_t &Imm) {
+ return N->getOpcode() == Opc &&
+ isIntImmediate(N->getOperand(1).getNode(), Imm);
+}
+
+bool ARM64DAGToDAGISel::SelectAddrModeNoIndex(SDValue N, SDValue &Val) {
+ EVT ValTy = N.getValueType();
+ if (ValTy != MVT::i64)
+ return false;
+ Val = N;
+ return true;
+}
+
+bool ARM64DAGToDAGISel::SelectInlineAsmMemoryOperand(
+ const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) {
+ assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
+ // Require the address to be in a register. That is safe for all ARM64
+ // variants and it is hard to do anything much smarter without knowing
+ // how the operand is used.
+ OutOps.push_back(Op);
+ return false;
+}
+
+/// SelectArithImmed - Select an immediate value that can be represented as
+/// a 12-bit value shifted left by either 0 or 12. If so, return true with
+/// Val set to the 12-bit value and Shift set to the shifter operand.
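+///
+/// For example, 0x123000 is representable as 0x123 LSL #12, whereas 0x123456
+/// has nonzero bits in both halves and is rejected.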
+bool ARM64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
+ SDValue &Shift) {
+ // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcodes it's interested in; however,
+ // we still need to check whether the operand is actually an immediate
+ // here because the ComplexPattern opcode list is only used in
+ // root-level opcode matching.
+ if (!isa<ConstantSDNode>(N.getNode()))
+ return false;
+
+ uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+ unsigned ShiftAmt;
+
+ if (Immed >> 12 == 0) {
+ ShiftAmt = 0;
+ } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+ ShiftAmt = 12;
+ Immed = Immed >> 12;
+ } else
+ return false;
+
+ unsigned ShVal = ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt);
+ Val = CurDAG->getTargetConstant(Immed, MVT::i32);
+ Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
+ return true;
+}
+
+/// SelectNegArithImmed - As above, but negates the value before trying to
+/// select it.
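+///
+/// This lets, for example, an add of -20 be handled by selecting the
+/// immediate 20 for the corresponding subtract form.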
+bool ARM64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
+ SDValue &Shift) {
+ // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcodes it's interested in; however,
+ // we still need to check whether the operand is actually an immediate
+ // here because the ComplexPattern opcode list is only used in
+ // root-level opcode matching.
+ if (!isa<ConstantSDNode>(N.getNode()))
+ return false;
+
+ // The immediate operand must be a 24-bit zero-extended immediate.
+ uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+
+ // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
+ // have the opposite effect on the C flag, so this pattern mustn't match under
+ // those circumstances.
+ if (Immed == 0)
+ return false;
+
+ if (N.getValueType() == MVT::i32)
+ Immed = ~((uint32_t)Immed) + 1;
+ else
+ Immed = ~Immed + 1ULL;
+ if (Immed & 0xFFFFFFFFFF000000ULL)
+ return false;
+
+ Immed &= 0xFFFFFFULL;
+ return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift);
+}
+
+/// getShiftTypeForNode - Translate a shift node to the corresponding
+/// ShiftType value.
+static ARM64_AM::ShiftType getShiftTypeForNode(SDValue N) {
+ switch (N.getOpcode()) {
+ default:
+ return ARM64_AM::InvalidShift;
+ case ISD::SHL:
+ return ARM64_AM::LSL;
+ case ISD::SRL:
+ return ARM64_AM::LSR;
+ case ISD::SRA:
+ return ARM64_AM::ASR;
+ case ISD::ROTR:
+ return ARM64_AM::ROR;
+ }
+}
+
+/// \brief Determine whether it is worth folding V into an extended register.
+bool ARM64DAGToDAGISel::isWorthFolding(SDValue V) const {
+  // It hurts if the value is used at least twice, unless we are optimizing
+ // for code size.
+ if (ForCodeSize || V.hasOneUse())
+ return true;
+ return false;
+}
+
+/// SelectShiftedRegister - Select a "shifted register" operand. If the value
+/// is not shifted, set the Shift operand to default of "LSL 0". The logical
+/// instructions allow the shifted register to be rotated, but the arithmetic
+/// instructions do not. The AllowROR parameter specifies whether ROR is
+/// supported.
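+///
+/// For example, (add x0, (shl x1, 4)) can be folded here so that the add is
+/// emitted as "add x0, x0, x1, lsl #4".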
+bool ARM64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
+ SDValue &Reg, SDValue &Shift) {
+ ARM64_AM::ShiftType ShType = getShiftTypeForNode(N);
+ if (ShType == ARM64_AM::InvalidShift)
+ return false;
+ if (!AllowROR && ShType == ARM64_AM::ROR)
+ return false;
+
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ unsigned BitSize = N.getValueType().getSizeInBits();
+ unsigned Val = RHS->getZExtValue() & (BitSize - 1);
+ unsigned ShVal = ARM64_AM::getShifterImm(ShType, Val);
+
+ Reg = N.getOperand(0);
+ Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
+ return isWorthFolding(N);
+ }
+
+ return false;
+}
+
+/// getExtendTypeForNode - Translate an extend node to the corresponding
+/// ExtendType value.
+static ARM64_AM::ExtendType getExtendTypeForNode(SDValue N,
+ bool IsLoadStore = false) {
+ if (N.getOpcode() == ISD::SIGN_EXTEND ||
+ N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ EVT SrcVT;
+ if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
+ SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
+ else
+ SrcVT = N.getOperand(0).getValueType();
+
+ if (!IsLoadStore && SrcVT == MVT::i8)
+ return ARM64_AM::SXTB;
+ else if (!IsLoadStore && SrcVT == MVT::i16)
+ return ARM64_AM::SXTH;
+ else if (SrcVT == MVT::i32)
+ return ARM64_AM::SXTW;
+ else if (SrcVT == MVT::i64)
+ return ARM64_AM::SXTX;
+
+ return ARM64_AM::InvalidExtend;
+ } else if (N.getOpcode() == ISD::ZERO_EXTEND ||
+ N.getOpcode() == ISD::ANY_EXTEND) {
+ EVT SrcVT = N.getOperand(0).getValueType();
+ if (!IsLoadStore && SrcVT == MVT::i8)
+ return ARM64_AM::UXTB;
+ else if (!IsLoadStore && SrcVT == MVT::i16)
+ return ARM64_AM::UXTH;
+ else if (SrcVT == MVT::i32)
+ return ARM64_AM::UXTW;
+ else if (SrcVT == MVT::i64)
+ return ARM64_AM::UXTX;
+
+ return ARM64_AM::InvalidExtend;
+ } else if (N.getOpcode() == ISD::AND) {
+ ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!CSD)
+ return ARM64_AM::InvalidExtend;
+ uint64_t AndMask = CSD->getZExtValue();
+
+ switch (AndMask) {
+ default:
+ return ARM64_AM::InvalidExtend;
+ case 0xFF:
+ return !IsLoadStore ? ARM64_AM::UXTB : ARM64_AM::InvalidExtend;
+ case 0xFFFF:
+ return !IsLoadStore ? ARM64_AM::UXTH : ARM64_AM::InvalidExtend;
+ case 0xFFFFFFFF:
+ return ARM64_AM::UXTW;
+ }
+ }
+
+ return ARM64_AM::InvalidExtend;
+}
+
+// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
+static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
+ if (DL->getOpcode() != ARM64ISD::DUPLANE16 &&
+ DL->getOpcode() != ARM64ISD::DUPLANE32)
+ return false;
+
+ SDValue SV = DL->getOperand(0);
+ if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
+ return false;
+
+ SDValue EV = SV.getOperand(1);
+ if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return false;
+
+ ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
+ ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
+ LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
+ LaneOp = EV.getOperand(0);
+
+ return true;
+}
+
+// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is
+// a high lane extract.
+static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
+ SDValue &LaneOp, int &LaneIdx) {
+
+ if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
+ std::swap(Op0, Op1);
+ if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
+ return false;
+ }
+ StdOp = Op1;
+ return true;
+}
+
+/// SelectMLAV64LaneV128 - ARM64 supports 64-bit vector MLAs (v4i16 and v2i32)
+/// where one multiplicand is a lane in the upper half of a 128-bit vector.
+/// Recognize and select this so that we don't emit unnecessary lane extracts.
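+///
+/// For example, this allows something like "mla v0.4h, v1.4h, v2.h[5]" to be
+/// emitted directly instead of first extracting the high half of v2.8h.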
+SDNode *ARM64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue MLAOp1; // Will hold ordinary multiplicand for MLA.
+ SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA.
+ int LaneIdx = -1; // Will hold the lane index.
+
+ if (Op1.getOpcode() != ISD::MUL ||
+ !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
+ LaneIdx)) {
+ std::swap(Op0, Op1);
+ if (Op1.getOpcode() != ISD::MUL ||
+ !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
+ LaneIdx))
+ return 0;
+ }
+
+ SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64);
+
+ SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
+
+ unsigned MLAOpc = ~0U;
+
+ switch (N->getSimpleValueType(0).SimpleTy) {
+ default:
+ llvm_unreachable("Unrecognized MLA.");
+ case MVT::v4i16:
+ MLAOpc = ARM64::MLAv4i16_indexed;
+ break;
+ case MVT::v2i32:
+ MLAOpc = ARM64::MLAv2i32_indexed;
+ break;
+ }
+
+ return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops);
+}
+
+SDNode *ARM64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
+ SDValue SMULLOp0;
+ SDValue SMULLOp1;
+ int LaneIdx;
+
+ if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
+ LaneIdx))
+ return 0;
+
+ SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64);
+
+ SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
+
+ unsigned SMULLOpc = ~0U;
+
+ if (IntNo == Intrinsic::arm64_neon_smull) {
+ switch (N->getSimpleValueType(0).SimpleTy) {
+ default:
+ llvm_unreachable("Unrecognized SMULL.");
+ case MVT::v4i32:
+ SMULLOpc = ARM64::SMULLv4i16_indexed;
+ break;
+ case MVT::v2i64:
+ SMULLOpc = ARM64::SMULLv2i32_indexed;
+ break;
+ }
+ } else if (IntNo == Intrinsic::arm64_neon_umull) {
+ switch (N->getSimpleValueType(0).SimpleTy) {
+ default:
+ llvm_unreachable("Unrecognized SMULL.");
+ case MVT::v4i32:
+ SMULLOpc = ARM64::UMULLv4i16_indexed;
+ break;
+ case MVT::v2i64:
+ SMULLOpc = ARM64::UMULLv2i32_indexed;
+ break;
+ }
+ } else
+ llvm_unreachable("Unrecognized intrinsic.");
+
+ return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops);
+}
+
+/// SelectArithExtendedRegister - Select an "extended register" operand. This
+/// operand folds in an extend followed by an optional left shift.
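+///
+/// For example, an add whose RHS is (shl (sext_inreg x, i8), 2) can be
+/// selected as "add x0, x0, w1, sxtb #2".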
+bool ARM64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
+ SDValue &Shift) {
+ unsigned ShiftVal = 0;
+ ARM64_AM::ExtendType Ext;
+
+ if (N.getOpcode() == ISD::SHL) {
+ ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!CSD)
+ return false;
+ ShiftVal = CSD->getZExtValue();
+ if ((ShiftVal & 0x3) != ShiftVal)
+ return false;
+
+ Ext = getExtendTypeForNode(N.getOperand(0));
+ if (Ext == ARM64_AM::InvalidExtend)
+ return false;
+
+ Reg = N.getOperand(0).getOperand(0);
+ } else {
+ Ext = getExtendTypeForNode(N);
+ if (Ext == ARM64_AM::InvalidExtend)
+ return false;
+
+ Reg = N.getOperand(0);
+ }
+
+ // ARM64 mandates that the RHS of the operation must use the smallest
+  // register class that could contain the size being extended from. Thus,
+ // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
+ // there might not be an actual 32-bit value in the program. We can
+  // (harmlessly) synthesize one by injecting an EXTRACT_SUBREG here.
+ if (Reg.getValueType() == MVT::i64 && Ext != ARM64_AM::UXTX &&
+ Ext != ARM64_AM::SXTX) {
+ SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32);
+ MachineSDNode *Node = CurDAG->getMachineNode(
+ TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32, Reg, SubReg);
+ Reg = SDValue(Node, 0);
+ }
+
+ Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32);
+ return isWorthFolding(N);
+}
+
+/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
+/// immediate" address. The "Size" argument is the size in bytes of the memory
+/// reference, which determines the scale.
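+///
+/// For example, with Size == 8 the offset must be a non-negative multiple of
+/// 8 no larger than 32760 (0xFFF << 3).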
+bool ARM64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
+ SDValue &Base, SDValue &OffImm) {
+ const TargetLowering *TLI = getTargetLowering();
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ OffImm = CurDAG->getTargetConstant(0, MVT::i64);
+ return true;
+ }
+
+ if (N.getOpcode() == ARM64ISD::ADDlow) {
+ GlobalAddressSDNode *GAN =
+ dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
+ Base = N.getOperand(0);
+ OffImm = N.getOperand(1);
+ if (!GAN)
+ return true;
+
+ const GlobalValue *GV = GAN->getGlobal();
+ unsigned Alignment = GV->getAlignment();
+ const DataLayout *DL = TLI->getDataLayout();
+ if (Alignment == 0 && !Subtarget->isTargetDarwin())
+ Alignment = DL->getABITypeAlignment(GV->getType()->getElementType());
+
+ if (Alignment >= Size)
+ return true;
+ }
+
+ if (CurDAG->isBaseWithConstantOffset(N)) {
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int64_t RHSC = (int64_t)RHS->getZExtValue();
+ unsigned Scale = Log2_32(Size);
+ if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64);
+ return true;
+ }
+ }
+ }
+
+ // Before falling back to our general case, check if the unscaled
+ // instructions can handle this. If so, that's preferable.
+ if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
+ return false;
+
+ // Base only. The address will be materialized into a register before
+ // the memory is accessed.
+ // add x0, Xbase, #offset
+ // ldr x0, [x0]
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, MVT::i64);
+ return true;
+}
+
+/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
+/// immediate" address. This should only match when there is an offset that
+/// is not valid for a scaled immediate addressing mode. The "Size" argument
+/// is the size in bytes of the memory reference, which is needed here to know
+/// what is valid for a scaled immediate.
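+///
+/// For example, an offset of -24 is never valid for the scaled form, but it
+/// fits the unscaled signed 9-bit range of [-256, 255].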
+bool ARM64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
+ SDValue &Base, SDValue &OffImm) {
+ if (!CurDAG->isBaseWithConstantOffset(N))
+ return false;
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int64_t RHSC = RHS->getSExtValue();
+ // If the offset is valid as a scaled immediate, don't match here.
+ if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
+ RHSC < (0x1000 << Log2_32(Size)))
+ return false;
+ if (RHSC >= -256 && RHSC < 256) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ const TargetLowering *TLI = getTargetLowering();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64);
+ return true;
+ }
+ }
+ return false;
+}
+
+static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
+ SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32);
+ SDValue ImpDef = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64),
+ 0);
+ MachineSDNode *Node = CurDAG->getMachineNode(
+ TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg);
+ return SDValue(Node, 0);
+}
+
+static SDValue WidenIfNeeded(SelectionDAG *CurDAG, SDValue N) {
+ if (N.getValueType() == MVT::i32) {
+ return Widen(CurDAG, N);
+ }
+
+ return N;
+}
+
+/// \brief Check if the given SHL node (\p N) can be used to form an
+/// extended register for an addressing mode.
+bool ARM64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
+ SDValue &Offset, SDValue &Imm) {
+ assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
+ ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (CSD && (CSD->getZExtValue() & 0x7) == CSD->getZExtValue()) {
+
+ ARM64_AM::ExtendType Ext = getExtendTypeForNode(N.getOperand(0), true);
+ if (Ext == ARM64_AM::InvalidExtend) {
+ Ext = ARM64_AM::UXTX;
+ Offset = WidenIfNeeded(CurDAG, N.getOperand(0));
+ } else {
+ Offset = WidenIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
+ }
+
+ unsigned LegalShiftVal = Log2_32(Size);
+ unsigned ShiftVal = CSD->getZExtValue();
+
+ if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
+ return false;
+
+ Imm = CurDAG->getTargetConstant(
+ ARM64_AM::getMemExtendImm(Ext, ShiftVal != 0), MVT::i32);
+ if (isWorthFolding(N))
+ return true;
+ }
+ return false;
+}
+
+bool ARM64DAGToDAGISel::SelectAddrModeRO(SDValue N, unsigned Size,
+ SDValue &Base, SDValue &Offset,
+ SDValue &Imm) {
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+ SDValue LHS = N.getOperand(0);
+ SDValue RHS = N.getOperand(1);
+
+ // We don't want to match immediate adds here, because they are better lowered
+ // to the register-immediate addressing modes.
+ if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
+ return false;
+
+ // Check if this particular node is reused in any non-memory related
+ // operation. If yes, do not try to fold this node into the address
+ // computation, since the computation will be kept.
+ const SDNode *Node = N.getNode();
+ for (SDNode::use_iterator UI = Node->use_begin(), UE = Node->use_end();
+ UI != UE; ++UI) {
+ if (!isa<MemSDNode>(*UI))
+ return false;
+ }
+
+ // Remember if it is worth folding N when it produces extended register.
+ bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+
+ // Try to match a shifted extend on the RHS.
+ if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
+ SelectExtendedSHL(RHS, Size, Offset, Imm)) {
+ Base = LHS;
+ return true;
+ }
+
+ // Try to match a shifted extend on the LHS.
+ if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
+ SelectExtendedSHL(LHS, Size, Offset, Imm)) {
+ Base = RHS;
+ return true;
+ }
+
+ ARM64_AM::ExtendType Ext = ARM64_AM::UXTX;
+ // Try to match an unshifted extend on the LHS.
+ if (IsExtendedRegisterWorthFolding &&
+ (Ext = getExtendTypeForNode(LHS, true)) != ARM64_AM::InvalidExtend) {
+ Base = RHS;
+ Offset = WidenIfNeeded(CurDAG, LHS.getOperand(0));
+ Imm = CurDAG->getTargetConstant(ARM64_AM::getMemExtendImm(Ext, false),
+ MVT::i32);
+ if (isWorthFolding(LHS))
+ return true;
+ }
+
+ // Try to match an unshifted extend on the RHS.
+ if (IsExtendedRegisterWorthFolding &&
+ (Ext = getExtendTypeForNode(RHS, true)) != ARM64_AM::InvalidExtend) {
+ Base = LHS;
+ Offset = WidenIfNeeded(CurDAG, RHS.getOperand(0));
+ Imm = CurDAG->getTargetConstant(ARM64_AM::getMemExtendImm(Ext, false),
+ MVT::i32);
+ if (isWorthFolding(RHS))
+ return true;
+ }
+
+ // Match any non-shifted, non-extend, non-immediate add expression.
+ Base = LHS;
+ Offset = WidenIfNeeded(CurDAG, RHS);
+ Ext = ARM64_AM::UXTX;
+ Imm = CurDAG->getTargetConstant(ARM64_AM::getMemExtendImm(Ext, false),
+ MVT::i32);
+ // Reg1 + Reg2 is free: no check needed.
+ return true;
+}
+
+SDValue ARM64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
+ static unsigned RegClassIDs[] = { ARM64::DDRegClassID, ARM64::DDDRegClassID,
+ ARM64::DDDDRegClassID };
+ static unsigned SubRegs[] = { ARM64::dsub0, ARM64::dsub1,
+ ARM64::dsub2, ARM64::dsub3 };
+
+ return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue ARM64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
+ static unsigned RegClassIDs[] = { ARM64::QQRegClassID, ARM64::QQQRegClassID,
+ ARM64::QQQQRegClassID };
+ static unsigned SubRegs[] = { ARM64::qsub0, ARM64::qsub1,
+ ARM64::qsub2, ARM64::qsub3 };
+
+ return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue ARM64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
+ unsigned RegClassIDs[],
+ unsigned SubRegs[]) {
+ // There's no special register-class for a vector-list of 1 element: it's just
+ // a vector.
+ if (Regs.size() == 1)
+ return Regs[0];
+
+ assert(Regs.size() >= 2 && Regs.size() <= 4);
+
+ SDLoc DL(Regs[0].getNode());
+
+ SmallVector<SDValue, 4> Ops;
+
+ // First operand of REG_SEQUENCE is the desired RegClass.
+ Ops.push_back(
+ CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32));
+
+ // Then we get pairs of source & subregister-position for the components.
+ for (unsigned i = 0; i < Regs.size(); ++i) {
+ Ops.push_back(Regs[i]);
+ Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32));
+ }
+
+ SDNode *N =
+ CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
+ return SDValue(N, 0);
+}
+
+SDNode *ARM64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs,
+ unsigned Opc, bool isExt) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ unsigned ExtOff = isExt;
+
+ // Form a REG_SEQUENCE to force register allocation.
+ unsigned Vec0Off = ExtOff + 1;
+ SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
+ N->op_begin() + Vec0Off + NumVecs);
+ SDValue RegSeq = createQTuple(Regs);
+
+ SmallVector<SDValue, 6> Ops;
+ if (isExt)
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(RegSeq);
+ Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
+ return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+}
+
+SDNode *ARM64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (LD->isUnindexed())
+ return NULL;
+ EVT VT = LD->getMemoryVT();
+ EVT DstVT = N->getValueType(0);
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
+
+ // We're not doing validity checking here. That was done when checking
+ // if we should mark the load as indexed or not. We're just selecting
+ // the right instruction.
+ unsigned Opcode = 0;
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ bool InsertTo64 = false;
+ if (VT == MVT::i64)
+ Opcode = IsPre ? ARM64::LDRXpre_isel : ARM64::LDRXpost_isel;
+ else if (VT == MVT::i32) {
+ if (ExtType == ISD::NON_EXTLOAD)
+ Opcode = IsPre ? ARM64::LDRWpre_isel : ARM64::LDRWpost_isel;
+ else if (ExtType == ISD::SEXTLOAD)
+ Opcode = IsPre ? ARM64::LDRSWpre_isel : ARM64::LDRSWpost_isel;
+ else {
+ Opcode = IsPre ? ARM64::LDRWpre_isel : ARM64::LDRWpost_isel;
+ InsertTo64 = true;
+ // The result of the load is only i32. It's the subreg_to_reg that makes
+ // it into an i64.
+ DstVT = MVT::i32;
+ }
+ } else if (VT == MVT::i16) {
+ if (ExtType == ISD::SEXTLOAD) {
+ if (DstVT == MVT::i64)
+ Opcode = IsPre ? ARM64::LDRSHXpre_isel : ARM64::LDRSHXpost_isel;
+ else
+ Opcode = IsPre ? ARM64::LDRSHWpre_isel : ARM64::LDRSHWpost_isel;
+ } else {
+ Opcode = IsPre ? ARM64::LDRHHpre_isel : ARM64::LDRHHpost_isel;
+ InsertTo64 = DstVT == MVT::i64;
+ // The result of the load is only i32. It's the subreg_to_reg that makes
+ // it into an i64.
+ DstVT = MVT::i32;
+ }
+ } else if (VT == MVT::i8) {
+ if (ExtType == ISD::SEXTLOAD) {
+ if (DstVT == MVT::i64)
+ Opcode = IsPre ? ARM64::LDRSBXpre_isel : ARM64::LDRSBXpost_isel;
+ else
+ Opcode = IsPre ? ARM64::LDRSBWpre_isel : ARM64::LDRSBWpost_isel;
+ } else {
+ Opcode = IsPre ? ARM64::LDRBBpre_isel : ARM64::LDRBBpost_isel;
+ InsertTo64 = DstVT == MVT::i64;
+ // The result of the load is only i32. It's the subreg_to_reg that makes
+ // it into an i64.
+ DstVT = MVT::i32;
+ }
+ } else if (VT == MVT::f32) {
+ Opcode = IsPre ? ARM64::LDRSpre_isel : ARM64::LDRSpost_isel;
+ } else if (VT == MVT::f64) {
+ Opcode = IsPre ? ARM64::LDRDpre_isel : ARM64::LDRDpost_isel;
+ } else
+ return NULL;
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
+ int OffsetVal = (int)OffsetOp->getZExtValue();
+ SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64);
+ SDValue Ops[] = { Base, Offset, Chain };
+ SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), DstVT, MVT::i64,
+ MVT::Other, Ops);
+ // Either way, we're replacing the node, so tell the caller that.
+ Done = true;
+ if (InsertTo64) {
+ SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32);
+ SDNode *Sub = CurDAG->getMachineNode(
+ ARM64::SUBREG_TO_REG, SDLoc(N), MVT::i64,
+ CurDAG->getTargetConstant(0, MVT::i64), SDValue(Res, 0), SubReg);
+ ReplaceUses(SDValue(N, 0), SDValue(Sub, 0));
+ ReplaceUses(SDValue(N, 1), SDValue(Res, 1));
+ ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
+ return 0;
+ }
+ return Res;
+}
+
+SDNode *ARM64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+ unsigned SubRegIdx) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Chain = N->getOperand(0);
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(N->getOperand(2)); // Mem operand;
+ Ops.push_back(Chain);
+
+ std::vector<EVT> ResTys;
+ ResTys.push_back(MVT::Untyped);
+ ResTys.push_back(MVT::Other);
+
+ SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ SDValue SuperReg = SDValue(Ld, 0);
+
+ // MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ // MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ // cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+
+ switch (NumVecs) {
+ case 4:
+ ReplaceUses(SDValue(N, 3), CurDAG->getTargetExtractSubreg(SubRegIdx + 3, dl,
+ VT, SuperReg));
+ // FALLTHROUGH
+ case 3:
+ ReplaceUses(SDValue(N, 2), CurDAG->getTargetExtractSubreg(SubRegIdx + 2, dl,
+ VT, SuperReg));
+ // FALLTHROUGH
+ case 2:
+ ReplaceUses(SDValue(N, 1), CurDAG->getTargetExtractSubreg(SubRegIdx + 1, dl,
+ VT, SuperReg));
+ ReplaceUses(SDValue(N, 0),
+ CurDAG->getTargetExtractSubreg(SubRegIdx, dl, VT, SuperReg));
+ break;
+ case 1:
+ ReplaceUses(SDValue(N, 0), SuperReg);
+ break;
+ }
+
+ ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
+
+ return 0;
+}
+
+SDNode *ARM64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getOperand(2)->getValueType(0);
+
+ // Form a REG_SEQUENCE to force register allocation.
+ bool Is128Bit = VT.getSizeInBits() == 128;
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+ SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(RegSeq);
+ Ops.push_back(N->getOperand(NumVecs + 2));
+ Ops.push_back(N->getOperand(0));
+ SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
+
+ return St;
+}
+
+/// WidenVector - Given a value in the V64 register class, produce the
+/// equivalent value in the V128 register class.
+class WidenVector {
+ SelectionDAG &DAG;
+
+public:
+ WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
+
+ SDValue operator()(SDValue V64Reg) {
+ EVT VT = V64Reg.getValueType();
+ unsigned NarrowSize = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType().getSimpleVT();
+ MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
+ SDLoc DL(V64Reg);
+
+ SDValue Undef =
+ SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
+ return DAG.getTargetInsertSubreg(ARM64::dsub, DL, WideTy, Undef, V64Reg);
+ }
+};
+
+/// NarrowVector - Given a value in the V128 register class, produce the
+/// equivalent value in the V64 register class.
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+ EVT VT = V128Reg.getValueType();
+ unsigned WideSize = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType().getSimpleVT();
+ MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
+
+ return DAG.getTargetExtractSubreg(ARM64::dsub, SDLoc(V128Reg), NarrowTy,
+ V128Reg);
+}
+
+SDNode *ARM64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ bool Narrow = VT.getSizeInBits() == 64;
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+
+ if (Narrow)
+ std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+ WidenVector(*CurDAG));
+
+ SDValue RegSeq = createQTuple(Regs);
+
+ std::vector<EVT> ResTys;
+ ResTys.push_back(MVT::Untyped);
+ ResTys.push_back(MVT::Other);
+
+ unsigned LaneNo =
+ cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(RegSeq);
+ Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
+ Ops.push_back(N->getOperand(NumVecs + 3));
+ Ops.push_back(N->getOperand(0));
+ SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ SDValue SuperReg = SDValue(Ld, 0);
+
+ EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
+ switch (NumVecs) {
+ case 4: {
+ SDValue NV3 =
+ CurDAG->getTargetExtractSubreg(ARM64::qsub3, dl, WideVT, SuperReg);
+ if (Narrow)
+ ReplaceUses(SDValue(N, 3), NarrowVector(NV3, *CurDAG));
+ else
+ ReplaceUses(SDValue(N, 3), NV3);
+ }
+ // FALLTHROUGH
+ case 3: {
+ SDValue NV2 =
+ CurDAG->getTargetExtractSubreg(ARM64::qsub2, dl, WideVT, SuperReg);
+ if (Narrow)
+ ReplaceUses(SDValue(N, 2), NarrowVector(NV2, *CurDAG));
+ else
+ ReplaceUses(SDValue(N, 2), NV2);
+ }
+ // FALLTHROUGH
+ case 2: {
+ SDValue NV1 =
+ CurDAG->getTargetExtractSubreg(ARM64::qsub1, dl, WideVT, SuperReg);
+ SDValue NV0 =
+ CurDAG->getTargetExtractSubreg(ARM64::qsub0, dl, WideVT, SuperReg);
+ if (Narrow) {
+ ReplaceUses(SDValue(N, 1), NarrowVector(NV1, *CurDAG));
+ ReplaceUses(SDValue(N, 0), NarrowVector(NV0, *CurDAG));
+ } else {
+ ReplaceUses(SDValue(N, 1), NV1);
+ ReplaceUses(SDValue(N, 0), NV0);
+ }
+ break;
+ }
+ }
+
+ ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
+
+ return Ld;
+}
+
+SDNode *ARM64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
+ unsigned Opc) {
+ SDLoc dl(N);
+ EVT VT = N->getOperand(2)->getValueType(0);
+ bool Narrow = VT.getSizeInBits() == 64;
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+
+ if (Narrow)
+ std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+ WidenVector(*CurDAG));
+
+ SDValue RegSeq = createQTuple(Regs);
+
+ unsigned LaneNo =
+ cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(RegSeq);
+ Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
+ Ops.push_back(N->getOperand(NumVecs + 3));
+ Ops.push_back(N->getOperand(0));
+ SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+ return St;
+}
+
+SDNode *ARM64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
+ unsigned Op16, unsigned Op32,
+ unsigned Op64) {
+ // Mostly direct translation to the given operations, except that we preserve
+ // the AtomicOrdering for use later on.
+ AtomicSDNode *AN = cast<AtomicSDNode>(Node);
+ EVT VT = AN->getMemoryVT();
+
+ unsigned Op;
+ if (VT == MVT::i8)
+ Op = Op8;
+ else if (VT == MVT::i16)
+ Op = Op16;
+ else if (VT == MVT::i32)
+ Op = Op32;
+ else if (VT == MVT::i64)
+ Op = Op64;
+ else
+ llvm_unreachable("Unexpected atomic operation");
+
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 1; i < AN->getNumOperands(); ++i)
+ Ops.push_back(AN->getOperand(i));
+
+ Ops.push_back(CurDAG->getTargetConstant(AN->getOrdering(), MVT::i32));
+ Ops.push_back(AN->getOperand(0)); // Chain moves to the end
+
+ return CurDAG->SelectNodeTo(Node, Op, AN->getValueType(0), MVT::Other,
+ &Ops[0], Ops.size());
+}
+
+static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
+ unsigned &Opc, SDValue &Opd0,
+ unsigned &LSB, unsigned &MSB,
+ unsigned NumberOfIgnoredLowBits,
+ bool BiggerPattern) {
+ assert(N->getOpcode() == ISD::AND &&
+ "N must be a AND operation to call this function");
+
+ EVT VT = N->getValueType(0);
+
+  // We could test the type of VT and return false when it does not match, but
+  // since that check is already done before this call in the current context,
+  // we turn it into an assert to avoid redundant code.
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "Type checking must have been done before calling this function");
+
+ // FIXME: simplify-demanded-bits in DAGCombine will probably have
+ // changed the AND node to a 32-bit mask operation. We'll have to
+ // undo that as part of the transform here if we want to catch all
+ // the opportunities.
+  // Currently the NumberOfIgnoredLowBits argument helps to recover
+  // from these situations when matching a bigger pattern (bitfield insert).
+
+ // For unsigned extracts, check for a shift right and mask
+ uint64_t And_imm = 0;
+ if (!isOpcWithIntImmediate(N, ISD::AND, And_imm))
+ return false;
+
+ const SDNode *Op0 = N->getOperand(0).getNode();
+
+ // Because of simplify-demanded-bits in DAGCombine, the mask may have been
+ // simplified. Try to undo that
+ And_imm |= (1 << NumberOfIgnoredLowBits) - 1;
+
+ // The immediate is a mask of the low bits iff imm & (imm+1) == 0
+ if (And_imm & (And_imm + 1))
+ return false;
+
+ bool ClampMSB = false;
+ uint64_t Srl_imm = 0;
+ // Handle the SRL + ANY_EXTEND case.
+ if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
+ isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) {
+ // Extend the incoming operand of the SRL to 64-bit.
+ Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
+ // Make sure to clamp the MSB so that we preserve the semantics of the
+ // original operations.
+ ClampMSB = true;
+ } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
+ Opd0 = Op0->getOperand(0);
+ } else if (BiggerPattern) {
+ // Let's pretend a 0 shift right has been performed.
+ // The resulting code will be at least as good as the original one
+ // plus it may expose more opportunities for bitfield insert pattern.
+ // FIXME: Currently we limit this to the bigger pattern, because
+ // some optimizations expect AND and not UBFM
+ Opd0 = N->getOperand(0);
+ } else
+ return false;
+
+ assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) &&
+ "bad amount in shift node!");
+
+ LSB = Srl_imm;
+ MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm)
+ : CountTrailingOnes_64(And_imm)) -
+ 1;
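+  // For illustration (i32, no ignored low bits): for (x >> 3) & 0xff we get
+  // Srl_imm == 3 and And_imm == 0xff (8 trailing ones), so LSB == 3 and
+  // MSB == 3 + 8 - 1 == 10, i.e. the node becomes UBFMWri x, 3, 10.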
+ if (ClampMSB)
+ // Since we're moving the extend before the right shift operation, we need
+ // to clamp the MSB to make sure we don't shift in undefined bits instead of
+ // the zeros which would get shifted in with the original right shift
+ // operation.
+ MSB = MSB > 31 ? 31 : MSB;
+
+ Opc = VT == MVT::i32 ? ARM64::UBFMWri : ARM64::UBFMXri;
+ return true;
+}
+
+static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
+ unsigned &LSB, unsigned &MSB) {
+  // We are looking for the following pattern, which basically extracts a single
+  // bit from the source value and places it in the LSB of the destination
+  // value; all other bits of the destination value are set to zero:
+ //
+ // Value2 = AND Value, MaskImm
+ // SRL Value2, ShiftImm
+ //
+ // with MaskImm >> ShiftImm == 1.
+ //
+ // This gets selected into a single UBFM:
+ //
+ // UBFM Value, ShiftImm, ShiftImm
+ //
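+  // For illustration: (x & 0x10) >> 4 has MaskImm == 0x10 and ShiftImm == 4,
+  // so MaskImm >> ShiftImm == 1 and the node becomes UBFM x, 4, 4, leaving
+  // bit 4 of x in the LSB of the result.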
+
+ if (N->getOpcode() != ISD::SRL)
+ return false;
+
+ uint64_t And_mask = 0;
+ if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask))
+ return false;
+
+ Opd0 = N->getOperand(0).getOperand(0);
+
+ uint64_t Srl_imm = 0;
+ if (!isIntImmediate(N->getOperand(1), Srl_imm))
+ return false;
+
+ // Check whether we really have a one bit extract here.
+ if (And_mask >> Srl_imm == 0x1) {
+ if (N->getValueType(0) == MVT::i32)
+ Opc = ARM64::UBFMWri;
+ else
+ Opc = ARM64::UBFMXri;
+
+ LSB = MSB = Srl_imm;
+
+ return true;
+ }
+
+ return false;
+}
+
+static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
+ unsigned &LSB, unsigned &MSB,
+ bool BiggerPattern) {
+ assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
+ "N must be a SHR/SRA operation to call this function");
+
+ EVT VT = N->getValueType(0);
+
+  // We could test the type of VT and return false when it does not match, but
+  // since that check is already done before this call in the current context,
+  // we turn it into an assert to avoid redundant code.
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "Type checking must have been done before calling this function");
+
+ // Check for AND + SRL doing a one bit extract.
+ if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
+ return true;
+
+ // we're looking for a shift of a shift
+ uint64_t Shl_imm = 0;
+ if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
+ Opd0 = N->getOperand(0).getOperand(0);
+ } else if (BiggerPattern) {
+ // Let's pretend a 0 shift left has been performed.
+ // FIXME: Currently we limit this to the bigger pattern case,
+ // because some optimizations expect AND and not UBFM
+ Opd0 = N->getOperand(0);
+ } else
+ return false;
+
+ assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!");
+ uint64_t Srl_imm = 0;
+ if (!isIntImmediate(N->getOperand(1), Srl_imm))
+ return false;
+
+ assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
+ "bad amount in shift node!");
+ // Note: The width operand is encoded as width-1.
+ unsigned Width = VT.getSizeInBits() - Srl_imm - 1;
+ int sLSB = Srl_imm - Shl_imm;
+ if (sLSB < 0)
+ return false;
+ LSB = sLSB;
+ MSB = LSB + Width;
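+  // For illustration (i32): for (x << 4) >> 8 we get Shl_imm == 4 and
+  // Srl_imm == 8, so Width == 32 - 8 - 1 == 23 (width minus one), LSB == 4
+  // and MSB == 27, i.e. bits 4..27 of x are extracted (UBFM for SRL, SBFM
+  // for SRA).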
+ // SRA requires a signed extraction
+ if (VT == MVT::i32)
+ Opc = N->getOpcode() == ISD::SRA ? ARM64::SBFMWri : ARM64::UBFMWri;
+ else
+ Opc = N->getOpcode() == ISD::SRA ? ARM64::SBFMXri : ARM64::UBFMXri;
+ return true;
+}
+
+static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
+ SDValue &Opd0, unsigned &LSB, unsigned &MSB,
+ unsigned NumberOfIgnoredLowBits = 0,
+ bool BiggerPattern = false) {
+ if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+ if (!N->isMachineOpcode())
+ return false;
+ break;
+ case ISD::AND:
+ return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB,
+ NumberOfIgnoredLowBits, BiggerPattern);
+ case ISD::SRL:
+ case ISD::SRA:
+ return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern);
+ }
+
+ unsigned NOpc = N->getMachineOpcode();
+ switch (NOpc) {
+ default:
+ return false;
+ case ARM64::SBFMWri:
+ case ARM64::UBFMWri:
+ case ARM64::SBFMXri:
+ case ARM64::UBFMXri:
+ Opc = NOpc;
+ Opd0 = N->getOperand(0);
+ LSB = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+ MSB = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+ return true;
+ }
+ // Unreachable
+ return false;
+}
+
+SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
+ unsigned Opc, LSB, MSB;
+ SDValue Opd0;
+ if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB))
+ return NULL;
+
+ EVT VT = N->getValueType(0);
+ SDValue Ops[] = { Opd0, CurDAG->getTargetConstant(LSB, VT),
+ CurDAG->getTargetConstant(MSB, VT) };
+ return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3);
+}
+
+// Checks whether Mask is an i32 or i64 binary sequence of the form 1..10..0
+// whose number of trailing zeros equals ExpectedTrailingZeros (the top
+// NumberOfIgnoredHighBits bits are ignored).
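+// E.g., for i32 with ExpectedTrailingZeros == 8 and no ignored high bits,
+// the only accepted Mask is 0xFFFFFF00.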
+static bool isHighMask(uint64_t Mask, unsigned ExpectedTrailingZeros,
+ unsigned NumberOfIgnoredHighBits, EVT VT) {
+ assert((VT == MVT::i32 || VT == MVT::i64) &&
+ "i32 or i64 mask type expected!");
+
+ uint64_t ExpectedMask;
+ if (VT == MVT::i32) {
+ uint32_t ExpectedMaski32 = ~0 << ExpectedTrailingZeros;
+ ExpectedMask = ExpectedMaski32;
+ if (NumberOfIgnoredHighBits) {
+ uint32_t highMask = ~0 << (32 - NumberOfIgnoredHighBits);
+ Mask |= highMask;
+ }
+ } else {
+ ExpectedMask = ((uint64_t) ~0) << ExpectedTrailingZeros;
+ if (NumberOfIgnoredHighBits)
+ Mask |= ((uint64_t) ~0) << (64 - NumberOfIgnoredHighBits);
+ }
+
+ return Mask == ExpectedMask;
+}
+
+// Look for bits that will be useful for later uses.
+// A bit is considered useless as soon as it is dropped and never used
+// before it has been dropped.
+// E.g., looking for the useful bits of x:
+// 1. y = x & 0x7
+// 2. z = y >> 2
+// After #1, the useful bits of x are 0x7; they live on through y.
+// After #2, the useful bits of x are 0x4.
+// However, if x is used by an unpredictable instruction, then all its bits
+// are useful.
+// E.g.
+// 1. y = x & 0x7
+// 2. z = y >> 2
+// 3. str x, [@x]
+static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
+
+static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
+ unsigned Depth) {
+ uint64_t Imm =
+ cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
+ Imm = ARM64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
+ UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
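+  // For illustration: if the user is an ANDWri of Op with the (decoded)
+  // logical immediate 0xff00, UsefulBits is now at most 0xff00; the recursive
+  // call below further intersects it with whatever the AND's own users need.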
+ getUsefulBits(Op, UsefulBits, Depth + 1);
+}
+
+static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
+ uint64_t Imm, uint64_t MSB,
+ unsigned Depth) {
+ // inherit the bitwidth value
+ APInt OpUsefulBits(UsefulBits);
+ OpUsefulBits = 1;
+
+ if (MSB >= Imm) {
+ OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
+ --OpUsefulBits;
+ // The interesting part will be in the lower part of the result
+ getUsefulBits(Op, OpUsefulBits, Depth + 1);
+ // The interesting part was starting at Imm in the argument
+ OpUsefulBits = OpUsefulBits.shl(Imm);
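+    // For illustration (i32): for a UBFM user with Imm == 8 and MSB == 15
+    // (an 8-bit extract), the low 8 result bits its users need map back to
+    // bits 8..15 of the argument.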
+ } else {
+ OpUsefulBits = OpUsefulBits.shl(MSB + 1);
+ --OpUsefulBits;
+ // The interesting part will be shifted in the result
+ OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm);
+ getUsefulBits(Op, OpUsefulBits, Depth + 1);
+ // The interesting part was at zero in the argument
+ OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm);
+ }
+
+ UsefulBits &= OpUsefulBits;
+}
+
+static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
+ unsigned Depth) {
+ uint64_t Imm =
+ cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
+ uint64_t MSB =
+ cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+
+ getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
+}
+
+static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
+ unsigned Depth) {
+ uint64_t ShiftTypeAndValue =
+ cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+ APInt Mask(UsefulBits);
+ Mask.clearAllBits();
+ Mask.flipAllBits();
+
+ if (ARM64_AM::getShiftType(ShiftTypeAndValue) == ARM64_AM::LSL) {
+ // Shift Left
+ uint64_t ShiftAmt = ARM64_AM::getShiftValue(ShiftTypeAndValue);
+ Mask = Mask.shl(ShiftAmt);
+ getUsefulBits(Op, Mask, Depth + 1);
+ Mask = Mask.lshr(ShiftAmt);
+ } else if (ARM64_AM::getShiftType(ShiftTypeAndValue) == ARM64_AM::LSR) {
+ // Shift Right
+ // We do not handle ARM64_AM::ASR, because the sign will change the
+ // number of useful bits
+ uint64_t ShiftAmt = ARM64_AM::getShiftValue(ShiftTypeAndValue);
+ Mask = Mask.lshr(ShiftAmt);
+ getUsefulBits(Op, Mask, Depth + 1);
+ Mask = Mask.shl(ShiftAmt);
+ } else
+ return;
+
+ UsefulBits &= Mask;
+}
+
+static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
+ unsigned Depth) {
+ uint64_t Imm =
+ cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+ uint64_t MSB =
+ cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
+
+ if (Op.getOperand(1) == Orig)
+ return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
+
+ APInt OpUsefulBits(UsefulBits);
+ OpUsefulBits = 1;
+
+ if (MSB >= Imm) {
+ OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
+ --OpUsefulBits;
+ UsefulBits &= ~OpUsefulBits;
+ getUsefulBits(Op, UsefulBits, Depth + 1);
+ } else {
+ OpUsefulBits = OpUsefulBits.shl(MSB + 1);
+ --OpUsefulBits;
+ UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm));
+ getUsefulBits(Op, UsefulBits, Depth + 1);
+ }
+}
+
+static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
+ SDValue Orig, unsigned Depth) {
+
+ // Users of this node should have already been instruction selected
+ // FIXME: Can we turn that into an assert?
+ if (!UserNode->isMachineOpcode())
+ return;
+
+ switch (UserNode->getMachineOpcode()) {
+ default:
+ return;
+ case ARM64::ANDSWri:
+ case ARM64::ANDSXri:
+ case ARM64::ANDWri:
+ case ARM64::ANDXri:
+    // We increment Depth only when we call getUsefulBits.
+ return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
+ Depth);
+ case ARM64::UBFMWri:
+ case ARM64::UBFMXri:
+ return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
+
+ case ARM64::ORRWrs:
+ case ARM64::ORRXrs:
+ if (UserNode->getOperand(1) != Orig)
+ return;
+ return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
+ Depth);
+ case ARM64::BFMWri:
+ case ARM64::BFMXri:
+ return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
+ }
+}
+
+static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
+ if (Depth >= 6)
+ return;
+ // Initialize UsefulBits
+ if (!Depth) {
+ unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits();
+    // At the beginning, assume every produced bit is useful.
+ UsefulBits = APInt(Bitwidth, 0);
+ UsefulBits.flipAllBits();
+ }
+ APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
+
+ for (SDNode::use_iterator UseIt = Op.getNode()->use_begin(),
+ UseEnd = Op.getNode()->use_end();
+ UseIt != UseEnd; ++UseIt) {
+ // A use cannot produce useful bits
+ APInt UsefulBitsForUse = APInt(UsefulBits);
+ getUsefulBitsForUse(*UseIt, UsefulBitsForUse, Op, Depth);
+ UsersUsefulBits |= UsefulBitsForUse;
+ }
+ // UsefulBits contains the produced bits that are meaningful for the
+ // current definition, thus a user cannot make a bit meaningful at
+ // this point
+ UsefulBits &= UsersUsefulBits;
+}
+
+// Given an OR operation, check whether we have the following pattern:
+// ubfm c, b, imm, imm2 (or something that does the same job, see
+// isBitfieldExtractOp)
+// d = e & mask2 ; where mask2 is a binary sequence of 1..10..0 and
+// countTrailingZeros(mask2) == imm2 - imm + 1
+// f = d | c
+// If yes, the given reference arguments will be updated so that one can
+// replace the OR instruction with:
+// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
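+// For illustration (i32): with c = ubfm b, 16, 23 (bits 16..23 of b moved to
+// bits 0..7), d = e & 0xFFFFFF00 and f = d | c, the whole OR can become
+// f = BFMWri e, b, 16, 23, i.e. a BFXIL that inserts bits 16..23 of b into
+// the low 8 bits of e.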
+static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0,
+ SDValue &Opd1, unsigned &LSB,
+ unsigned &MSB, SelectionDAG *CurDAG) {
+ assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
+
+ // Set Opc
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::i32)
+ Opc = ARM64::BFMWri;
+ else if (VT == MVT::i64)
+ Opc = ARM64::BFMXri;
+ else
+ return false;
+
+ // Because of simplify-demanded-bits in DAGCombine, involved masks may not
+ // have the expected shape. Try to undo that.
+ APInt UsefulBits;
+ getUsefulBits(SDValue(N, 0), UsefulBits);
+
+ unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
+ unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
+
+  // OR is commutative, so check both possibilities (does llvm provide a
+  // way to do that directly, e.g., via a code matcher?)
+ SDValue OrOpd1Val = N->getOperand(1);
+ SDNode *OrOpd0 = N->getOperand(0).getNode();
+ SDNode *OrOpd1 = N->getOperand(1).getNode();
+ for (int i = 0; i < 2;
+ ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
+ unsigned BFXOpc;
+ // Set Opd1, LSB and MSB arguments by looking for
+ // c = ubfm b, imm, imm2
+ if (!isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Opd1, LSB, MSB,
+ NumberOfIgnoredLowBits, true))
+ continue;
+
+ // Check that the returned opcode is compatible with the pattern,
+ // i.e., same type and zero extended (U and not S)
+ if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) ||
+ (BFXOpc != ARM64::UBFMWri && VT == MVT::i32))
+ continue;
+
+ // Compute the width of the bitfield insertion
+ int sMSB = MSB - LSB + 1;
+    // FIXME: This constraint is to catch bitfield insertion; we may
+    // want to widen the pattern if we want to handle the general
+    // bitfield move case.
+ if (sMSB <= 0)
+ continue;
+
+ // Check the second part of the pattern
+ EVT VT = OrOpd1->getValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ continue;
+
+    // Compute the known zero bits of the candidate destination operand.
+    // This allows us to catch more general cases than just looking for an
+    // AND with an immediate. Indeed, simplify-demanded-bits may have removed
+    // the AND instruction because it proved it was useless.
+ APInt KnownZero, KnownOne;
+ CurDAG->ComputeMaskedBits(OrOpd1Val, KnownZero, KnownOne);
+
+ // Check if there is enough room for the second operand to appear
+ // in the first one
+ if (KnownZero.countTrailingOnes() < (unsigned)sMSB)
+ continue;
+
+ // Set the first operand
+ uint64_t Imm;
+ if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
+ isHighMask(Imm, sMSB, NumberOfIgnoredHighBits, VT))
+ // In that case, we can eliminate the AND
+ Opd0 = OrOpd1->getOperand(0);
+ else
+ // Maybe the AND has been removed by simplify-demanded-bits
+ // or is useful because it discards more bits
+ Opd0 = OrOpd1Val;
+
+ // both parts match
+ return true;
+ }
+
+ return false;
+}
+
+SDNode *ARM64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
+ if (N->getOpcode() != ISD::OR)
+ return NULL;
+
+ unsigned Opc;
+ unsigned LSB, MSB;
+ SDValue Opd0, Opd1;
+
+ if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG))
+ return NULL;
+
+ EVT VT = N->getValueType(0);
+ SDValue Ops[] = { Opd0,
+ Opd1,
+ CurDAG->getTargetConstant(LSB, VT),
+ CurDAG->getTargetConstant(MSB, VT) };
+ return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 4);
+}
+
+SDNode *ARM64DAGToDAGISel::SelectLIBM(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ unsigned Variant;
+ unsigned Opc;
+ unsigned FRINTXOpcs[] = { ARM64::FRINTXSr, ARM64::FRINTXDr };
+
+ if (VT == MVT::f32) {
+ Variant = 0;
+ } else if (VT == MVT::f64) {
+ Variant = 1;
+ } else
+ return 0; // Unrecognized argument type. Fall back on default codegen.
+
+ // Pick the FRINTX variant needed to set the flags.
+ unsigned FRINTXOpc = FRINTXOpcs[Variant];
+
+ switch (N->getOpcode()) {
+ default:
+ return 0; // Unrecognized libm ISD node. Fall back on default codegen.
+ case ISD::FCEIL: {
+ unsigned FRINTPOpcs[] = { ARM64::FRINTPSr, ARM64::FRINTPDr };
+ Opc = FRINTPOpcs[Variant];
+ break;
+ }
+ case ISD::FFLOOR: {
+ unsigned FRINTMOpcs[] = { ARM64::FRINTMSr, ARM64::FRINTMDr };
+ Opc = FRINTMOpcs[Variant];
+ break;
+ }
+ case ISD::FTRUNC: {
+ unsigned FRINTZOpcs[] = { ARM64::FRINTZSr, ARM64::FRINTZDr };
+ Opc = FRINTZOpcs[Variant];
+ break;
+ }
+ case ISD::FROUND: {
+ unsigned FRINTAOpcs[] = { ARM64::FRINTASr, ARM64::FRINTADr };
+ Opc = FRINTAOpcs[Variant];
+ break;
+ }
+ }
+
+ SDLoc dl(N);
+ SDValue In = N->getOperand(0);
+ SmallVector<SDValue, 2> Ops;
+ Ops.push_back(In);
+
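+  // Unless unsafe FP math is enabled, also emit an FRINTX (the rounding
+  // variant that signals the Inexact exception) and glue its flag result
+  // into the rounding instruction below so that the flag-setting side
+  // effect is preserved.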
+ if (!TM.Options.UnsafeFPMath) {
+ SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In);
+ Ops.push_back(SDValue(FRINTX, 1));
+ }
+
+ return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+}
+
+SDNode *ARM64DAGToDAGISel::Select(SDNode *Node) {
+ // Dump information about the Node being selected
+ DEBUG(errs() << "Selecting: ");
+ DEBUG(Node->dump(CurDAG));
+ DEBUG(errs() << "\n");
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ Node->setNodeId(-1);
+ return NULL;
+ }
+
+  // A few cases need custom selection.
+ SDNode *ResNode = 0;
+ EVT VT = Node->getValueType(0);
+
+ switch (Node->getOpcode()) {
+ default:
+ break;
+
+ case ISD::ADD:
+ if (SDNode *I = SelectMLAV64LaneV128(Node))
+ return I;
+ break;
+
+ case ISD::ATOMIC_LOAD_ADD:
+ return SelectAtomic(Node, ARM64::ATOMIC_LOAD_ADD_I8,
+ ARM64::ATOMIC_LOAD_ADD_I16, ARM64::ATOMIC_LOAD_ADD_I32,
+ ARM64::ATOMIC_LOAD_ADD_I64);
+ case ISD::ATOMIC_LOAD_SUB:
+ return SelectAtomic(Node, ARM64::ATOMIC_LOAD_SUB_I8,
+ ARM64::ATOMIC_LOAD_SUB_I16, ARM64::ATOMIC_LOAD_SUB_I32,
+ ARM64::ATOMIC_LOAD_SUB_I64);
+ case ISD::ATOMIC_LOAD_AND:
+ return SelectAtomic(Node, ARM64::ATOMIC_LOAD_AND_I8,
+ ARM64::ATOMIC_LOAD_AND_I16, ARM64::ATOMIC_LOAD_AND_I32,
+ ARM64::ATOMIC_LOAD_AND_I64);
+ case ISD::ATOMIC_LOAD_OR:
+ return SelectAtomic(Node, ARM64::ATOMIC_LOAD_OR_I8,
+ ARM64::ATOMIC_LOAD_OR_I16, ARM64::ATOMIC_LOAD_OR_I32,
+ ARM64::ATOMIC_LOAD_OR_I64);
+ case ISD::ATOMIC_LOAD_XOR:
+ return SelectAtomic(Node, ARM64::ATOMIC_LOAD_XOR_I8,
+ ARM64::ATOMIC_LOAD_XOR_I16, ARM64::ATOMIC_LOAD_XOR_I32,
+ ARM64::ATOMIC_LOAD_XOR_I64);
+ case ISD::ATOMIC_LOAD_NAND:
+ return SelectAtomic(
+ Node, ARM64::ATOMIC_LOAD_NAND_I8, ARM64::ATOMIC_LOAD_NAND_I16,
+ ARM64::ATOMIC_LOAD_NAND_I32, ARM64::ATOMIC_LOAD_NAND_I64);
+ case ISD::ATOMIC_LOAD_MIN:
+ return SelectAtomic(Node, ARM64::ATOMIC_LOAD_MIN_I8,
+ ARM64::ATOMIC_LOAD_MIN_I16, ARM64::ATOMIC_LOAD_MIN_I32,
+ ARM64::ATOMIC_LOAD_MIN_I64);
+ case ISD::ATOMIC_LOAD_MAX:
+ return SelectAtomic(Node, ARM64::ATOMIC_LOAD_MAX_I8,
+ ARM64::ATOMIC_LOAD_MAX_I16, ARM64::ATOMIC_LOAD_MAX_I32,
+ ARM64::ATOMIC_LOAD_MAX_I64);
+ case ISD::ATOMIC_LOAD_UMIN:
+ return SelectAtomic(
+ Node, ARM64::ATOMIC_LOAD_UMIN_I8, ARM64::ATOMIC_LOAD_UMIN_I16,
+ ARM64::ATOMIC_LOAD_UMIN_I32, ARM64::ATOMIC_LOAD_UMIN_I64);
+ case ISD::ATOMIC_LOAD_UMAX:
+ return SelectAtomic(
+ Node, ARM64::ATOMIC_LOAD_UMAX_I8, ARM64::ATOMIC_LOAD_UMAX_I16,
+ ARM64::ATOMIC_LOAD_UMAX_I32, ARM64::ATOMIC_LOAD_UMAX_I64);
+ case ISD::ATOMIC_SWAP:
+ return SelectAtomic(Node, ARM64::ATOMIC_SWAP_I8, ARM64::ATOMIC_SWAP_I16,
+ ARM64::ATOMIC_SWAP_I32, ARM64::ATOMIC_SWAP_I64);
+ case ISD::ATOMIC_CMP_SWAP:
+ return SelectAtomic(Node, ARM64::ATOMIC_CMP_SWAP_I8,
+ ARM64::ATOMIC_CMP_SWAP_I16, ARM64::ATOMIC_CMP_SWAP_I32,
+ ARM64::ATOMIC_CMP_SWAP_I64);
+
+ case ISD::LOAD: {
+ // Try to select as an indexed load. Fall through to normal processing
+ // if we can't.
+ bool Done = false;
+ SDNode *I = SelectIndexedLoad(Node, Done);
+ if (Done)
+ return I;
+ break;
+ }
+
+ case ISD::SRL:
+ case ISD::AND:
+ case ISD::SRA:
+ if (SDNode *I = SelectBitfieldExtractOp(Node))
+ return I;
+ break;
+
+ case ISD::OR:
+ if (SDNode *I = SelectBitfieldInsertOp(Node))
+ return I;
+ break;
+
+ case ISD::EXTRACT_VECTOR_ELT: {
+ // Extracting lane zero is a special case where we can just use a plain
+ // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
+    // the rest of the compiler, especially the register allocator and copy
+    // propagation, to reason about, so is preferred when it's possible to
+ // use it.
+ ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
+ // Bail and use the default Select() for non-zero lanes.
+ if (LaneNode->getZExtValue() != 0)
+ break;
+ // If the element type is not the same as the result type, likewise
+ // bail and use the default Select(), as there's more to do than just
+ // a cross-class COPY. This catches extracts of i8 and i16 elements
+ // since they will need an explicit zext.
+ if (VT != Node->getOperand(0).getValueType().getVectorElementType())
+ break;
+ unsigned SubReg;
+ switch (Node->getOperand(0)
+ .getValueType()
+ .getVectorElementType()
+ .getSizeInBits()) {
+ default:
+ assert(0 && "Unexpected vector element type!");
+ case 64:
+ SubReg = ARM64::dsub;
+ break;
+ case 32:
+ SubReg = ARM64::ssub;
+ break;
+ case 16: // FALLTHROUGH
+ case 8:
+ llvm_unreachable("unexpected zext-requiring extract element!");
+ }
+ SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
+ Node->getOperand(0));
+ DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
+ DEBUG(Extract->dumpr(CurDAG));
+ DEBUG(dbgs() << "\n");
+ return Extract.getNode();
+ }
+ case ISD::Constant: {
+ // Materialize zero constants as copies from WZR/XZR. This allows
+ // the coalescer to propagate these into other instructions.
+ ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
+ if (ConstNode->isNullValue()) {
+ if (VT == MVT::i32)
+ return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+ ARM64::WZR, MVT::i32).getNode();
+ else if (VT == MVT::i64)
+ return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+ ARM64::XZR, MVT::i64).getNode();
+ }
+ break;
+ }
+
+ case ISD::FrameIndex: {
+ // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+ unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0);
+ const TargetLowering *TLI = getTargetLowering();
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+ SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+ CurDAG->getTargetConstant(Shifter, MVT::i32) };
+ return CurDAG->SelectNodeTo(Node, ARM64::ADDXri, MVT::i64, Ops, 3);
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::arm64_ldxp: {
+ SDValue MemAddr = Node->getOperand(2);
+ SDLoc DL(Node);
+ SDValue Chain = Node->getOperand(0);
+
+ SDNode *Ld = CurDAG->getMachineNode(ARM64::LDXPX, DL, MVT::i64, MVT::i64,
+ MVT::Other, MemAddr, Chain);
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+ cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+ return Ld;
+ }
+ case Intrinsic::arm64_stxp: {
+ SDLoc DL(Node);
+ SDValue Chain = Node->getOperand(0);
+ SDValue ValLo = Node->getOperand(2);
+ SDValue ValHi = Node->getOperand(3);
+ SDValue MemAddr = Node->getOperand(4);
+
+ // Place arguments in the right order.
+ SmallVector<SDValue, 7> Ops;
+ Ops.push_back(ValLo);
+ Ops.push_back(ValHi);
+ Ops.push_back(MemAddr);
+ Ops.push_back(Chain);
+
+ SDNode *St =
+ CurDAG->getMachineNode(ARM64::STXPX, DL, MVT::i32, MVT::Other, Ops);
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+ cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+ return St;
+ }
+ case Intrinsic::arm64_neon_ld1x2:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 2, ARM64::LD1Twov8b, ARM64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 2, ARM64::LD1Twov16b, ARM64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 2, ARM64::LD1Twov4h, ARM64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 2, ARM64::LD1Twov8h, ARM64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 2, ARM64::LD1Twov2s, ARM64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 2, ARM64::LD1Twov4s, ARM64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 2, ARM64::LD1Twov1d, ARM64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 2, ARM64::LD1Twov2d, ARM64::qsub0);
+ break;
+ case Intrinsic::arm64_neon_ld1x3:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 3, ARM64::LD1Threev8b, ARM64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 3, ARM64::LD1Threev16b, ARM64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 3, ARM64::LD1Threev4h, ARM64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 3, ARM64::LD1Threev8h, ARM64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 3, ARM64::LD1Threev2s, ARM64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 3, ARM64::LD1Threev4s, ARM64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 3, ARM64::LD1Threev1d, ARM64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 3, ARM64::LD1Threev2d, ARM64::qsub0);
+ break;
+ case Intrinsic::arm64_neon_ld1x4:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 4, ARM64::LD1Fourv8b, ARM64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 4, ARM64::LD1Fourv16b, ARM64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 4, ARM64::LD1Fourv4h, ARM64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 4, ARM64::LD1Fourv8h, ARM64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 4, ARM64::LD1Fourv2s, ARM64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 4, ARM64::LD1Fourv4s, ARM64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 4, ARM64::LD1Fourv1d, ARM64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 4, ARM64::LD1Fourv2d, ARM64::qsub0);
+ break;
+ case Intrinsic::arm64_neon_ld2:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 2, ARM64::LD2Twov8b, ARM64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 2, ARM64::LD2Twov16b, ARM64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 2, ARM64::LD2Twov4h, ARM64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 2, ARM64::LD2Twov8h, ARM64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 2, ARM64::LD2Twov2s, ARM64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 2, ARM64::LD2Twov4s, ARM64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 2, ARM64::LD1Twov1d, ARM64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 2, ARM64::LD2Twov2d, ARM64::qsub0);
+ break;
+ case Intrinsic::arm64_neon_ld3:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 3, ARM64::LD3Threev8b, ARM64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 3, ARM64::LD3Threev16b, ARM64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 3, ARM64::LD3Threev4h, ARM64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 3, ARM64::LD3Threev8h, ARM64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 3, ARM64::LD3Threev2s, ARM64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 3, ARM64::LD3Threev4s, ARM64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 3, ARM64::LD1Threev1d, ARM64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 3, ARM64::LD3Threev2d, ARM64::qsub0);
+ break;
+ case Intrinsic::arm64_neon_ld4:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 4, ARM64::LD4Fourv8b, ARM64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 4, ARM64::LD4Fourv16b, ARM64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 4, ARM64::LD4Fourv4h, ARM64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 4, ARM64::LD4Fourv8h, ARM64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 4, ARM64::LD4Fourv2s, ARM64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 4, ARM64::LD4Fourv4s, ARM64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 4, ARM64::LD1Fourv1d, ARM64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 4, ARM64::LD4Fourv2d, ARM64::qsub0);
+ break;
+ case Intrinsic::arm64_neon_ld2r:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 2, ARM64::LD2Rv8b, ARM64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 2, ARM64::LD2Rv16b, ARM64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 2, ARM64::LD2Rv4h, ARM64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 2, ARM64::LD2Rv8h, ARM64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 2, ARM64::LD2Rv2s, ARM64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 2, ARM64::LD2Rv4s, ARM64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 2, ARM64::LD2Rv1d, ARM64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 2, ARM64::LD2Rv2d, ARM64::qsub0);
+ break;
+ case Intrinsic::arm64_neon_ld3r:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 3, ARM64::LD3Rv8b, ARM64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 3, ARM64::LD3Rv16b, ARM64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 3, ARM64::LD3Rv4h, ARM64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 3, ARM64::LD3Rv8h, ARM64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 3, ARM64::LD3Rv2s, ARM64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 3, ARM64::LD3Rv4s, ARM64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 3, ARM64::LD3Rv1d, ARM64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 3, ARM64::LD3Rv2d, ARM64::qsub0);
+ break;
+ case Intrinsic::arm64_neon_ld4r:
+ if (VT == MVT::v8i8)
+ return SelectLoad(Node, 4, ARM64::LD4Rv8b, ARM64::dsub0);
+ else if (VT == MVT::v16i8)
+ return SelectLoad(Node, 4, ARM64::LD4Rv16b, ARM64::qsub0);
+ else if (VT == MVT::v4i16)
+ return SelectLoad(Node, 4, ARM64::LD4Rv4h, ARM64::dsub0);
+ else if (VT == MVT::v8i16)
+ return SelectLoad(Node, 4, ARM64::LD4Rv8h, ARM64::qsub0);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectLoad(Node, 4, ARM64::LD4Rv2s, ARM64::dsub0);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectLoad(Node, 4, ARM64::LD4Rv4s, ARM64::qsub0);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectLoad(Node, 4, ARM64::LD4Rv1d, ARM64::dsub0);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectLoad(Node, 4, ARM64::LD4Rv2d, ARM64::qsub0);
+ break;
+ case Intrinsic::arm64_neon_ld2lane:
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectLoadLane(Node, 2, ARM64::LD2i8);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectLoadLane(Node, 2, ARM64::LD2i16);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectLoadLane(Node, 2, ARM64::LD2i32);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectLoadLane(Node, 2, ARM64::LD2i64);
+ break;
+ case Intrinsic::arm64_neon_ld3lane:
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectLoadLane(Node, 3, ARM64::LD3i8);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectLoadLane(Node, 3, ARM64::LD3i16);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectLoadLane(Node, 3, ARM64::LD3i32);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectLoadLane(Node, 3, ARM64::LD3i64);
+ break;
+ case Intrinsic::arm64_neon_ld4lane:
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectLoadLane(Node, 4, ARM64::LD4i8);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectLoadLane(Node, 4, ARM64::LD4i16);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectLoadLane(Node, 4, ARM64::LD4i32);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectLoadLane(Node, 4, ARM64::LD4i64);
+ break;
+ }
+ } break;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::arm64_neon_tbl2:
+ return SelectTable(Node, 2, VT == MVT::v8i8 ? ARM64::TBLv8i8Two
+ : ARM64::TBLv16i8Two,
+ false);
+ case Intrinsic::arm64_neon_tbl3:
+ return SelectTable(Node, 3, VT == MVT::v8i8 ? ARM64::TBLv8i8Three
+ : ARM64::TBLv16i8Three,
+ false);
+ case Intrinsic::arm64_neon_tbl4:
+ return SelectTable(Node, 4, VT == MVT::v8i8 ? ARM64::TBLv8i8Four
+ : ARM64::TBLv16i8Four,
+ false);
+ case Intrinsic::arm64_neon_tbx2:
+ return SelectTable(Node, 2, VT == MVT::v8i8 ? ARM64::TBXv8i8Two
+ : ARM64::TBXv16i8Two,
+ true);
+ case Intrinsic::arm64_neon_tbx3:
+ return SelectTable(Node, 3, VT == MVT::v8i8 ? ARM64::TBXv8i8Three
+ : ARM64::TBXv16i8Three,
+ true);
+ case Intrinsic::arm64_neon_tbx4:
+ return SelectTable(Node, 4, VT == MVT::v8i8 ? ARM64::TBXv8i8Four
+ : ARM64::TBXv16i8Four,
+ true);
+ case Intrinsic::arm64_neon_smull:
+ case Intrinsic::arm64_neon_umull:
+ if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node))
+ return N;
+ break;
+ }
+ break;
+ }
+ case ISD::INTRINSIC_VOID: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ if (Node->getNumOperands() >= 3)
+ VT = Node->getOperand(2)->getValueType(0);
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::arm64_neon_st1x2: {
+ if (VT == MVT::v8i8)
+ return SelectStore(Node, 2, ARM64::ST1Twov8b);
+ else if (VT == MVT::v16i8)
+ return SelectStore(Node, 2, ARM64::ST1Twov16b);
+ else if (VT == MVT::v4i16)
+ return SelectStore(Node, 2, ARM64::ST1Twov4h);
+ else if (VT == MVT::v8i16)
+ return SelectStore(Node, 2, ARM64::ST1Twov8h);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectStore(Node, 2, ARM64::ST1Twov2s);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectStore(Node, 2, ARM64::ST1Twov4s);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectStore(Node, 2, ARM64::ST1Twov2d);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectStore(Node, 2, ARM64::ST1Twov1d);
+ break;
+ }
+ case Intrinsic::arm64_neon_st1x3: {
+ if (VT == MVT::v8i8)
+ return SelectStore(Node, 3, ARM64::ST1Threev8b);
+ else if (VT == MVT::v16i8)
+ return SelectStore(Node, 3, ARM64::ST1Threev16b);
+ else if (VT == MVT::v4i16)
+ return SelectStore(Node, 3, ARM64::ST1Threev4h);
+ else if (VT == MVT::v8i16)
+ return SelectStore(Node, 3, ARM64::ST1Threev8h);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectStore(Node, 3, ARM64::ST1Threev2s);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectStore(Node, 3, ARM64::ST1Threev4s);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectStore(Node, 3, ARM64::ST1Threev2d);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectStore(Node, 3, ARM64::ST1Threev1d);
+ break;
+ }
+ case Intrinsic::arm64_neon_st1x4: {
+ if (VT == MVT::v8i8)
+ return SelectStore(Node, 4, ARM64::ST1Fourv8b);
+ else if (VT == MVT::v16i8)
+ return SelectStore(Node, 4, ARM64::ST1Fourv16b);
+ else if (VT == MVT::v4i16)
+ return SelectStore(Node, 4, ARM64::ST1Fourv4h);
+ else if (VT == MVT::v8i16)
+ return SelectStore(Node, 4, ARM64::ST1Fourv8h);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectStore(Node, 4, ARM64::ST1Fourv2s);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectStore(Node, 4, ARM64::ST1Fourv4s);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectStore(Node, 4, ARM64::ST1Fourv2d);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectStore(Node, 4, ARM64::ST1Fourv1d);
+ break;
+ }
+ case Intrinsic::arm64_neon_st2: {
+ if (VT == MVT::v8i8)
+ return SelectStore(Node, 2, ARM64::ST2Twov8b);
+ else if (VT == MVT::v16i8)
+ return SelectStore(Node, 2, ARM64::ST2Twov16b);
+ else if (VT == MVT::v4i16)
+ return SelectStore(Node, 2, ARM64::ST2Twov4h);
+ else if (VT == MVT::v8i16)
+ return SelectStore(Node, 2, ARM64::ST2Twov8h);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectStore(Node, 2, ARM64::ST2Twov2s);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectStore(Node, 2, ARM64::ST2Twov4s);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectStore(Node, 2, ARM64::ST2Twov2d);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectStore(Node, 2, ARM64::ST1Twov1d);
+ break;
+ }
+ case Intrinsic::arm64_neon_st3: {
+ if (VT == MVT::v8i8)
+ return SelectStore(Node, 3, ARM64::ST3Threev8b);
+ else if (VT == MVT::v16i8)
+ return SelectStore(Node, 3, ARM64::ST3Threev16b);
+ else if (VT == MVT::v4i16)
+ return SelectStore(Node, 3, ARM64::ST3Threev4h);
+ else if (VT == MVT::v8i16)
+ return SelectStore(Node, 3, ARM64::ST3Threev8h);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectStore(Node, 3, ARM64::ST3Threev2s);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectStore(Node, 3, ARM64::ST3Threev4s);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectStore(Node, 3, ARM64::ST3Threev2d);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectStore(Node, 3, ARM64::ST1Threev1d);
+ break;
+ }
+ case Intrinsic::arm64_neon_st4: {
+ if (VT == MVT::v8i8)
+ return SelectStore(Node, 4, ARM64::ST4Fourv8b);
+ else if (VT == MVT::v16i8)
+ return SelectStore(Node, 4, ARM64::ST4Fourv16b);
+ else if (VT == MVT::v4i16)
+ return SelectStore(Node, 4, ARM64::ST4Fourv4h);
+ else if (VT == MVT::v8i16)
+ return SelectStore(Node, 4, ARM64::ST4Fourv8h);
+ else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+ return SelectStore(Node, 4, ARM64::ST4Fourv2s);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return SelectStore(Node, 4, ARM64::ST4Fourv4s);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return SelectStore(Node, 4, ARM64::ST4Fourv2d);
+ else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+ return SelectStore(Node, 4, ARM64::ST1Fourv1d);
+ break;
+ }
+ case Intrinsic::arm64_neon_st2lane: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectStoreLane(Node, 2, ARM64::ST2i8);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectStoreLane(Node, 2, ARM64::ST2i16);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectStoreLane(Node, 2, ARM64::ST2i32);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectStoreLane(Node, 2, ARM64::ST2i64);
+ break;
+ }
+ case Intrinsic::arm64_neon_st3lane: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectStoreLane(Node, 3, ARM64::ST3i8);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectStoreLane(Node, 3, ARM64::ST3i16);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectStoreLane(Node, 3, ARM64::ST3i32);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectStoreLane(Node, 3, ARM64::ST3i64);
+ break;
+ }
+ case Intrinsic::arm64_neon_st4lane: {
+ if (VT == MVT::v16i8 || VT == MVT::v8i8)
+ return SelectStoreLane(Node, 4, ARM64::ST4i8);
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ return SelectStoreLane(Node, 4, ARM64::ST4i16);
+ else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+ VT == MVT::v2f32)
+ return SelectStoreLane(Node, 4, ARM64::ST4i32);
+ else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+ VT == MVT::v1f64)
+ return SelectStoreLane(Node, 4, ARM64::ST4i64);
+ break;
+ }
+ }
+  } break;
+
+ case ISD::FCEIL:
+ case ISD::FFLOOR:
+ case ISD::FTRUNC:
+ case ISD::FROUND:
+ if (SDNode *I = SelectLIBM(Node))
+ return I;
+ break;
+ }
+
+ // Select the default instruction
+ ResNode = SelectCode(Node);
+
+ DEBUG(errs() << "=> ");
+ if (ResNode == NULL || ResNode == Node)
+ DEBUG(Node->dump(CurDAG));
+ else
+ DEBUG(ResNode->dump(CurDAG));
+ DEBUG(errs() << "\n");
+
+ return ResNode;
+}
+
+/// createARM64ISelDag - This pass converts a legalized DAG into a
+/// ARM64-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createARM64ISelDag(ARM64TargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new ARM64DAGToDAGISel(TM, OptLevel);
+}
diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp
new file mode 100644
index 0000000..641f591
--- /dev/null
+++ b/lib/Target/ARM64/ARM64ISelLowering.cpp
@@ -0,0 +1,7551 @@
+//===-- ARM64ISelLowering.cpp - ARM64 DAG Lowering Implementation --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM64TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm64-lower"
+
+#include "ARM64ISelLowering.h"
+#include "ARM64PerfectShuffle.h"
+#include "ARM64Subtarget.h"
+#include "ARM64CallingConv.h"
+#include "ARM64MachineFunctionInfo.h"
+#include "ARM64TargetMachine.h"
+#include "ARM64TargetObjectFile.h"
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+
+// This option should go away when tail calls fully work.
+static cl::opt<bool>
+EnableARM64TailCalls("arm64-tail-calls", cl::Hidden,
+ cl::desc("Generate ARM64 tail calls (TEMPORARY OPTION)."),
+ cl::init(true));
+
+static cl::opt<bool>
+StrictAlign("arm64-strict-align", cl::Hidden,
+ cl::desc("Disallow all unaligned memory accesses"));
+
+// Placeholder until extr generation is fully tested.
+static cl::opt<bool>
+EnableARM64ExtrGeneration("arm64-extr-generation", cl::Hidden,
+ cl::desc("Allow ARM64 (or (shift)(shift))->extract"),
+ cl::init(true));
+
+static cl::opt<bool>
+EnableARM64SlrGeneration("arm64-shift-insert-generation", cl::Hidden,
+ cl::desc("Allow ARM64 SLI/SRI formation"),
+ cl::init(false));
+
+//===----------------------------------------------------------------------===//
+// ARM64 Lowering public interface.
+//===----------------------------------------------------------------------===//
+static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
+ if (TM.getSubtarget<ARM64Subtarget>().isTargetDarwin())
+ return new ARM64_MachoTargetObjectFile();
+
+ return new ARM64_ELFTargetObjectFile();
+}
+
+ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM)
+ : TargetLowering(TM, createTLOF(TM)) {
+ Subtarget = &TM.getSubtarget<ARM64Subtarget>();
+
+ // ARM64 doesn't have comparisons which set GPRs or setcc instructions, so
+ // we have to make something up. Arbitrarily, choose ZeroOrOne.
+ setBooleanContents(ZeroOrOneBooleanContent);
+  // When comparing vectors, the result sets each element of the vector to
+  // all-ones or all-zeros.
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &ARM64::GPR32allRegClass);
+ addRegisterClass(MVT::i64, &ARM64::GPR64allRegClass);
+ addRegisterClass(MVT::f32, &ARM64::FPR32RegClass);
+ addRegisterClass(MVT::f64, &ARM64::FPR64RegClass);
+ addRegisterClass(MVT::f128, &ARM64::FPR128RegClass);
+ addRegisterClass(MVT::v16i8, &ARM64::FPR8RegClass);
+ addRegisterClass(MVT::v8i16, &ARM64::FPR16RegClass);
+
+ // Someone set us up the NEON.
+ addDRTypeForNEON(MVT::v2f32);
+ addDRTypeForNEON(MVT::v8i8);
+ addDRTypeForNEON(MVT::v4i16);
+ addDRTypeForNEON(MVT::v2i32);
+ addDRTypeForNEON(MVT::v1i64);
+ addDRTypeForNEON(MVT::v1f64);
+
+ addQRTypeForNEON(MVT::v4f32);
+ addQRTypeForNEON(MVT::v2f64);
+ addQRTypeForNEON(MVT::v16i8);
+ addQRTypeForNEON(MVT::v8i16);
+ addQRTypeForNEON(MVT::v4i32);
+ addQRTypeForNEON(MVT::v2i64);
+
+ // Compute derived properties from the register classes
+ computeRegisterProperties();
+
+ // Provide all sorts of operation actions
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+ setOperationAction(ISD::SETCC, MVT::i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::i64, Custom);
+ setOperationAction(ISD::SETCC, MVT::f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::f64, Custom);
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::f64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+
+ setOperationAction(ISD::FREM, MVT::f32, Expand);
+ setOperationAction(ISD::FREM, MVT::f64, Expand);
+ setOperationAction(ISD::FREM, MVT::f80, Expand);
+
+ // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
+ // silliness like this:
+ setOperationAction(ISD::FABS, MVT::v1f64, Expand);
+ setOperationAction(ISD::FADD, MVT::v1f64, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
+ setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
+ setOperationAction(ISD::FMA, MVT::v1f64, Expand);
+ setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
+ setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
+ setOperationAction(ISD::FREM, MVT::v1f64, Expand);
+ setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
+ setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
+ setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
+
+ // Custom lowering hooks are needed for XOR
+ // to fold it into CSINC/CSINV.
+ setOperationAction(ISD::XOR, MVT::i32, Custom);
+ setOperationAction(ISD::XOR, MVT::i64, Custom);
+
+  // Virtually no operations on f128 are legal, but LLVM can't expand them when
+  // there's a valid register class, so we need custom operations in most cases.
+ setOperationAction(ISD::FABS, MVT::f128, Expand);
+ setOperationAction(ISD::FADD, MVT::f128, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+ setOperationAction(ISD::FCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FDIV, MVT::f128, Custom);
+ setOperationAction(ISD::FMA, MVT::f128, Expand);
+ setOperationAction(ISD::FMUL, MVT::f128, Custom);
+ setOperationAction(ISD::FNEG, MVT::f128, Expand);
+ setOperationAction(ISD::FPOW, MVT::f128, Expand);
+ setOperationAction(ISD::FREM, MVT::f128, Expand);
+ setOperationAction(ISD::FRINT, MVT::f128, Expand);
+ setOperationAction(ISD::FSIN, MVT::f128, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f128, Expand);
+ setOperationAction(ISD::FSUB, MVT::f128, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
+ setOperationAction(ISD::SETCC, MVT::f128, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f128, Custom);
+ setOperationAction(ISD::SELECT, MVT::f128, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+
+ // Lowering for many of the conversions is actually specified by the non-f128
+ // type. The LowerXXX function will be trivial when f128 isn't involved.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+
+ // 128-bit atomics
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i128, Custom);
+ // These are surprisingly difficult. The only single-copy atomic 128-bit
+ // instruction on AArch64 is stxp (when it succeeds). So a store can safely
+ // become a simple swap, but a load can only be determined to have been atomic
+ // if storing the same value back succeeds.
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Expand);
+
+ // Variable arguments.
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::Other, Custom);
+ setOperationAction(ISD::VACOPY, MVT::Other, Custom);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+ // Variable-sized objects.
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+ // Exception handling.
+ // FIXME: These are guesses. Has this been defined yet?
+ setExceptionPointerRegister(ARM64::X0);
+ setExceptionSelectorRegister(ARM64::X1);
+
+ // Constant pool entries
+ setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
+
+ // BlockAddress
+ setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
+
+  // Add/Sub overflow ops with MVT::Glue are lowered to NZCV dependences.
+ setOperationAction(ISD::ADDC, MVT::i32, Custom);
+ setOperationAction(ISD::ADDE, MVT::i32, Custom);
+ setOperationAction(ISD::SUBC, MVT::i32, Custom);
+ setOperationAction(ISD::SUBE, MVT::i32, Custom);
+ setOperationAction(ISD::ADDC, MVT::i64, Custom);
+ setOperationAction(ISD::ADDE, MVT::i64, Custom);
+ setOperationAction(ISD::SUBC, MVT::i64, Custom);
+ setOperationAction(ISD::SUBE, MVT::i64, Custom);
+
+ // ARM64 lacks both left-rotate and popcount instructions.
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL, MVT::i64, Expand);
+
+  // ARM64 doesn't have direct vector->f32 conversion instructions for
+ // elements smaller than i32, so promote the input to i32 first.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
+ // Similarly, there is no direct i32 -> f64 vector conversion instruction.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
+
+ // ARM64 doesn't have {U|S}MUL_LOHI.
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+
+ // ARM64 doesn't have MUL.2d:
+ setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+
+  // Expand the undefined-at-zero variants of cttz/ctlz to their defined-at-zero
+ // counterparts, which ARM64 supports directly.
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+
+ setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+ // Custom lower Add/Sub/Mul with overflow.
+ setOperationAction(ISD::SADDO, MVT::i32, Custom);
+ setOperationAction(ISD::SADDO, MVT::i64, Custom);
+ setOperationAction(ISD::UADDO, MVT::i32, Custom);
+ setOperationAction(ISD::UADDO, MVT::i64, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i64, Custom);
+ setOperationAction(ISD::USUBO, MVT::i32, Custom);
+ setOperationAction(ISD::USUBO, MVT::i64, Custom);
+ setOperationAction(ISD::SMULO, MVT::i32, Custom);
+ setOperationAction(ISD::SMULO, MVT::i64, Custom);
+ setOperationAction(ISD::UMULO, MVT::i32, Custom);
+ setOperationAction(ISD::UMULO, MVT::i64, Custom);
+
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // ARM64 has implementations of a lot of rounding-like FP operations.
+ static MVT RoundingTypes[] = { MVT::f32, MVT::f64, MVT::v2f32,
+ MVT::v4f32, MVT::v2f64 };
+ for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
+ MVT Ty = RoundingTypes[I];
+ setOperationAction(ISD::FFLOOR, Ty, Legal);
+ setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+ setOperationAction(ISD::FCEIL, Ty, Legal);
+ setOperationAction(ISD::FRINT, Ty, Legal);
+ setOperationAction(ISD::FTRUNC, Ty, Legal);
+ setOperationAction(ISD::FROUND, Ty, Legal);
+ }
+
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
+ if (Subtarget->isTargetMachO()) {
+    // For iOS, we don't want the normal expansion of a libcall to
+ // sincos. We want to issue a libcall to __sincos_stret to avoid memory
+ // traffic.
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ } else {
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+ }
+
+ // ARM64 does not have floating-point extending loads, i1 sign-extending load,
+ // floating-point truncating stores, or v2i32->v2i16 truncating store.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f80, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ // Indexed loads and stores are supported.
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, MVT::i8, Legal);
+ setIndexedLoadAction(im, MVT::i16, Legal);
+ setIndexedLoadAction(im, MVT::i32, Legal);
+ setIndexedLoadAction(im, MVT::i64, Legal);
+ setIndexedLoadAction(im, MVT::f64, Legal);
+ setIndexedLoadAction(im, MVT::f32, Legal);
+ setIndexedStoreAction(im, MVT::i8, Legal);
+ setIndexedStoreAction(im, MVT::i16, Legal);
+ setIndexedStoreAction(im, MVT::i32, Legal);
+ setIndexedStoreAction(im, MVT::i64, Legal);
+ setIndexedStoreAction(im, MVT::f64, Legal);
+ setIndexedStoreAction(im, MVT::f32, Legal);
+ }
+
+ // Likewise, narrowing and extending vector loads/stores aren't handled
+ // directly.
+ for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
+ Expand);
+
+ for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+ setTruncStoreAction((MVT::SimpleValueType)VT,
+ (MVT::SimpleValueType)InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ }
+
+ // Trap.
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
+
+ // We combine OR nodes for bitfield operations.
+ setTargetDAGCombine(ISD::OR);
+
+ // Vector add and sub nodes may conceal a high-half opportunity.
+  // Also, try to fold ADD into CSINC/CSINV.
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::SUB);
+
+ setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
+
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::BITCAST);
+ setTargetDAGCombine(ISD::CONCAT_VECTORS);
+ setTargetDAGCombine(ISD::STORE);
+
+ setTargetDAGCombine(ISD::MUL);
+
+ MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
+ MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
+ MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
+
+ setStackPointerRegisterToSaveRestore(ARM64::SP);
+
+ setSchedulingPreference(Sched::Hybrid);
+
+ // Enable TBZ/TBNZ
+ MaskAndBranchFoldingIsLegal = true;
+
+ setMinFunctionAlignment(2);
+
+ RequireStrictAlign = StrictAlign;
+}
+
+void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
+ if (VT == MVT::v2f32) {
+ setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+ AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
+
+ setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+ AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
+ } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+ setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+ AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
+
+ setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+ AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
+ }
+
+ // Mark vector float intrinsics as expand.
+ if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
+ setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
+ }
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
+
+ setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
+
+ setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
+ setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+
+ setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+}
+
+void ARM64TargetLowering::addDRTypeForNEON(MVT VT) {
+ addRegisterClass(VT, &ARM64::FPR64RegClass);
+ addTypeForNEON(VT, MVT::v2i32);
+}
+
+void ARM64TargetLowering::addQRTypeForNEON(MVT VT) {
+ addRegisterClass(VT, &ARM64::FPR128RegClass);
+ addTypeForNEON(VT, MVT::v4i32);
+}
+
+EVT ARM64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+ if (!VT.isVector())
+ return MVT::i32;
+ return VT.changeVectorElementTypeToInteger();
+}
+
+/// computeMaskedBitsForTargetNode - Determine which of the bits specified in
+/// Mask are known to be either zero or one and return them in the
+/// KnownZero/KnownOne bitsets.
+void ARM64TargetLowering::computeMaskedBitsForTargetNode(
+ const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+ const SelectionDAG &DAG, unsigned Depth) const {
+ switch (Op.getOpcode()) {
+ default:
+ break;
+ case ARM64ISD::CSEL: {
+ APInt KnownZero2, KnownOne2;
+ DAG.ComputeMaskedBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
+ DAG.ComputeMaskedBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
+ KnownZero &= KnownZero2;
+ KnownOne &= KnownOne2;
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN:
+ break;
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_VOID: {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::arm64_neon_umaxv:
+ case Intrinsic::arm64_neon_uminv: {
+ // Figure out the datatype of the vector operand. The UMINV instruction
+ // will zero extend the result, so we can mark as known zero all the
+      // bits larger than the element datatype. 32-bit or larger doesn't need
+ // this as those are legal types and will be handled by isel directly.
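+      // For example, a umaxv of a v8i8 vector zero-extends an 8-bit maximum,
+      // so all result bits above bit 7 are known to be zero.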
+ MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
+ unsigned BitWidth = KnownZero.getBitWidth();
+ if (VT == MVT::v8i8 || VT == MVT::v16i8) {
+ assert(BitWidth >= 8 && "Unexpected width!");
+ APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
+ KnownZero |= Mask;
+ } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
+ assert(BitWidth >= 16 && "Unexpected width!");
+ APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
+ KnownZero |= Mask;
+ }
+ break;
+    }
+ }
+ }
+ }
+}
+
+MVT ARM64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
+ return MVT::i64;
+}
+
+unsigned ARM64TargetLowering::getMaximalGlobalOffset() const {
+ // FIXME: On ARM64, this depends on the type.
+  // Basically, the addressable offsets are 0 to 4095 * Ty.getSizeInBytes(),
+  // and the offset has to be a multiple of the related size in bytes.
+ return 4095;
+}
+
+FastISel *
+ARM64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const {
+ return ARM64::createFastISel(funcInfo, libInfo);
+}
+
+const char *ARM64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default:
+ return 0;
+ case ARM64ISD::CALL: return "ARM64ISD::CALL";
+ case ARM64ISD::ADRP: return "ARM64ISD::ADRP";
+ case ARM64ISD::ADDlow: return "ARM64ISD::ADDlow";
+ case ARM64ISD::LOADgot: return "ARM64ISD::LOADgot";
+ case ARM64ISD::RET_FLAG: return "ARM64ISD::RET_FLAG";
+ case ARM64ISD::BRCOND: return "ARM64ISD::BRCOND";
+ case ARM64ISD::CSEL: return "ARM64ISD::CSEL";
+ case ARM64ISD::FCSEL: return "ARM64ISD::FCSEL";
+ case ARM64ISD::CSINV: return "ARM64ISD::CSINV";
+ case ARM64ISD::CSNEG: return "ARM64ISD::CSNEG";
+ case ARM64ISD::CSINC: return "ARM64ISD::CSINC";
+ case ARM64ISD::THREAD_POINTER: return "ARM64ISD::THREAD_POINTER";
+ case ARM64ISD::TLSDESC_CALL: return "ARM64ISD::TLSDESC_CALL";
+ case ARM64ISD::ADC: return "ARM64ISD::ADC";
+ case ARM64ISD::SBC: return "ARM64ISD::SBC";
+ case ARM64ISD::ADDS: return "ARM64ISD::ADDS";
+ case ARM64ISD::SUBS: return "ARM64ISD::SUBS";
+ case ARM64ISD::ADCS: return "ARM64ISD::ADCS";
+ case ARM64ISD::SBCS: return "ARM64ISD::SBCS";
+ case ARM64ISD::ANDS: return "ARM64ISD::ANDS";
+ case ARM64ISD::FCMP: return "ARM64ISD::FCMP";
+ case ARM64ISD::FMIN: return "ARM64ISD::FMIN";
+ case ARM64ISD::FMAX: return "ARM64ISD::FMAX";
+ case ARM64ISD::DUP: return "ARM64ISD::DUP";
+ case ARM64ISD::DUPLANE8: return "ARM64ISD::DUPLANE8";
+ case ARM64ISD::DUPLANE16: return "ARM64ISD::DUPLANE16";
+ case ARM64ISD::DUPLANE32: return "ARM64ISD::DUPLANE32";
+ case ARM64ISD::DUPLANE64: return "ARM64ISD::DUPLANE64";
+ case ARM64ISD::MOVI: return "ARM64ISD::MOVI";
+ case ARM64ISD::MOVIshift: return "ARM64ISD::MOVIshift";
+ case ARM64ISD::MOVIedit: return "ARM64ISD::MOVIedit";
+ case ARM64ISD::MOVImsl: return "ARM64ISD::MOVImsl";
+ case ARM64ISD::FMOV: return "ARM64ISD::FMOV";
+ case ARM64ISD::MVNIshift: return "ARM64ISD::MVNIshift";
+ case ARM64ISD::MVNImsl: return "ARM64ISD::MVNImsl";
+ case ARM64ISD::BICi: return "ARM64ISD::BICi";
+ case ARM64ISD::ORRi: return "ARM64ISD::ORRi";
+ case ARM64ISD::NEG: return "ARM64ISD::NEG";
+ case ARM64ISD::EXTR: return "ARM64ISD::EXTR";
+ case ARM64ISD::ZIP1: return "ARM64ISD::ZIP1";
+ case ARM64ISD::ZIP2: return "ARM64ISD::ZIP2";
+ case ARM64ISD::UZP1: return "ARM64ISD::UZP1";
+ case ARM64ISD::UZP2: return "ARM64ISD::UZP2";
+ case ARM64ISD::TRN1: return "ARM64ISD::TRN1";
+ case ARM64ISD::TRN2: return "ARM64ISD::TRN2";
+ case ARM64ISD::REV16: return "ARM64ISD::REV16";
+ case ARM64ISD::REV32: return "ARM64ISD::REV32";
+ case ARM64ISD::REV64: return "ARM64ISD::REV64";
+ case ARM64ISD::EXT: return "ARM64ISD::EXT";
+ case ARM64ISD::VSHL: return "ARM64ISD::VSHL";
+ case ARM64ISD::VLSHR: return "ARM64ISD::VLSHR";
+ case ARM64ISD::VASHR: return "ARM64ISD::VASHR";
+ case ARM64ISD::CMEQ: return "ARM64ISD::CMEQ";
+ case ARM64ISD::CMGE: return "ARM64ISD::CMGE";
+ case ARM64ISD::CMGT: return "ARM64ISD::CMGT";
+ case ARM64ISD::CMHI: return "ARM64ISD::CMHI";
+ case ARM64ISD::CMHS: return "ARM64ISD::CMHS";
+ case ARM64ISD::FCMEQ: return "ARM64ISD::FCMEQ";
+ case ARM64ISD::FCMGE: return "ARM64ISD::FCMGE";
+ case ARM64ISD::FCMGT: return "ARM64ISD::FCMGT";
+ case ARM64ISD::CMEQz: return "ARM64ISD::CMEQz";
+ case ARM64ISD::CMGEz: return "ARM64ISD::CMGEz";
+ case ARM64ISD::CMGTz: return "ARM64ISD::CMGTz";
+ case ARM64ISD::CMLEz: return "ARM64ISD::CMLEz";
+ case ARM64ISD::CMLTz: return "ARM64ISD::CMLTz";
+ case ARM64ISD::FCMEQz: return "ARM64ISD::FCMEQz";
+ case ARM64ISD::FCMGEz: return "ARM64ISD::FCMGEz";
+ case ARM64ISD::FCMGTz: return "ARM64ISD::FCMGTz";
+ case ARM64ISD::FCMLEz: return "ARM64ISD::FCMLEz";
+ case ARM64ISD::FCMLTz: return "ARM64ISD::FCMLTz";
+ case ARM64ISD::NOT: return "ARM64ISD::NOT";
+ case ARM64ISD::BIT: return "ARM64ISD::BIT";
+ case ARM64ISD::CBZ: return "ARM64ISD::CBZ";
+ case ARM64ISD::CBNZ: return "ARM64ISD::CBNZ";
+ case ARM64ISD::TBZ: return "ARM64ISD::TBZ";
+ case ARM64ISD::TBNZ: return "ARM64ISD::TBNZ";
+ case ARM64ISD::TC_RETURN: return "ARM64ISD::TC_RETURN";
+ case ARM64ISD::SITOF: return "ARM64ISD::SITOF";
+ case ARM64ISD::UITOF: return "ARM64ISD::UITOF";
+ case ARM64ISD::SQSHL_I: return "ARM64ISD::SQSHL_I";
+ case ARM64ISD::UQSHL_I: return "ARM64ISD::UQSHL_I";
+ case ARM64ISD::SRSHR_I: return "ARM64ISD::SRSHR_I";
+ case ARM64ISD::URSHR_I: return "ARM64ISD::URSHR_I";
+ case ARM64ISD::SQSHLU_I: return "ARM64ISD::SQSHLU_I";
+ case ARM64ISD::WrapperLarge: return "ARM64ISD::WrapperLarge";
+ }
+}
+
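+// Pick the load-exclusive and store-exclusive opcodes matching the access size
+// and memory ordering of an atomic operation. For example, a 4-byte acquire
+// operation selects LDAXRW for the load and plain STXRW for the store.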
+static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
+ unsigned &LdrOpc, unsigned &StrOpc) {
+ static unsigned LoadBares[] = { ARM64::LDXRB, ARM64::LDXRH, ARM64::LDXRW,
+ ARM64::LDXRX, ARM64::LDXPX };
+ static unsigned LoadAcqs[] = { ARM64::LDAXRB, ARM64::LDAXRH, ARM64::LDAXRW,
+ ARM64::LDAXRX, ARM64::LDAXPX };
+ static unsigned StoreBares[] = { ARM64::STXRB, ARM64::STXRH, ARM64::STXRW,
+ ARM64::STXRX, ARM64::STXPX };
+ static unsigned StoreRels[] = { ARM64::STLXRB, ARM64::STLXRH, ARM64::STLXRW,
+ ARM64::STLXRX, ARM64::STLXPX };
+
+ unsigned *LoadOps, *StoreOps;
+ if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
+ LoadOps = LoadAcqs;
+ else
+ LoadOps = LoadBares;
+
+ if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
+ StoreOps = StoreRels;
+ else
+ StoreOps = StoreBares;
+
+ assert(isPowerOf2_32(Size) && Size <= 16 &&
+ "unsupported size for atomic binary op!");
+
+ LdrOpc = LoadOps[Log2_32(Size)];
+ StrOpc = StoreOps[Log2_32(Size)];
+}
+
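+// Expand an ATOMIC_CMP_SWAP pseudo-instruction into a load-exclusive /
+// compare / branch / store-exclusive loop, as sketched in the block comments
+// below.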
+MachineBasicBlock *ARM64TargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned Size) const {
+ unsigned dest = MI->getOperand(0).getReg();
+ unsigned ptr = MI->getOperand(1).getReg();
+ unsigned oldval = MI->getOperand(2).getReg();
+ unsigned newval = MI->getOperand(3).getReg();
+ AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
+ unsigned scratch = BB->getParent()->getRegInfo().createVirtualRegister(
+ &ARM64::GPR32RegClass);
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc dl = MI->getDebugLoc();
+
+ // FIXME: We currently always generate a seq_cst operation; we should
+ // be able to relax this in some cases.
+ unsigned ldrOpc, strOpc;
+ getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
+
+ MachineFunction *MF = BB->getParent();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It; // insert the new blocks after the current block
+
+ MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, loop1MBB);
+ MF->insert(It, loop2MBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loop1MBB
+ BB->addSuccessor(loop1MBB);
+
+ // loop1MBB:
+  //   ldxr dest, [ptr]
+ // cmp dest, oldval
+ // bne exitMBB
+ BB = loop1MBB;
+ BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+ BuildMI(BB, dl, TII->get(Size == 8 ? ARM64::SUBSXrr : ARM64::SUBSWrr))
+ .addReg(Size == 8 ? ARM64::XZR : ARM64::WZR, RegState::Define)
+ .addReg(dest)
+ .addReg(oldval);
+ BuildMI(BB, dl, TII->get(ARM64::Bcc)).addImm(ARM64CC::NE).addMBB(exitMBB);
+ BB->addSuccessor(loop2MBB);
+ BB->addSuccessor(exitMBB);
+
+ // loop2MBB:
+  //   stxr scratch, newval, [ptr]
+ // cmp scratch, #0
+ // bne loop1MBB
+ BB = loop2MBB;
+ BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr);
+ BuildMI(BB, dl, TII->get(ARM64::CBNZW)).addReg(scratch).addMBB(loop1MBB);
+ BB->addSuccessor(loop1MBB);
+ BB->addSuccessor(exitMBB);
+
+ // exitMBB:
+ // ...
+ BB = exitMBB;
+
+ MI->eraseFromParent(); // The instruction is gone now.
+
+ return BB;
+}
+
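+// Expand an 8-, 16-, 32- or 64-bit atomic read-modify-write pseudo into a
+// load-exclusive / <binop> / store-exclusive loop; BinOpcode == 0 means a
+// plain swap.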
+MachineBasicBlock *
+ARM64TargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+ unsigned Size, unsigned BinOpcode) const {
+ // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction *MF = BB->getParent();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ unsigned dest = MI->getOperand(0).getReg();
+ unsigned ptr = MI->getOperand(1).getReg();
+ unsigned incr = MI->getOperand(2).getReg();
+ AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
+ DebugLoc dl = MI->getDebugLoc();
+
+ unsigned ldrOpc, strOpc;
+ getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
+
+ MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, loopMBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ unsigned scratch = RegInfo.createVirtualRegister(&ARM64::GPR32RegClass);
+ unsigned scratch2 =
+ (!BinOpcode)
+ ? incr
+ : RegInfo.createVirtualRegister(Size == 8 ? &ARM64::GPR64RegClass
+ : &ARM64::GPR32RegClass);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loopMBB
+ BB->addSuccessor(loopMBB);
+
+ // loopMBB:
+ // ldxr dest, ptr
+ // <binop> scratch2, dest, incr
+ // stxr scratch, scratch2, ptr
+ // cbnz scratch, loopMBB
+ // fallthrough --> exitMBB
+ BB = loopMBB;
+ BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+ if (BinOpcode) {
+ // operand order needs to go the other way for NAND
+ if (BinOpcode == ARM64::BICWrr || BinOpcode == ARM64::BICXrr)
+ BuildMI(BB, dl, TII->get(BinOpcode), scratch2).addReg(incr).addReg(dest);
+ else
+ BuildMI(BB, dl, TII->get(BinOpcode), scratch2).addReg(dest).addReg(incr);
+ }
+
+ BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
+ BuildMI(BB, dl, TII->get(ARM64::CBNZW)).addReg(scratch).addMBB(loopMBB);
+
+ BB->addSuccessor(loopMBB);
+ BB->addSuccessor(exitMBB);
+
+ // exitMBB:
+ // ...
+ BB = exitMBB;
+
+ MI->eraseFromParent(); // The instruction is gone now.
+
+ return BB;
+}
+
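+// Expand a 128-bit atomic read-modify-write pseudo into an LDXP/STXP loop.
+// BinOpcodeLo and BinOpcodeHi apply the operation to the low and high halves
+// of the value; passing 0 for both turns the loop into a plain swap.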
+MachineBasicBlock *ARM64TargetLowering::EmitAtomicBinary128(
+ MachineInstr *MI, MachineBasicBlock *BB, unsigned BinOpcodeLo,
+ unsigned BinOpcodeHi) const {
+  // This also handles ATOMIC_SWAP, indicated by BinOpcodeLo == 0.
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction *MF = BB->getParent();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ unsigned DestLo = MI->getOperand(0).getReg();
+ unsigned DestHi = MI->getOperand(1).getReg();
+ unsigned Ptr = MI->getOperand(2).getReg();
+ unsigned IncrLo = MI->getOperand(3).getReg();
+ unsigned IncrHi = MI->getOperand(4).getReg();
+ AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(5).getImm());
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned LdrOpc, StrOpc;
+ getExclusiveOperation(16, Ord, LdrOpc, StrOpc);
+
+ MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, LoopMBB);
+ MF->insert(It, ExitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ ExitMBB->splice(ExitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ ExitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ unsigned ScratchRes = RegInfo.createVirtualRegister(&ARM64::GPR32RegClass);
+ unsigned ScratchLo = IncrLo, ScratchHi = IncrHi;
+ if (BinOpcodeLo) {
+ assert(BinOpcodeHi && "Expect neither or both opcodes to be defined");
+ ScratchLo = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
+ ScratchHi = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
+ }
+
+ // ThisMBB:
+ // ...
+ // fallthrough --> LoopMBB
+ BB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+ // ldxp DestLo, DestHi, Ptr
+ // <binoplo> ScratchLo, DestLo, IncrLo
+ // <binophi> ScratchHi, DestHi, IncrHi
+ // stxp ScratchRes, ScratchLo, ScratchHi, ptr
+ // cbnz ScratchRes, LoopMBB
+ // fallthrough --> ExitMBB
+ BB = LoopMBB;
+ BuildMI(BB, DL, TII->get(LdrOpc), DestLo)
+ .addReg(DestHi, RegState::Define)
+ .addReg(Ptr);
+ if (BinOpcodeLo) {
+ // operand order needs to go the other way for NAND
+ if (BinOpcodeLo == ARM64::BICXrr) {
+ std::swap(IncrLo, DestLo);
+ std::swap(IncrHi, DestHi);
+ }
+
+ BuildMI(BB, DL, TII->get(BinOpcodeLo), ScratchLo).addReg(DestLo).addReg(
+ IncrLo);
+ BuildMI(BB, DL, TII->get(BinOpcodeHi), ScratchHi).addReg(DestHi).addReg(
+ IncrHi);
+ }
+
+ BuildMI(BB, DL, TII->get(StrOpc), ScratchRes)
+ .addReg(ScratchLo)
+ .addReg(ScratchHi)
+ .addReg(Ptr);
+ BuildMI(BB, DL, TII->get(ARM64::CBNZW)).addReg(ScratchRes).addMBB(LoopMBB);
+
+ BB->addSuccessor(LoopMBB);
+ BB->addSuccessor(ExitMBB);
+
+ // ExitMBB:
+ // ...
+ BB = ExitMBB;
+
+ MI->eraseFromParent(); // The instruction is gone now.
+
+ return BB;
+}
+
+MachineBasicBlock *
+ARM64TargetLowering::EmitAtomicCmpSwap128(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ unsigned DestLo = MI->getOperand(0).getReg();
+ unsigned DestHi = MI->getOperand(1).getReg();
+ unsigned Ptr = MI->getOperand(2).getReg();
+ unsigned OldValLo = MI->getOperand(3).getReg();
+ unsigned OldValHi = MI->getOperand(4).getReg();
+ unsigned NewValLo = MI->getOperand(5).getReg();
+ unsigned NewValHi = MI->getOperand(6).getReg();
+ AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(7).getImm());
+ unsigned ScratchRes = BB->getParent()->getRegInfo().createVirtualRegister(
+ &ARM64::GPR32RegClass);
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned LdrOpc, StrOpc;
+ getExclusiveOperation(16, Ord, LdrOpc, StrOpc);
+
+ MachineFunction *MF = BB->getParent();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It; // insert the new blocks after the current block
+
+ MachineBasicBlock *Loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *Loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, Loop1MBB);
+ MF->insert(It, Loop2MBB);
+ MF->insert(It, ExitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ ExitMBB->splice(ExitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ ExitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // ThisMBB:
+ // ...
+ // fallthrough --> Loop1MBB
+ BB->addSuccessor(Loop1MBB);
+
+ // Loop1MBB:
+ // ldxp DestLo, DestHi, [Ptr]
+ // cmp DestLo, OldValLo
+ // sbc xzr, DestHi, OldValHi
+ // bne ExitMBB
+ BB = Loop1MBB;
+ BuildMI(BB, DL, TII->get(LdrOpc), DestLo)
+ .addReg(DestHi, RegState::Define)
+ .addReg(Ptr);
+ BuildMI(BB, DL, TII->get(ARM64::SUBSXrr), ARM64::XZR).addReg(DestLo).addReg(
+ OldValLo);
+ BuildMI(BB, DL, TII->get(ARM64::SBCXr), ARM64::XZR).addReg(DestHi).addReg(
+ OldValHi);
+
+ BuildMI(BB, DL, TII->get(ARM64::Bcc)).addImm(ARM64CC::NE).addMBB(ExitMBB);
+ BB->addSuccessor(Loop2MBB);
+ BB->addSuccessor(ExitMBB);
+
+ // Loop2MBB:
+ // stxp ScratchRes, NewValLo, NewValHi, [Ptr]
+ // cbnz ScratchRes, Loop1MBB
+ BB = Loop2MBB;
+ BuildMI(BB, DL, TII->get(StrOpc), ScratchRes)
+ .addReg(NewValLo)
+ .addReg(NewValHi)
+ .addReg(Ptr);
+ BuildMI(BB, DL, TII->get(ARM64::CBNZW)).addReg(ScratchRes).addMBB(Loop1MBB);
+ BB->addSuccessor(Loop1MBB);
+ BB->addSuccessor(ExitMBB);
+
+ // ExitMBB:
+ // ...
+ BB = ExitMBB;
+
+ MI->eraseFromParent(); // The instruction is gone now.
+
+ return BB;
+}
+
+MachineBasicBlock *ARM64TargetLowering::EmitAtomicMinMax128(
+ MachineInstr *MI, MachineBasicBlock *BB, unsigned CondCode) const {
+  // Emit a 128-bit atomic min or max as an LDXP/STXP loop: compare the loaded
+  // value against the operand and use a pair of CSELs to pick which halves to
+  // store back.
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction *MF = BB->getParent();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ unsigned DestLo = MI->getOperand(0).getReg();
+ unsigned DestHi = MI->getOperand(1).getReg();
+ unsigned Ptr = MI->getOperand(2).getReg();
+ unsigned IncrLo = MI->getOperand(3).getReg();
+ unsigned IncrHi = MI->getOperand(4).getReg();
+ AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(5).getImm());
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned LdrOpc, StrOpc;
+ getExclusiveOperation(16, Ord, LdrOpc, StrOpc);
+
+ MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, LoopMBB);
+ MF->insert(It, ExitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ ExitMBB->splice(ExitMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ ExitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ unsigned ScratchRes = RegInfo.createVirtualRegister(&ARM64::GPR32RegClass);
+ unsigned ScratchLo = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
+ unsigned ScratchHi = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
+
+ // ThisMBB:
+ // ...
+ // fallthrough --> LoopMBB
+ BB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+ // ldxp DestLo, DestHi, Ptr
+  //   cmp DestLo, IncrLo
+  //   sbc xzr, DestHi, IncrHi
+ // csel ScratchLo, DestLo, IncrLo, <cmp-op>
+ // csel ScratchHi, DestHi, IncrHi, <cmp-op>
+ // stxp ScratchRes, ScratchLo, ScratchHi, ptr
+ // cbnz ScratchRes, LoopMBB
+ // fallthrough --> ExitMBB
+ BB = LoopMBB;
+ BuildMI(BB, DL, TII->get(LdrOpc), DestLo)
+ .addReg(DestHi, RegState::Define)
+ .addReg(Ptr);
+
+ BuildMI(BB, DL, TII->get(ARM64::SUBSXrr), ARM64::XZR).addReg(DestLo).addReg(
+ IncrLo);
+ BuildMI(BB, DL, TII->get(ARM64::SBCXr), ARM64::XZR).addReg(DestHi).addReg(
+ IncrHi);
+
+ BuildMI(BB, DL, TII->get(ARM64::CSELXr), ScratchLo)
+ .addReg(DestLo)
+ .addReg(IncrLo)
+ .addImm(CondCode);
+ BuildMI(BB, DL, TII->get(ARM64::CSELXr), ScratchHi)
+ .addReg(DestHi)
+ .addReg(IncrHi)
+ .addImm(CondCode);
+
+ BuildMI(BB, DL, TII->get(StrOpc), ScratchRes)
+ .addReg(ScratchLo)
+ .addReg(ScratchHi)
+ .addReg(Ptr);
+ BuildMI(BB, DL, TII->get(ARM64::CBNZW)).addReg(ScratchRes).addMBB(LoopMBB);
+
+ BB->addSuccessor(LoopMBB);
+ BB->addSuccessor(ExitMBB);
+
+ // ExitMBB:
+ // ...
+ BB = ExitMBB;
+
+ MI->eraseFromParent(); // The instruction is gone now.
+
+ return BB;
+}
+
+MachineBasicBlock *
+ARM64TargetLowering::EmitF128CSEL(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ // We materialise the F128CSEL pseudo-instruction as some control flow and a
+ // phi node:
+
+ // OrigBB:
+ // [... previous instrs leading to comparison ...]
+ // b.ne TrueBB
+ // b EndBB
+ // TrueBB:
+ // ; Fallthrough
+ // EndBB:
+ // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
+
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineFunction *MF = MBB->getParent();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction::iterator It = MBB;
+ ++It;
+
+ unsigned DestReg = MI->getOperand(0).getReg();
+ unsigned IfTrueReg = MI->getOperand(1).getReg();
+ unsigned IfFalseReg = MI->getOperand(2).getReg();
+ unsigned CondCode = MI->getOperand(3).getImm();
+ bool CPSRKilled = MI->getOperand(4).isKill();
+
+ MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, TrueBB);
+ MF->insert(It, EndBB);
+
+ // Transfer rest of current basic-block to EndBB
+ EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
+ MBB->end());
+ EndBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ BuildMI(MBB, DL, TII->get(ARM64::Bcc)).addImm(CondCode).addMBB(TrueBB);
+ BuildMI(MBB, DL, TII->get(ARM64::B)).addMBB(EndBB);
+ MBB->addSuccessor(TrueBB);
+ MBB->addSuccessor(EndBB);
+
+ // TrueBB falls through to the end.
+ TrueBB->addSuccessor(EndBB);
+
+ if (!CPSRKilled) {
+ TrueBB->addLiveIn(ARM64::CPSR);
+ EndBB->addLiveIn(ARM64::CPSR);
+ }
+
+ BuildMI(*EndBB, EndBB->begin(), DL, TII->get(ARM64::PHI), DestReg)
+ .addReg(IfTrueReg)
+ .addMBB(TrueBB)
+ .addReg(IfFalseReg)
+ .addMBB(MBB);
+
+ MI->eraseFromParent();
+ return EndBB;
+}
+
+MachineBasicBlock *
+ARM64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ switch (MI->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ MI->dump();
+#endif
+ assert(0 && "Unexpected instruction for custom inserter!");
+ break;
+
+ case ARM64::ATOMIC_LOAD_ADD_I8:
+ return EmitAtomicBinary(MI, BB, 1, ARM64::ADDWrr);
+ case ARM64::ATOMIC_LOAD_ADD_I16:
+ return EmitAtomicBinary(MI, BB, 2, ARM64::ADDWrr);
+ case ARM64::ATOMIC_LOAD_ADD_I32:
+ return EmitAtomicBinary(MI, BB, 4, ARM64::ADDWrr);
+ case ARM64::ATOMIC_LOAD_ADD_I64:
+ return EmitAtomicBinary(MI, BB, 8, ARM64::ADDXrr);
+ case ARM64::ATOMIC_LOAD_ADD_I128:
+ return EmitAtomicBinary128(MI, BB, ARM64::ADDSXrr, ARM64::ADCXr);
+
+ case ARM64::ATOMIC_LOAD_AND_I8:
+ return EmitAtomicBinary(MI, BB, 1, ARM64::ANDWrr);
+ case ARM64::ATOMIC_LOAD_AND_I16:
+ return EmitAtomicBinary(MI, BB, 2, ARM64::ANDWrr);
+ case ARM64::ATOMIC_LOAD_AND_I32:
+ return EmitAtomicBinary(MI, BB, 4, ARM64::ANDWrr);
+ case ARM64::ATOMIC_LOAD_AND_I64:
+ return EmitAtomicBinary(MI, BB, 8, ARM64::ANDXrr);
+ case ARM64::ATOMIC_LOAD_AND_I128:
+ return EmitAtomicBinary128(MI, BB, ARM64::ANDXrr, ARM64::ANDXrr);
+
+ case ARM64::ATOMIC_LOAD_OR_I8:
+ return EmitAtomicBinary(MI, BB, 1, ARM64::ORRWrr);
+ case ARM64::ATOMIC_LOAD_OR_I16:
+ return EmitAtomicBinary(MI, BB, 2, ARM64::ORRWrr);
+ case ARM64::ATOMIC_LOAD_OR_I32:
+ return EmitAtomicBinary(MI, BB, 4, ARM64::ORRWrr);
+ case ARM64::ATOMIC_LOAD_OR_I64:
+ return EmitAtomicBinary(MI, BB, 8, ARM64::ORRXrr);
+ case ARM64::ATOMIC_LOAD_OR_I128:
+ return EmitAtomicBinary128(MI, BB, ARM64::ORRXrr, ARM64::ORRXrr);
+
+ case ARM64::ATOMIC_LOAD_XOR_I8:
+ return EmitAtomicBinary(MI, BB, 1, ARM64::EORWrr);
+ case ARM64::ATOMIC_LOAD_XOR_I16:
+ return EmitAtomicBinary(MI, BB, 2, ARM64::EORWrr);
+ case ARM64::ATOMIC_LOAD_XOR_I32:
+ return EmitAtomicBinary(MI, BB, 4, ARM64::EORWrr);
+ case ARM64::ATOMIC_LOAD_XOR_I64:
+ return EmitAtomicBinary(MI, BB, 8, ARM64::EORXrr);
+ case ARM64::ATOMIC_LOAD_XOR_I128:
+ return EmitAtomicBinary128(MI, BB, ARM64::EORXrr, ARM64::EORXrr);
+
+ case ARM64::ATOMIC_LOAD_NAND_I8:
+ return EmitAtomicBinary(MI, BB, 1, ARM64::BICWrr);
+ case ARM64::ATOMIC_LOAD_NAND_I16:
+ return EmitAtomicBinary(MI, BB, 2, ARM64::BICWrr);
+ case ARM64::ATOMIC_LOAD_NAND_I32:
+ return EmitAtomicBinary(MI, BB, 4, ARM64::BICWrr);
+ case ARM64::ATOMIC_LOAD_NAND_I64:
+ return EmitAtomicBinary(MI, BB, 8, ARM64::BICXrr);
+ case ARM64::ATOMIC_LOAD_NAND_I128:
+ return EmitAtomicBinary128(MI, BB, ARM64::BICXrr, ARM64::BICXrr);
+
+ case ARM64::ATOMIC_LOAD_SUB_I8:
+ return EmitAtomicBinary(MI, BB, 1, ARM64::SUBWrr);
+ case ARM64::ATOMIC_LOAD_SUB_I16:
+ return EmitAtomicBinary(MI, BB, 2, ARM64::SUBWrr);
+ case ARM64::ATOMIC_LOAD_SUB_I32:
+ return EmitAtomicBinary(MI, BB, 4, ARM64::SUBWrr);
+ case ARM64::ATOMIC_LOAD_SUB_I64:
+ return EmitAtomicBinary(MI, BB, 8, ARM64::SUBXrr);
+ case ARM64::ATOMIC_LOAD_SUB_I128:
+ return EmitAtomicBinary128(MI, BB, ARM64::SUBSXrr, ARM64::SBCXr);
+
+ case ARM64::ATOMIC_LOAD_MIN_I128:
+ return EmitAtomicMinMax128(MI, BB, ARM64CC::LT);
+
+ case ARM64::ATOMIC_LOAD_MAX_I128:
+ return EmitAtomicMinMax128(MI, BB, ARM64CC::GT);
+
+ case ARM64::ATOMIC_LOAD_UMIN_I128:
+ return EmitAtomicMinMax128(MI, BB, ARM64CC::CC);
+
+ case ARM64::ATOMIC_LOAD_UMAX_I128:
+ return EmitAtomicMinMax128(MI, BB, ARM64CC::HI);
+
+ case ARM64::ATOMIC_SWAP_I8:
+ return EmitAtomicBinary(MI, BB, 1, 0);
+ case ARM64::ATOMIC_SWAP_I16:
+ return EmitAtomicBinary(MI, BB, 2, 0);
+ case ARM64::ATOMIC_SWAP_I32:
+ return EmitAtomicBinary(MI, BB, 4, 0);
+ case ARM64::ATOMIC_SWAP_I64:
+ return EmitAtomicBinary(MI, BB, 8, 0);
+ case ARM64::ATOMIC_SWAP_I128:
+ return EmitAtomicBinary128(MI, BB, 0, 0);
+
+ case ARM64::ATOMIC_CMP_SWAP_I8:
+ return EmitAtomicCmpSwap(MI, BB, 1);
+ case ARM64::ATOMIC_CMP_SWAP_I16:
+ return EmitAtomicCmpSwap(MI, BB, 2);
+ case ARM64::ATOMIC_CMP_SWAP_I32:
+ return EmitAtomicCmpSwap(MI, BB, 4);
+ case ARM64::ATOMIC_CMP_SWAP_I64:
+ return EmitAtomicCmpSwap(MI, BB, 8);
+ case ARM64::ATOMIC_CMP_SWAP_I128:
+ return EmitAtomicCmpSwap128(MI, BB);
+
+ case ARM64::F128CSEL:
+ return EmitF128CSEL(MI, BB);
+
+ case TargetOpcode::STACKMAP:
+ case TargetOpcode::PATCHPOINT:
+ return emitPatchPoint(MI, BB);
+ }
+ llvm_unreachable("Unexpected instruction for custom inserter!");
+}
+
+//===----------------------------------------------------------------------===//
+// ARM64 Lowering private implementation.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// changeIntCCToARM64CC - Convert a DAG integer condition code to an ARM64 CC.
+static ARM64CC::CondCode changeIntCCToARM64CC(ISD::CondCode CC) {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unknown condition code!");
+ case ISD::SETNE:
+ return ARM64CC::NE;
+ case ISD::SETEQ:
+ return ARM64CC::EQ;
+ case ISD::SETGT:
+ return ARM64CC::GT;
+ case ISD::SETGE:
+ return ARM64CC::GE;
+ case ISD::SETLT:
+ return ARM64CC::LT;
+ case ISD::SETLE:
+ return ARM64CC::LE;
+ case ISD::SETUGT:
+ return ARM64CC::HI;
+ case ISD::SETUGE:
+ return ARM64CC::CS;
+ case ISD::SETULT:
+ return ARM64CC::CC;
+ case ISD::SETULE:
+ return ARM64CC::LS;
+ }
+}
+
+/// changeFPCCToARM64CC - Convert a DAG fp condition code to an ARM64 CC.
+static void changeFPCCToARM64CC(ISD::CondCode CC, ARM64CC::CondCode &CondCode,
+ ARM64CC::CondCode &CondCode2) {
+ CondCode2 = ARM64CC::AL;
+ switch (CC) {
+ default:
+ llvm_unreachable("Unknown FP condition!");
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ CondCode = ARM64CC::EQ;
+ break;
+ case ISD::SETGT:
+ case ISD::SETOGT:
+ CondCode = ARM64CC::GT;
+ break;
+ case ISD::SETGE:
+ case ISD::SETOGE:
+ CondCode = ARM64CC::GE;
+ break;
+ case ISD::SETOLT:
+ CondCode = ARM64CC::MI;
+ break;
+ case ISD::SETOLE:
+ CondCode = ARM64CC::LS;
+ break;
+ case ISD::SETONE:
+ CondCode = ARM64CC::MI;
+ CondCode2 = ARM64CC::GT;
+ break;
+ case ISD::SETO:
+ CondCode = ARM64CC::VC;
+ break;
+ case ISD::SETUO:
+ CondCode = ARM64CC::VS;
+ break;
+ case ISD::SETUEQ:
+ CondCode = ARM64CC::EQ;
+ CondCode2 = ARM64CC::VS;
+ break;
+ case ISD::SETUGT:
+ CondCode = ARM64CC::HI;
+ break;
+ case ISD::SETUGE:
+ CondCode = ARM64CC::PL;
+ break;
+ case ISD::SETLT:
+ case ISD::SETULT:
+ CondCode = ARM64CC::LT;
+ break;
+ case ISD::SETLE:
+ case ISD::SETULE:
+ CondCode = ARM64CC::LE;
+ break;
+ case ISD::SETNE:
+ case ISD::SETUNE:
+ CondCode = ARM64CC::NE;
+ break;
+ }
+}
+
+static bool isLegalArithImmed(uint64_t C) {
+ // Matches ARM64DAGToDAGISel::SelectArithImmed().
+ return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
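+  // For example, 0xfff and 0xfff000 are encodable (a 12-bit value, optionally
+  // shifted left by 12 bits), while 0x1001 is not.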
+}
+
+static SDValue emitComparison(SDValue LHS, SDValue RHS, SDLoc dl,
+ SelectionDAG &DAG) {
+ EVT VT = LHS.getValueType();
+
+ if (VT.isFloatingPoint())
+ return DAG.getNode(ARM64ISD::FCMP, dl, VT, LHS, RHS);
+
+ // The CMP instruction is just an alias for SUBS, and representing it as
+ // SUBS means that it's possible to get CSE with subtract operations.
+ // A later phase can perform the optimization of setting the destination
+ // register to WZR/XZR if it ends up being unused.
+ return DAG.getNode(ARM64ISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
+ .getValue(1);
+}
+
+static SDValue getARM64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ SDValue &ARM64cc, SelectionDAG &DAG, SDLoc dl) {
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+ EVT VT = RHS.getValueType();
+ uint64_t C = RHSC->getZExtValue();
+ if (!isLegalArithImmed(C)) {
+ // Constant does not fit, try adjusting it by one?
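+      // For example, a signed compare against 4097 cannot be encoded, but
+      // 4096 can (a shifted 12-bit immediate), so (x < 4097) can be rewritten
+      // as (x <= 4096).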
+ switch (CC) {
+ default:
+ break;
+ case ISD::SETLT:
+ case ISD::SETGE:
+ if ((VT == MVT::i32 && C != 0x80000000 &&
+ isLegalArithImmed((uint32_t)(C - 1))) ||
+ (VT == MVT::i64 && C != 0x80000000ULL &&
+ isLegalArithImmed(C - 1ULL))) {
+ CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+ C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
+ RHS = DAG.getConstant(C, VT);
+ }
+ break;
+ case ISD::SETULT:
+ case ISD::SETUGE:
+ if ((VT == MVT::i32 && C != 0 &&
+ isLegalArithImmed((uint32_t)(C - 1))) ||
+ (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
+ CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+ C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
+ RHS = DAG.getConstant(C, VT);
+ }
+ break;
+ case ISD::SETLE:
+ case ISD::SETGT:
+ if ((VT == MVT::i32 && C != 0x7fffffff &&
+ isLegalArithImmed((uint32_t)(C + 1))) ||
+          (VT == MVT::i64 && C != 0x7fffffffffffffffULL &&
+ isLegalArithImmed(C + 1ULL))) {
+ CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+ C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
+ RHS = DAG.getConstant(C, VT);
+ }
+ break;
+ case ISD::SETULE:
+ case ISD::SETUGT:
+ if ((VT == MVT::i32 && C != 0xffffffff &&
+ isLegalArithImmed((uint32_t)(C + 1))) ||
+          (VT == MVT::i64 && C != 0xffffffffffffffffULL &&
+ isLegalArithImmed(C + 1ULL))) {
+ CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+ C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
+ RHS = DAG.getConstant(C, VT);
+ }
+ break;
+ }
+ }
+ }
+
+ SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
+ ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC);
+ ARM64cc = DAG.getConstant(ARM64CC, MVT::i32);
+ return Cmp;
+}
+
+static std::pair<SDValue, SDValue>
+getARM64XALUOOp(ARM64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
+ assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
+ "Unsupported value type");
+ SDValue Value, Overflow;
+ SDLoc DL(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ unsigned Opc = 0;
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown overflow instruction!");
+ case ISD::SADDO:
+ Opc = ARM64ISD::ADDS;
+ CC = ARM64CC::VS;
+ break;
+ case ISD::UADDO:
+ Opc = ARM64ISD::ADDS;
+ CC = ARM64CC::CS;
+ break;
+ case ISD::SSUBO:
+ Opc = ARM64ISD::SUBS;
+ CC = ARM64CC::VS;
+ break;
+ case ISD::USUBO:
+ Opc = ARM64ISD::SUBS;
+ CC = ARM64CC::CC;
+ break;
+  // Multiply needs a little extra work.
+ case ISD::SMULO:
+ case ISD::UMULO: {
+ CC = ARM64CC::NE;
+    bool IsSigned = Op.getOpcode() == ISD::SMULO;
+ if (Op.getValueType() == MVT::i32) {
+ unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ // For a 32 bit multiply with overflow check we want the instruction
+ // selector to generate a widening multiply (SMADDL/UMADDL). For that we
+ // need to generate the following pattern:
+      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)))
+ LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
+ RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
+ DAG.getConstant(0, MVT::i64));
+ // On ARM64 the upper 32 bits are always zero extended for a 32 bit
+ // operation. We need to clear out the upper 32 bits, because we used a
+ // widening multiply that wrote all 64 bits. In the end this should be a
+ // noop.
+ Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
+ if (IsSigned) {
+        // A signed overflow check needs more than testing whether any bit in
+        // the upper 32 bits of the result is set: those bits may simply be
+        // the sign extension of a negative 32-bit value. Instead,
+        // arithmetically shift the lower 32 bits of the result right by 31
+        // to replicate their sign bit, and compare that against the upper 32
+        // bits; a mismatch means the multiply overflowed.
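+        // For example, with %a = 0x7fffffff and %b = 2 the widened multiply
+        // yields 0x00000000fffffffe: UpperBits is 0, while the sign-replicated
+        // low 32 bits give 0xffffffff, so the SUBS result is non-zero and
+        // overflow is correctly reported.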
+ SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
+ DAG.getConstant(32, MVT::i64));
+ UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
+ SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
+ DAG.getConstant(31, MVT::i64));
+ // It is important that LowerBits is last, otherwise the arithmetic
+ // shift will not be folded into the compare (SUBS).
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
+ Overflow = DAG.getNode(ARM64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
+ .getValue(1);
+ } else {
+ // The overflow check for unsigned multiply is easy. We only need to
+ // check if any of the upper 32 bits are set. This can be done with a
+ // CMP (shifted register). For that we need to generate the following
+ // pattern:
+        // (i64 ARM64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
+ SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
+ DAG.getConstant(32, MVT::i64));
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ Overflow =
+ DAG.getNode(ARM64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
+ UpperBits).getValue(1);
+ }
+ break;
+ }
+ assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
+    // Handle the 64-bit multiply case.
+ Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
+ if (IsSigned) {
+ SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
+ SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
+ DAG.getConstant(63, MVT::i64));
+ // It is important that LowerBits is last, otherwise the arithmetic
+ // shift will not be folded into the compare (SUBS).
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ Overflow = DAG.getNode(ARM64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
+ .getValue(1);
+ } else {
+ SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
+ SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+ Overflow =
+ DAG.getNode(ARM64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
+ UpperBits).getValue(1);
+ }
+ break;
+ }
+ } // switch (...)
+
+ if (Opc) {
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+
+ // Emit the ARM64 operation with overflow check.
+ Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
+ Overflow = Value.getValue(1);
+ }
+ return std::make_pair(Value, Overflow);
+}
+
+SDValue ARM64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
+ RTLIB::Libcall Call) const {
+ SmallVector<SDValue, 2> Ops;
+ for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
+ Ops.push_back(Op.getOperand(i));
+
+ return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
+ SDLoc(Op)).first;
+}
+
+static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
+ SDValue Sel = Op.getOperand(0);
+ SDValue Other = Op.getOperand(1);
+
+ // If neither operand is a SELECT_CC, give up.
+ if (Sel.getOpcode() != ISD::SELECT_CC)
+ std::swap(Sel, Other);
+ if (Sel.getOpcode() != ISD::SELECT_CC)
+ return Op;
+
+ // The folding we want to perform is:
+ // (xor x, (select_cc a, b, cc, 0, -1) )
+ // -->
+ // (csel x, (xor x, -1), cc ...)
+ //
+ // The latter will get matched to a CSINV instruction.
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
+ SDValue LHS = Sel.getOperand(0);
+ SDValue RHS = Sel.getOperand(1);
+ SDValue TVal = Sel.getOperand(2);
+ SDValue FVal = Sel.getOperand(3);
+ SDLoc dl(Sel);
+
+ // FIXME: This could be generalized to non-integer comparisons.
+ if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
+ return Op;
+
+ ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+ ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
+
+  // If the values aren't constants, this isn't the pattern we're looking for.
+ if (!CFVal || !CTVal)
+ return Op;
+
+ // We can commute the SELECT_CC by inverting the condition. This
+ // might be needed to make this fit into a CSINV pattern.
+ if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+
+ // If the constants line up, perform the transform!
+ if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
+ SDValue CCVal;
+ SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+
+ FVal = Other;
+ TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
+ DAG.getConstant(-1ULL, Other.getValueType()));
+
+ return DAG.getNode(ARM64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
+ CCVal, Cmp);
+ }
+
+ return Op;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+ unsigned Opc;
+ bool ExtraOp = false;
+ switch (Op.getOpcode()) {
+ default:
+ assert(0 && "Invalid code");
+ case ISD::ADDC:
+ Opc = ARM64ISD::ADDS;
+ break;
+ case ISD::SUBC:
+ Opc = ARM64ISD::SUBS;
+ break;
+ case ISD::ADDE:
+ Opc = ARM64ISD::ADCS;
+ ExtraOp = true;
+ break;
+ case ISD::SUBE:
+ Opc = ARM64ISD::SBCS;
+ ExtraOp = true;
+ break;
+ }
+
+ if (!ExtraOp)
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
+ Op.getOperand(2));
+}
+
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+ return SDValue();
+
+ ARM64CC::CondCode CC;
+ // The actual operation that sets the overflow or carry flag.
+ SDValue Value, Overflow;
+ std::tie(Value, Overflow) = getARM64XALUOOp(CC, Op, DAG);
+
+ // We use 0 and 1 as false and true values.
+ SDValue TVal = DAG.getConstant(1, MVT::i32);
+ SDValue FVal = DAG.getConstant(0, MVT::i32);
+
+ // We use an inverted condition, because the conditional select is inverted
+ // too. This will allow it to be selected to a single instruction:
+ // CSINC Wd, WZR, WZR, invert(cond).
+ SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
+ Overflow = DAG.getNode(ARM64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, CCVal,
+ Overflow);
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
+}
+
+// Prefetch operands are:
+// 1: Address to prefetch
+// 2: bool isWrite
+// 3: int locality (0 = no locality ... 3 = extreme locality)
+// 4: bool isDataCache
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+  // The isData operand is currently ignored.
+ // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+
+ bool IsStream = !Locality;
+  // When a non-zero locality is given, invert it: the IR locality degree is
+  // the opposite of the target cache level, and the encoding starts at 0 for
+  // the L1 cache.
+  if (Locality) {
+    // The front-end should have filtered out out-of-range values.
+    assert(Locality <= 3 && "Prefetch locality out-of-range");
+    Locality = 3 - Locality;
+ }
+
+  // Build the mask value encoding the expected behavior.
+ unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
+ (Locality << 1) | // Cache level bits
+ (unsigned)IsStream; // Stream bit
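+  // For example, a read prefetch with locality 3 encodes as PrfOp == 0
+  // (PLDL1KEEP), and a read prefetch with locality 0 encodes as PrfOp == 1
+  // (PLDL1STRM).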
+ return DAG.getNode(ARM64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
+ DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1));
+}
+
+SDValue ARM64TargetLowering::LowerFP_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
+
+ RTLIB::Libcall LC;
+ LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
+
+ return LowerF128Call(Op, DAG, LC);
+}
+
+SDValue ARM64TargetLowering::LowerFP_ROUND(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Op.getOperand(0).getValueType() != MVT::f128) {
+ // It's legal except when f128 is involved
+ return Op;
+ }
+
+ RTLIB::Libcall LC;
+ LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+
+ // FP_ROUND node has a second operand indicating whether it is known to be
+ // precise. That doesn't take part in the LibCall so we can't directly use
+ // LowerF128Call.
+ SDValue SrcVal = Op.getOperand(0);
+ return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
+ /*isSigned*/ false, SDLoc(Op)).first;
+}
+
+static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+ // Warning: We maintain cost tables in ARM64TargetTransformInfo.cpp.
+ // Any additional optimization in this function should be recorded
+ // in the cost tables.
+ EVT InVT = Op.getOperand(0).getValueType();
+ EVT VT = Op.getValueType();
+
+  // FP_TO_XINT conversions between same-sized types are legal.
+ if (VT.getSizeInBits() == InVT.getSizeInBits())
+ return Op;
+
+ if (InVT == MVT::v2f64) {
+ SDLoc dl(Op);
+ SDValue Cv = DAG.getNode(Op.getOpcode(), dl, MVT::v2i64, Op.getOperand(0));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
+ }
+
+ // Type changing conversions are illegal.
+ return SDValue();
+}
+
+SDValue ARM64TargetLowering::LowerFP_TO_INT(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Op.getOperand(0).getValueType().isVector())
+ return LowerVectorFP_TO_INT(Op, DAG);
+
+ if (Op.getOperand(0).getValueType() != MVT::f128) {
+ // It's legal except when f128 is involved
+ return Op;
+ }
+
+ RTLIB::Libcall LC;
+ if (Op.getOpcode() == ISD::FP_TO_SINT)
+ LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
+ else
+ LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
+
+ SmallVector<SDValue, 2> Ops;
+ for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
+ Ops.push_back(Op.getOperand(i));
+
+ return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
+ SDLoc(Op)).first;
+}
+
+static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ // Warning: We maintain cost tables in ARM64TargetTransformInfo.cpp.
+ // Any additional optimization in this function should be recorded
+ // in the cost tables.
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ SDValue In = Op.getOperand(0);
+ EVT InVT = In.getValueType();
+
+ // v2i32 to v2f32 is legal.
+ if (VT == MVT::v2f32 && InVT == MVT::v2i32)
+ return Op;
+
+  // Handle v2f64 results by widening the input to v2i64.
+ if (VT == MVT::v2f64) {
+ // Extend the input argument to a v2i64 that we can feed into the
+ // floating point conversion. Zero or sign extend based on whether
+ // we're doing a signed or unsigned float conversion.
+ unsigned Opc =
+ Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ assert(Op.getNumOperands() == 1 && "FP conversions take one argument");
+ SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0));
+ return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted);
+ }
+
+ // Scalarize v2i64 to v2f32 conversions.
+ std::vector<SDValue> BuildVectorOps;
+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
+ SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In,
+ DAG.getConstant(i, MVT::i64));
+ Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr);
+ BuildVectorOps.push_back(Sclr);
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &BuildVectorOps[0],
+ BuildVectorOps.size());
+}
+
+SDValue ARM64TargetLowering::LowerINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Op.getValueType().isVector())
+ return LowerVectorINT_TO_FP(Op, DAG);
+
+ // i128 conversions are libcalls.
+ if (Op.getOperand(0).getValueType() == MVT::i128)
+ return SDValue();
+
+  // Other conversions are legal, unless the result is the completely
+  // software-based fp128.
+ if (Op.getValueType() != MVT::f128)
+ return Op;
+
+ RTLIB::Libcall LC;
+ if (Op.getOpcode() == ISD::SINT_TO_FP)
+ LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+ else
+ LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+
+ return LowerF128Call(Op, DAG, LC);
+}
+
+SDValue ARM64TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+ // For iOS, we want to call an alternative entry point: __sincos_stret,
+ // which returns the values in two S / D registers.
+ SDLoc dl(Op);
+ SDValue Arg = Op.getOperand(0);
+ EVT ArgVT = Arg.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+ ArgListTy Args;
+ ArgListEntry Entry;
+
+ Entry.Node = Arg;
+ Entry.Ty = ArgTy;
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+
+ const char *LibcallName =
+ (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
+ SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+
+ StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+ TargetLowering::CallLoweringInfo CLI(
+ DAG.getEntryNode(), RetTy, false, false, false, false, 0,
+ CallingConv::Fast, /*isTailCall=*/false,
+ /*doesNotRet=*/false, /*isReturnValueUsed*/ true, Callee, Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+ return CallResult.first;
+}
+
+SDValue ARM64TargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("unimplemented operand");
+ return SDValue();
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress:
+ return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::SETCC:
+ return LowerSETCC(Op, DAG);
+ case ISD::BR_CC:
+ return LowerBR_CC(Op, DAG);
+ case ISD::SELECT:
+ return LowerSELECT(Op, DAG);
+ case ISD::SELECT_CC:
+ return LowerSELECT_CC(Op, DAG);
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG);
+ case ISD::ConstantPool:
+ return LowerConstantPool(Op, DAG);
+ case ISD::BlockAddress:
+ return LowerBlockAddress(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+ case ISD::VACOPY:
+ return LowerVACOPY(Op, DAG);
+ case ISD::VAARG:
+ return LowerVAARG(Op, DAG);
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUBC:
+ case ISD::SUBE:
+ return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO:
+ return LowerXALUO(Op, DAG);
+ case ISD::FADD:
+ return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
+ case ISD::FSUB:
+ return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
+ case ISD::FMUL:
+ return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+ case ISD::FDIV:
+ return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
+ case ISD::FP_ROUND:
+ return LowerFP_ROUND(Op, DAG);
+ case ISD::FP_EXTEND:
+ return LowerFP_EXTEND(Op, DAG);
+ case ISD::FRAMEADDR:
+ return LowerFRAMEADDR(Op, DAG);
+ case ISD::RETURNADDR:
+ return LowerRETURNADDR(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT:
+ return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR:
+ return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::BUILD_VECTOR:
+ return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::VECTOR_SHUFFLE:
+ return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::EXTRACT_SUBVECTOR:
+ return LowerEXTRACT_SUBVECTOR(Op, DAG);
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::SHL:
+ return LowerVectorSRA_SRL_SHL(Op, DAG);
+ case ISD::SHL_PARTS:
+ return LowerShiftLeftParts(Op, DAG);
+ case ISD::SRL_PARTS:
+ case ISD::SRA_PARTS:
+ return LowerShiftRightParts(Op, DAG);
+ case ISD::CTPOP:
+ return LowerCTPOP(Op, DAG);
+ case ISD::FCOPYSIGN:
+ return LowerFCOPYSIGN(Op, DAG);
+ case ISD::AND:
+ return LowerVectorAND(Op, DAG);
+ case ISD::OR:
+ return LowerVectorOR(Op, DAG);
+ case ISD::XOR:
+ return LowerXOR(Op, DAG);
+ case ISD::PREFETCH:
+ return LowerPREFETCH(Op, DAG);
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ return LowerINT_TO_FP(Op, DAG);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return LowerFP_TO_INT(Op, DAG);
+ case ISD::FSINCOS:
+ return LowerFSINCOS(Op, DAG);
+ }
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned ARM64TargetLowering::getFunctionAlignment(const Function *F) const {
+ return 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "ARM64GenCallingConv.inc"
+
+/// Selects the correct CCAssignFn for the given CallingConvention
+/// value.
+CCAssignFn *ARM64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+ bool IsVarArg) const {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unsupported calling convention.");
+ case CallingConv::WebKit_JS:
+ return CC_ARM64_WebKit_JS;
+ case CallingConv::C:
+ case CallingConv::Fast:
+ if (!Subtarget->isTargetDarwin())
+ return CC_ARM64_AAPCS;
+ return IsVarArg ? CC_ARM64_DarwinPCS_VarArg : CC_ARM64_DarwinPCS;
+ }
+}
+
+SDValue ARM64TargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), ArgLocs, *DAG.getContext());
+
+ // At this point, Ins[].VT may already be promoted to i32. To correctly
+ // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
+ // i8 to CC_ARM64_AAPCS with i32 being ValVT and i8 being LocVT.
+ // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
+ // we use a special version of AnalyzeFormalArguments to pass in ValVT and
+ // LocVT.
+ unsigned NumArgs = Ins.size();
+ Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+ unsigned CurArgIdx = 0;
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ValVT = Ins[i].VT;
+ std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
+ CurArgIdx = Ins[i].OrigArgIndex;
+
+ // Get type of the original argument.
+ EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
+ MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
+ // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+ MVT LocVT = ValVT;
+ if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+ LocVT = MVT::i8;
+ else if (ActualMVT == MVT::i16)
+ LocVT = MVT::i16;
+
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+ bool Res =
+ AssignFn(i, ValVT, LocVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+
+ SmallVector<SDValue, 16> ArgValues;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+
+ // Arguments stored in registers.
+ if (VA.isRegLoc()) {
+ EVT RegVT = VA.getLocVT();
+
+ SDValue ArgValue;
+ const TargetRegisterClass *RC;
+
+ if (RegVT == MVT::i32)
+ RC = &ARM64::GPR32RegClass;
+ else if (RegVT == MVT::i64)
+ RC = &ARM64::GPR64RegClass;
+ else if (RegVT == MVT::f32)
+ RC = &ARM64::FPR32RegClass;
+ else if (RegVT == MVT::f64 || RegVT == MVT::v1i64 ||
+ RegVT == MVT::v1f64 || RegVT == MVT::v2i32 ||
+ RegVT == MVT::v4i16 || RegVT == MVT::v8i8)
+ RC = &ARM64::FPR64RegClass;
+ else if (RegVT == MVT::v2i64 || RegVT == MVT::v4i32 ||
+ RegVT == MVT::v8i16 || RegVT == MVT::v16i8)
+ RC = &ARM64::FPR128RegClass;
+ else
+ llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
+
+ // Transform the arguments in physical registers into virtual ones.
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
+
+ // If this is an 8, 16 or 32-bit value, it is really passed promoted
+ // to 64 bits. Insert an assert[sz]ext to capture this, then
+ // truncate to the right size.
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
+ break;
+ case CCValAssign::SExt:
+ ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
+ break;
+ case CCValAssign::ZExt:
+ ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
+ break;
+ }
+
+ InVals.push_back(ArgValue);
+
+ } else { // VA.isRegLoc()
+ assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
+ unsigned ArgOffset = VA.getLocMemOffset();
+ unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+ int FI = MFI->CreateFixedObject(ArgSize, ArgOffset, true);
+
+ // Create load nodes to retrieve arguments from the stack.
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(FI), false,
+ false, false, 0));
+ }
+ }
+
+ // varargs
+ if (isVarArg) {
+ if (!Subtarget->isTargetDarwin()) {
+ // The AAPCS variadic function ABI is identical to the non-variadic
+ // one. As a result there may be more arguments in registers and we should
+ // save them for future reference.
+ saveVarArgRegisters(CCInfo, DAG, DL, Chain);
+ }
+
+ ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
+ // This will point to the next argument passed via stack.
+ unsigned StackOffset = CCInfo.getNextStackOffset();
+ // We currently pass all varargs at 8-byte alignment.
+ StackOffset = ((StackOffset + 7) & ~7);
+ AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
+ }
+
+ return Chain;
+}
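
A minimal standalone sketch of the (StackOffset + 7) & ~7 rounding used above to place the varargs stack index at the next 8-byte boundary (the sample offsets are assumed, purely for illustration):

    #include <cassert>
    #include <cstdio>

    // Round a stack offset up to the next multiple of 8, mirroring the
    // ((StackOffset + 7) & ~7) computation that places the varargs area.
    static unsigned roundUpTo8(unsigned Offset) { return (Offset + 7) & ~7u; }

    int main() {
      assert(roundUpTo8(0) == 0);
      assert(roundUpTo8(4) == 8);   // e.g. a single i32 already on the stack
      assert(roundUpTo8(13) == 16);
      std::printf("first vararg slot for a 13-byte fixed area: %u\n",
                  roundUpTo8(13));
      return 0;
    }
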
+
+void ARM64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
+ SelectionDAG &DAG, SDLoc DL,
+ SDValue &Chain) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
+
+ SmallVector<SDValue, 8> MemOps;
+
+ static const uint16_t GPRArgRegs[] = { ARM64::X0, ARM64::X1, ARM64::X2,
+ ARM64::X3, ARM64::X4, ARM64::X5,
+ ARM64::X6, ARM64::X7 };
+ static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
+ unsigned FirstVariadicGPR =
+ CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);
+
+ static const uint16_t FPRArgRegs[] = { ARM64::Q0, ARM64::Q1, ARM64::Q2,
+ ARM64::Q3, ARM64::Q4, ARM64::Q5,
+ ARM64::Q6, ARM64::Q7 };
+ static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
+ unsigned FirstVariadicFPR =
+ CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
+
+ unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
+ int GPRIdx = 0;
+ if (GPRSaveSize != 0) {
+ GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
+
+ SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
+
+ for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
+ unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &ARM64::GPR64RegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getStack(i * 8), false, false, 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+ DAG.getConstant(8, getPointerTy()));
+ }
+ }
+
+ unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
+ int FPRIdx = 0;
+ if (FPRSaveSize != 0) {
+ FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
+
+ SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
+
+ for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
+ unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &ARM64::FPR128RegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::v2i64);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getStack(i * 16), false, false, 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+ DAG.getConstant(16, getPointerTy()));
+ }
+ }
+
+ FuncInfo->setVarArgsGPRIndex(GPRIdx);
+ FuncInfo->setVarArgsGPRSize(GPRSaveSize);
+ FuncInfo->setVarArgsFPRIndex(FPRIdx);
+ FuncInfo->setVarArgsFPRSize(FPRSaveSize);
+
+ if (!MemOps.empty()) {
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
+ MemOps.size());
+ }
+}
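
To make the save-area arithmetic concrete, a small hedged sketch with assumed register usage (three fixed GPR arguments and one fixed FPR argument; the numbers are illustrative, not taken from the patch):

    #include <cstdio>

    // The varargs save area mirrors the computation above: eight 64-bit GPRs
    // (X0-X7, 8 bytes each) and eight 128-bit vector registers (Q0-Q7,
    // 16 bytes each) may need to be spilled for a variadic function.
    int main() {
      const unsigned NumGPRArgRegs = 8, NumFPRArgRegs = 8;
      // Assumed example: the fixed arguments used X0-X2 and Q0 before the "...".
      const unsigned FirstVariadicGPR = 3, FirstVariadicFPR = 1;

      const unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);   // 40
      const unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);  // 112
      std::printf("GPR save area: %u bytes, FPR save area: %u bytes\n",
                  GPRSaveSize, FPRSaveSize);
      return 0;
    }
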
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue ARM64TargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const {
+ CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS
+ : RetCC_ARM64_AAPCS;
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign VA = RVLocs[i];
+
+ // Pass 'this' value directly from the argument to return value, to avoid
+ // reg unit interference
+ if (i == 0 && isThisReturn) {
+ assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
+ "unexpected return calling convention register assignment");
+ InVals.push_back(ThisVal);
+ continue;
+ }
+
+ SDValue Val =
+ DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+ break;
+ }
+
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+bool ARM64TargetLowering::isEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ bool isCalleeStructRet, bool isCallerStructRet,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+ // Look for obvious safe cases to perform tail call optimization that do not
+ // require ABI changes. This is what GCC calls a sibcall.
+
+ // Do not sibcall optimize vararg calls unless the call site is not passing
+ // any arguments.
+ if (isVarArg && !Outs.empty())
+ return false;
+
+ // Also avoid sibcall optimization if either caller or callee uses struct
+ // return semantics.
+ if (isCalleeStructRet || isCallerStructRet)
+ return false;
+
+ // Note that currently ARM64 "C" calling convention and "Fast" calling
+ // convention are compatible. If/when that ever changes, we'll need to
+ // add checks here to make sure any interactions are OK.
+
+ // If the callee takes no arguments then go on to check the results of the
+ // call.
+ if (!Outs.empty()) {
+ // Check if stack adjustment is needed. For now, do not do this if any
+ // argument is passed on the stack.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCAssignFn *AssignFn = CCAssignFnForCall(CalleeCC, /*IsVarArg=*/false);
+ CCInfo.AnalyzeCallOperands(Outs, AssignFn);
+ if (CCInfo.getNextStackOffset()) {
+ // Check if the arguments are already laid out in the right way as
+ // the caller's fixed stack objects.
+ for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
+ ++i, ++realArgIdx) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ return false;
+ if (VA.needsCustom()) {
+ // Just don't handle anything that needs custom adjustments for now.
+ // If need be, we can revisit later, but we shouldn't ever end up
+ // here.
+ return false;
+ } else if (!VA.isRegLoc()) {
+ // Likewise, don't try to handle stack based arguments for the
+ // time being.
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
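
For intuition, a hedged C++ example of a call site that clears the checks above: non-variadic, no struct return, and both arguments fit in registers, so no stack adjustment is needed (function names are made up):

    #include <cstdio>

    // Both arguments land in registers (x0, x1), there are no varargs and no
    // struct return, so the call needs no stack adjustment and is a candidate
    // for the sibcall (plain branch) form admitted by the checks above.
    static long callee(long A, long B) { return A + B; }
    static long caller(long X) { return callee(X, X + 1); }

    int main() {
      std::printf("%ld\n", caller(20)); // prints 41
      return 0;
    }
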
+/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
+/// and add input and output parameter nodes.
+SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &DL = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+ bool IsThisReturn = false;
+
+ // If tail calls are explicitly disabled, make sure not to use them.
+ if (!EnableARM64TailCalls)
+ IsTailCall = false;
+
+ if (IsTailCall) {
+ // Check if it's really possible to do a tail call.
+ IsTailCall = isEligibleForTailCallOptimization(
+ Callee, CallConv, IsVarArg, IsStructRet,
+ MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
+ // We don't support GuaranteedTailCallOpt, only automatically
+ // detected sibcalls.
+ // FIXME: Re-evaluate. Is this true? Should it be true?
+ if (IsTailCall)
+ ++NumTailCalls;
+ }
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), ArgLocs, *DAG.getContext());
+
+ if (IsVarArg) {
+ // Handle fixed and variable vector arguments differently.
+ // Variable vector arguments always go into memory.
+ unsigned NumArgs = Outs.size();
+
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ArgVT = Outs[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
+ /*IsVarArg=*/ !Outs[i].IsFixed);
+ bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+ } else {
+ // At this point, Outs[].VT may already be promoted to i32. To correctly
+ // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
+ // i8 to CC_ARM64_AAPCS with i32 being ValVT and i8 being LocVT.
+ // Since AnalyzeCallOperands uses Outs[].VT for both ValVT and LocVT, here
+ // we use a special version of AnalyzeCallOperands to pass in ValVT and
+ // LocVT.
+ unsigned NumArgs = Outs.size();
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ValVT = Outs[i].VT;
+ // Get type of the original argument.
+ EVT ActualVT = getValueType(CLI.Args[Outs[i].OrigArgIndex].Ty,
+ /*AllowUnknown*/ true);
+ MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+ MVT LocVT = ValVT;
+ if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+ LocVT = MVT::i8;
+ else if (ActualMVT == MVT::i16)
+ LocVT = MVT::i16;
+
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+ bool Res = AssignFn(i, ValVT, LocVT, CCValAssign::Full, ArgFlags, CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+ }
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ if (!IsTailCall)
+ Chain =
+ DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
+
+ SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, ARM64::SP, getPointerTy());
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
+ ++i, ++realArgIdx) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[realArgIdx];
+ ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::FPExt:
+ Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc()) {
+ if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
+ assert(VA.getLocVT() == MVT::i64 &&
+ "unexpected calling convention register assignment");
+ assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
+ "unexpected use of 'returned'");
+ IsThisReturn = true;
+ }
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+ // There's no reason we can't support stack args w/ tailcall, but
+ // we currently don't, so assert if we see one.
+ assert(!IsTailCall && "stack argument with tail call!?");
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+ PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+
+ // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
+ // promoted to a legal register type i32, we should truncate Arg back to
+ // i1/i8/i16.
+ if (Arg.getValueType().isSimple() &&
+ Arg.getValueType().getSimpleVT() == MVT::i32 &&
+ (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 ||
+ VA.getLocVT() == MVT::i16))
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
+
+ SDValue Store = DAG.getStore(Chain, DL, Arg, PtrOff,
+ MachinePointerInfo::getStack(LocMemOffset),
+ false, false, 0);
+ MemOpChains.push_back(Store);
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOpChains[0],
+ MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+ // direct call is), turn it into a TargetGlobalAddress/TargetExternalSymbol
+ // node so that legalize doesn't hack it.
+ if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+ Subtarget->isTargetMachO()) {
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = G->getGlobal();
+ bool InternalLinkage = GV->hasInternalLinkage();
+ if (InternalLinkage)
+ Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+ else {
+ Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
+ ARM64II::MO_GOT);
+ Callee = DAG.getNode(ARM64ISD::LOADgot, DL, getPointerTy(), Callee);
+ }
+ } else if (ExternalSymbolSDNode *S =
+ dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ const char *Sym = S->getSymbol();
+ Callee =
+ DAG.getTargetExternalSymbol(Sym, getPointerTy(), ARM64II::MO_GOT);
+ Callee = DAG.getNode(ARM64ISD::LOADgot, DL, getPointerTy(), Callee);
+ }
+ } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = G->getGlobal();
+ Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ const char *Sym = S->getSymbol();
+ Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
+ }
+
+ std::vector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const uint32_t *Mask;
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const ARM64RegisterInfo *ARI = static_cast<const ARM64RegisterInfo *>(TRI);
+ if (IsThisReturn) {
+ // For 'this' returns, use the X0-preserving mask if applicable
+ Mask = ARI->getThisReturnPreservedMask(CallConv);
+ if (!Mask) {
+ IsThisReturn = false;
+ Mask = ARI->getCallPreservedMask(CallConv);
+ }
+ } else
+ Mask = ARI->getCallPreservedMask(CallConv);
+
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // If we're doing a tail call, use a TC_RETURN here rather than an
+ // actual call instruction.
+ if (IsTailCall)
+ return DAG.getNode(ARM64ISD::TC_RETURN, DL, NodeTys, &Ops[0], Ops.size());
+
+ // Returns a chain and a flag for retval copy to use.
+ Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), InFlag, DL);
+ if (!Ins.empty())
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals, IsThisReturn,
+ IsThisReturn ? OutVals[0] : SDValue());
+}
+
+bool ARM64TargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+ CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS
+ : RetCC_ARM64_AAPCS;
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC);
+}
+
+SDValue
+ARM64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc DL, SelectionDAG &DAG) const {
+ CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS
+ : RetCC_ARM64_AAPCS;
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC);
+
+ // Copy the result values into the output registers.
+ SDValue Flag;
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+ for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
+ ++i, ++realRVLocIdx) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ SDValue Arg = OutVals[realRVLocIdx];
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(ARM64ISD::RET_FLAG, DL, MVT::Other, &RetOps[0],
+ RetOps.size());
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+SDValue ARM64TargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT PtrVT = getPointerTy();
+ SDLoc DL(Op);
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ unsigned char OpFlags =
+ Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+
+ assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
+ "unexpected offset in global node");
+
+ // This also catches the large code model case for Darwin.
+ if ((OpFlags & ARM64II::MO_GOT) != 0) {
+ SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
+ // FIXME: Once remat is capable of dealing with instructions with register
+ // operands, expand this into two nodes instead of using a wrapper node.
+ return DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, GotAddr);
+ }
+
+ if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+ const unsigned char MO_NC = ARM64II::MO_NC;
+ return DAG.getNode(
+ ARM64ISD::WrapperLarge, DL, PtrVT,
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G3),
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G2 | MO_NC),
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G1 | MO_NC),
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G0 | MO_NC));
+ } else {
+ // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
+ // the only correct model on Darwin.
+ SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+ OpFlags | ARM64II::MO_PAGE);
+ unsigned char LoFlags = OpFlags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC;
+ SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
+
+ SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi);
+ return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ }
+}
+
+/// \brief Convert a TLS address reference into the correct sequence of loads
+/// and calls to compute the variable's address (for Darwin, currently) and
+/// return an SDValue containing the final node.
+///
+/// Darwin only has one TLS scheme which must be capable of dealing with the
+/// fully general situation, in the worst case. This means:
+/// + "extern __thread" declaration.
+/// + Defined in a possibly unknown dynamic library.
+///
+/// The general system is that each __thread variable has a [3 x i64] descriptor
+/// which contains information used by the runtime to calculate the address. The
+/// only part of this the compiler needs to know about is the first xword, which
+/// contains a function pointer that must be called with the address of the
+/// entire descriptor in "x0".
+///
+/// Since this descriptor may be in a different unit, in general even the
+/// descriptor must be accessed via an indirect load. The "ideal" code sequence
+/// is:
+/// adrp x0, _var@TLVPPAGE
+/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
+/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
+/// ; the function pointer
+/// blr x1 ; Uses descriptor address in x0
+/// ; Address of _var is now in x0.
+///
+/// If the address of _var's descriptor *is* known to the linker, then it can
+/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
+/// a slight efficiency gain.
+SDValue
+ARM64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
+
+ SDLoc DL(Op);
+ MVT PtrVT = getPointerTy();
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+
+ SDValue TLVPAddr =
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS);
+ SDValue DescAddr = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, TLVPAddr);
+
+ // The first entry in the descriptor is a function pointer that we must call
+ // to obtain the address of the variable.
+ SDValue Chain = DAG.getEntryNode();
+ SDValue FuncTLVGet =
+ DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(),
+ false, true, true, 8);
+ Chain = FuncTLVGet.getValue(1);
+
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setAdjustsStack(true);
+
+ // TLS calls preserve all registers except those that absolutely must be
+ // trashed: X0 (it takes an argument), LR (it's a call) and CPSR (let's not be
+ // silly).
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const ARM64RegisterInfo *ARI = static_cast<const ARM64RegisterInfo *>(TRI);
+ const uint32_t *Mask = ARI->getTLSCallPreservedMask();
+
+ // Finally, we can make the call. This is just a degenerate version of a
+ // normal ARM64 call node: x0 takes the address of the descriptor, and returns
+ // the address of the variable in this thread.
+ Chain = DAG.getCopyToReg(Chain, DL, ARM64::X0, DescAddr, SDValue());
+ Chain = DAG.getNode(ARM64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+ Chain, FuncTLVGet, DAG.getRegister(ARM64::X0, MVT::i64),
+ DAG.getRegisterMask(Mask), Chain.getValue(1));
+ return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Chain.getValue(1));
+}
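
A hedged source-level example of the access pattern described in the comment block above; the variable name is illustrative, it is assumed to be defined in another library, and whether the linker can relax the first load into an add depends on where the descriptor actually lives:

    // Reading an external thread-local variable on Darwin: the compiler only
    // needs the first xword of the [3 x i64] descriptor, a resolver function
    // called with the descriptor address in x0, as in the sequence above.
    extern __thread int tls_counter; // assumed to be defined elsewhere

    int readCounter() {
      return tls_counter; // lowers to the adrp/ldr/ldr/blr sequence shown above
    }
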
+
+/// When accessing thread-local variables under either the general-dynamic or
+/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
+/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
+/// is a function pointer to carry out the resolution. This function takes the
+/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
+/// other registers (except LR, CPSR) are preserved.
+///
+/// Thus, the ideal call sequence on AArch64 is:
+///
+/// adrp x0, :tlsdesc:thread_var
+/// ldr x8, [x0, :tlsdesc_lo12:thread_var]
+/// add x0, x0, :tlsdesc_lo12:thread_var
+/// .tlsdesccall thread_var
+/// blr x8
+/// (TPIDR_EL0 offset now in x0).
+///
+/// The ".tlsdesccall" directive instructs the assembler to insert a particular
+/// relocation to help the linker relax this sequence if it turns out to be too
+/// conservative.
+///
+/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this
+/// is harmless.
+SDValue ARM64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
+ SDValue DescAddr, SDLoc DL,
+ SelectionDAG &DAG) const {
+ EVT PtrVT = getPointerTy();
+
+ // The function we need to call is simply the first entry in the GOT for this
+ // descriptor, load it in preparation.
+ SDValue Func = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, SymAddr);
+
+ // TLS calls preserve all registers except those that absolutely must be
+ // trashed: X0 (it takes an argument), LR (it's a call) and CPSR (let's not be
+ // silly).
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const ARM64RegisterInfo *ARI = static_cast<const ARM64RegisterInfo *>(TRI);
+ const uint32_t *Mask = ARI->getTLSCallPreservedMask();
+
+ // The function takes only one argument: the address of the descriptor itself
+ // in X0.
+ SDValue Glue, Chain;
+ Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM64::X0, DescAddr, Glue);
+ Glue = Chain.getValue(1);
+
+ // We're now ready to populate the argument list, as with a normal call:
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Func);
+ Ops.push_back(SymAddr);
+ Ops.push_back(DAG.getRegister(ARM64::X0, PtrVT));
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ Ops.push_back(Glue);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getNode(ARM64ISD::TLSDESC_CALL, DL, NodeTys, &Ops[0], Ops.size());
+ Glue = Chain.getValue(1);
+
+ return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Glue);
+}
+
+SDValue ARM64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetELF() && "This function expects an ELF target");
+ assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+ "ELF TLS only supported in small memory model");
+ const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+ TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
+
+ SDValue TPOff;
+ EVT PtrVT = getPointerTy();
+ SDLoc DL(Op);
+ const GlobalValue *GV = GA->getGlobal();
+
+ SDValue ThreadBase = DAG.getNode(ARM64ISD::THREAD_POINTER, DL, PtrVT);
+
+ if (Model == TLSModel::LocalExec) {
+ SDValue HiVar = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_G1);
+ SDValue LoVar = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_G0 | ARM64II::MO_NC);
+
+ TPOff = SDValue(DAG.getMachineNode(ARM64::MOVZXi, DL, PtrVT, HiVar,
+ DAG.getTargetConstant(16, MVT::i32)),
+ 0);
+ TPOff = SDValue(DAG.getMachineNode(ARM64::MOVKXi, DL, PtrVT, TPOff, LoVar,
+ DAG.getTargetConstant(0, MVT::i32)),
+ 0);
+ } else if (Model == TLSModel::InitialExec) {
+ TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS);
+ TPOff = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, TPOff);
+ } else if (Model == TLSModel::LocalDynamic) {
+ // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
+ // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
+ // the beginning of the module's TLS region, followed by a DTPREL offset
+ // calculation.
+
+ // These accesses will need deduplicating if there's more than one.
+ ARM64FunctionInfo *MFI =
+ DAG.getMachineFunction().getInfo<ARM64FunctionInfo>();
+ MFI->incNumLocalDynamicTLSAccesses();
+
+ // Accesses used in this sequence go via the TLS descriptor which lives in
+ // the GOT. Prepare an address we can use to handle this.
+ SDValue HiDesc = DAG.getTargetExternalSymbol(
+ "_TLS_MODULE_BASE_", PtrVT, ARM64II::MO_TLS | ARM64II::MO_PAGE);
+ SDValue LoDesc = DAG.getTargetExternalSymbol(
+ "_TLS_MODULE_BASE_", PtrVT,
+ ARM64II::MO_TLS | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
+
+ // First argument to the descriptor call is the address of the descriptor
+ // itself.
+ SDValue DescAddr = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, HiDesc);
+ DescAddr = DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
+
+ // The call needs a relocation too for linker relaxation. It doesn't make
+ // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
+ // the address.
+ SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
+ ARM64II::MO_TLS);
+
+ // Now we can calculate the offset from TPIDR_EL0 to this module's
+ // thread-local area.
+ TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
+
+ // Now use :dtprel_whatever: operations to calculate this variable's offset
+ // in its thread-storage area.
+ SDValue HiVar = DAG.getTargetGlobalAddress(
+ GV, DL, MVT::i64, 0, ARM64II::MO_TLS | ARM64II::MO_G1);
+ SDValue LoVar = DAG.getTargetGlobalAddress(
+ GV, DL, MVT::i64, 0, ARM64II::MO_TLS | ARM64II::MO_G0 | ARM64II::MO_NC);
+
+ SDValue DTPOff =
+ SDValue(DAG.getMachineNode(ARM64::MOVZXi, DL, PtrVT, HiVar,
+ DAG.getTargetConstant(16, MVT::i32)),
+ 0);
+ DTPOff = SDValue(DAG.getMachineNode(ARM64::MOVKXi, DL, PtrVT, DTPOff, LoVar,
+ DAG.getTargetConstant(0, MVT::i32)),
+ 0);
+
+ TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff);
+ } else if (Model == TLSModel::GeneralDynamic) {
+ // Accesses used in this sequence go via the TLS descriptor which lives in
+ // the GOT. Prepare an address we can use to handle this.
+ SDValue HiDesc = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_PAGE);
+ SDValue LoDesc = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0,
+ ARM64II::MO_TLS | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
+
+ // First argument to the descriptor call is the address of the descriptor
+ // itself.
+ SDValue DescAddr = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, HiDesc);
+ DescAddr = DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
+
+ // The call needs a relocation too for linker relaxation. It doesn't make
+ // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
+ // the address.
+ SDValue SymAddr =
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS);
+
+ // Finally we can make a call to calculate the offset from tpidr_el0.
+ TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
+ } else
+ llvm_unreachable("Unsupported ELF TLS access model");
+
+ return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
+}
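
A small hedged model of the local-exec materialization above: MOVZXi plants the MO_G1 (high) half-word shifted left by 16, and MOVKXi merges in the MO_G0 (low) half-word, rebuilding the offset from TPIDR_EL0 (the sample offset is assumed):

    #include <cassert>
    #include <cstdint>

    // Model of the MOVZXi + MOVKXi pair emitted above for a local-exec TLS
    // offset: MOVZ writes the high half-word shifted by 16, MOVK inserts the
    // low half-word without disturbing the other bits.
    static uint64_t movz_lsl16(uint16_t Imm) { return uint64_t(Imm) << 16; }
    static uint64_t movk_lsl0(uint64_t Reg, uint16_t Imm) {
      return (Reg & ~uint64_t(0xffff)) | Imm;
    }

    int main() {
      uint32_t TPOffset = 0x12345; // assumed offset of the variable from TPIDR_EL0
      uint64_t R = movz_lsl16(uint16_t(TPOffset >> 16)); // MO_G1 half
      R = movk_lsl0(R, uint16_t(TPOffset & 0xffff));     // MO_G0 half
      assert(R == TPOffset);
      return 0;
    }
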
+
+SDValue ARM64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Subtarget->isTargetDarwin())
+ return LowerDarwinGlobalTLSAddress(Op, DAG);
+ else if (Subtarget->isTargetELF())
+ return LowerELFGlobalTLSAddress(Op, DAG);
+
+ llvm_unreachable("Unexpected platform trying to use TLS");
+}
+
+SDValue ARM64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ SDLoc dl(Op);
+
+ // Handle f128 first, since lowering it will result in comparing the return
+ // value of a libcall against zero, which is just what the rest of LowerBR_CC
+ // is expecting to deal with.
+ if (LHS.getValueType() == MVT::f128) {
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+ // If softenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (RHS.getNode() == 0) {
+ RHS = DAG.getConstant(0, LHS.getValueType());
+ CC = ISD::SETNE;
+ }
+ }
+
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+ // instruction.
+ unsigned Opc = LHS.getOpcode();
+ if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
+ cast<ConstantSDNode>(RHS)->isOne() &&
+ (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+ Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
+ assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ "Unexpected condition code.");
+ // Only lower legal XALUO ops.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
+ return SDValue();
+
+ // The actual operation with overflow check.
+ ARM64CC::CondCode OFCC;
+ SDValue Value, Overflow;
+ std::tie(Value, Overflow) = getARM64XALUOOp(OFCC, LHS.getValue(0), DAG);
+
+ if (CC == ISD::SETNE)
+ OFCC = getInvertedCondCode(OFCC);
+ SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
+
+ return DAG.getNode(ARM64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
+ CCVal, Overflow);
+ }
+
+ if (LHS.getValueType().isInteger()) {
+ assert((LHS.getValueType() == RHS.getValueType()) &&
+ (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
+
+ // If the RHS of the comparison is zero, we can potentially fold this
+ // to a specialized branch.
+ const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
+ if (RHSC && RHSC->getZExtValue() == 0) {
+ if (CC == ISD::SETEQ) {
+ // See if we can use a TBZ to fold in an AND as well.
+ // TBZ has a smaller branch displacement than CBZ. If the offset is
+ // out of bounds, a late MI-layer pass rewrites branches.
+ // 403.gcc is an example that hits this case.
+ if (LHS.getOpcode() == ISD::AND &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ isPowerOf2_64(LHS.getConstantOperandVal(1))) {
+ SDValue Test = LHS.getOperand(0);
+ uint64_t Mask = LHS.getConstantOperandVal(1);
+
+ // TBZ only operates on i64's, but the ext should be free.
+ if (Test.getValueType() == MVT::i32)
+ Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
+
+ return DAG.getNode(ARM64ISD::TBZ, dl, MVT::Other, Chain, Test,
+ DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
+ }
+
+ return DAG.getNode(ARM64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
+ } else if (CC == ISD::SETNE) {
+ // See if we can use a TBZ to fold in an AND as well.
+ // TBZ has a smaller branch displacement than CBZ. If the offset is
+ // out of bounds, a late MI-layer pass rewrites branches.
+ // 403.gcc is an example that hits this case.
+ if (LHS.getOpcode() == ISD::AND &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ isPowerOf2_64(LHS.getConstantOperandVal(1))) {
+ SDValue Test = LHS.getOperand(0);
+ uint64_t Mask = LHS.getConstantOperandVal(1);
+
+ // TBNZ only operates on i64's, but the ext should be free.
+ if (Test.getValueType() == MVT::i32)
+ Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
+
+ return DAG.getNode(ARM64ISD::TBNZ, dl, MVT::Other, Chain, Test,
+ DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
+ }
+
+ return DAG.getNode(ARM64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
+ }
+ }
+
+ SDValue CCVal;
+ SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+ return DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
+ }
+
+ assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+ // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally
+ // clean. Some of them require two branches to implement.
+ SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
+ ARM64CC::CondCode CC1, CC2;
+ changeFPCCToARM64CC(CC, CC1, CC2);
+ SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+ SDValue BR1 =
+ DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
+ if (CC2 != ARM64CC::AL) {
+ SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+ return DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
+ Cmp);
+ }
+
+ return BR1;
+}
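
A hedged example of the AND-with-power-of-two pattern the TBZ/TBNZ folding above is aimed at (function names are illustrative):

    #include <cstdio>

    // "(X & 8) == 0" tests only bit 3 of X, so instead of materializing the AND
    // and comparing against zero it can be emitted as a single
    // test-bit-and-branch (tbz/tbnz on bit Log2_64(8) == 3), which is the
    // pattern the folding above matches.
    static int onBitClear() { return 0; }
    static int onBitSet() { return 1; }

    static int dispatch(unsigned long X) {
      if ((X & 8) == 0)
        return onBitClear();
      return onBitSet();
    }

    int main() {
      std::printf("%d %d\n", dispatch(0), dispatch(8)); // prints 0 1
      return 0;
    }
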
+
+SDValue ARM64TargetLowering::LowerFCOPYSIGN(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+
+ SDValue In1 = Op.getOperand(0);
+ SDValue In2 = Op.getOperand(1);
+ EVT SrcVT = In2.getValueType();
+ if (SrcVT != VT) {
+ if (SrcVT == MVT::f32 && VT == MVT::f64)
+ In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
+ else if (SrcVT == MVT::f64 && VT == MVT::f32)
+ In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0));
+ else
+ // FIXME: Src type is different, bail out for now. Can VT really be a
+ // vector type?
+ return SDValue();
+ }
+
+ EVT VecVT;
+ EVT EltVT;
+ SDValue EltMask, VecVal1, VecVal2;
+ if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
+ EltVT = MVT::i32;
+ VecVT = MVT::v4i32;
+ EltMask = DAG.getConstant(0x80000000ULL, EltVT);
+
+ if (!VT.isVector()) {
+ VecVal1 = DAG.getTargetInsertSubreg(ARM64::ssub, DL, VecVT,
+ DAG.getUNDEF(VecVT), In1);
+ VecVal2 = DAG.getTargetInsertSubreg(ARM64::ssub, DL, VecVT,
+ DAG.getUNDEF(VecVT), In2);
+ } else {
+ VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+ VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+ }
+ } else if (VT == MVT::f64 || VT == MVT::v2f64) {
+ EltVT = MVT::i64;
+ VecVT = MVT::v2i64;
+
+ // We want to materialize a mask with the high bit set, but the AdvSIMD
+ // immediate moves cannot materialize that in a single instruction for
+ // 64-bit elements. Instead, materialize zero and then negate it.
+ EltMask = DAG.getConstant(0, EltVT);
+
+ if (!VT.isVector()) {
+ VecVal1 = DAG.getTargetInsertSubreg(ARM64::dsub, DL, VecVT,
+ DAG.getUNDEF(VecVT), In1);
+ VecVal2 = DAG.getTargetInsertSubreg(ARM64::dsub, DL, VecVT,
+ DAG.getUNDEF(VecVT), In2);
+ } else {
+ VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+ VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+ }
+ } else {
+ llvm_unreachable("Invalid type for copysign!");
+ }
+
+ std::vector<SDValue> BuildVectorOps;
+ for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
+ BuildVectorOps.push_back(EltMask);
+
+ SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT,
+ &BuildVectorOps[0], BuildVectorOps.size());
+
+ // If we couldn't materialize the mask above, then the mask vector will be
+ // the zero vector, and we need to negate it here.
+ if (VT == MVT::f64 || VT == MVT::v2f64) {
+ BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
+ BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
+ BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
+ }
+
+ SDValue Sel =
+ DAG.getNode(ARM64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
+
+ if (VT == MVT::f32)
+ return DAG.getTargetExtractSubreg(ARM64::ssub, DL, VT, Sel);
+ else if (VT == MVT::f64)
+ return DAG.getTargetExtractSubreg(ARM64::dsub, DL, VT, Sel);
+ else
+ return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
+}
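
A hedged scalar model of what the BIT (bitwise insert if true) select above computes: keep the magnitude bits of the first operand and take only the sign bit from the second.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Scalar model of FCOPYSIGN for f64: the 0x8000...0 mask plays the role of
    // the negated-zero vector built above, selecting only the sign bit of In2.
    static double copySignModel(double Mag, double Sign) {
      uint64_t M, S;
      std::memcpy(&M, &Mag, sizeof(M));
      std::memcpy(&S, &Sign, sizeof(S));
      const uint64_t SignMask = 0x8000000000000000ULL;
      const uint64_t R = (M & ~SignMask) | (S & SignMask);
      double Res;
      std::memcpy(&Res, &R, sizeof(Res));
      return Res;
    }

    int main() {
      assert(copySignModel(1.5, -0.0) == -1.5);
      assert(copySignModel(-2.25, 3.0) == 2.25);
      return 0;
    }
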
+
+SDValue ARM64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+ if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
+ return SDValue();
+
+ // While there is no integer popcount instruction, it can
+ // be more efficiently lowered to the following sequence that uses
+ // AdvSIMD registers/instructions as long as the copies to/from
+ // the AdvSIMD registers are cheap.
+ // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
+ // CNT V0.8B, V0.8B // 8xbyte pop-counts
+ // ADDV B0, V0.8B // sum 8xbyte pop-counts
+ // UMOV X0, V0.B[0] // copy byte result back to integer reg
+ SDValue Val = Op.getOperand(0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);
+
+ SDValue VecVal;
+ if (VT == MVT::i32) {
+ VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
+ VecVal =
+ DAG.getTargetInsertSubreg(ARM64::ssub, DL, MVT::v8i8, ZeroVec, VecVal);
+ } else {
+ VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
+ }
+
+ SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
+ SDValue UaddLV = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+ DAG.getConstant(Intrinsic::arm64_neon_uaddlv, MVT::i32), CtPop);
+
+ if (VT == MVT::i64)
+ UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
+ return UaddLV;
+}
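
A hedged scalar model of the CNT + ADDV strategy described in the comment above: split the 64-bit value into bytes, take each byte's population count, and sum them.

    #include <cassert>
    #include <cstdint>

    // Model of CNT (per-byte popcount) followed by ADDV/UADDLV (sum across the
    // vector): the popcount of a 64-bit value equals the sum of the popcounts
    // of its eight bytes.
    static unsigned popcountViaBytes(uint64_t V) {
      unsigned Total = 0;
      for (int i = 0; i < 8; ++i) {
        uint8_t Byte = uint8_t(V >> (8 * i));
        unsigned C = 0;
        for (; Byte; Byte &= Byte - 1) // clear the lowest set bit
          ++C;
        Total += C;
      }
      return Total;
    }

    int main() {
      assert(popcountViaBytes(0) == 0);
      assert(popcountViaBytes(0xFFULL) == 8);
      assert(popcountViaBytes(0x8000000000000001ULL) == 2);
      return 0;
    }
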
+
+SDValue ARM64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+ if (Op.getValueType().isVector())
+ return LowerVSETCC(Op, DAG);
+
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ SDLoc dl(Op);
+
+ // We chose ZeroOrOneBooleanContents, so use zero and one.
+ EVT VT = Op.getValueType();
+ SDValue TVal = DAG.getConstant(1, VT);
+ SDValue FVal = DAG.getConstant(0, VT);
+
+ // Handle f128 first, since one possible outcome is a normal integer
+ // comparison which gets picked up by the next if statement.
+ if (LHS.getValueType() == MVT::f128) {
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+ // If softenSetCCOperands returned a scalar, use it.
+ if (RHS.getNode() == 0) {
+ assert(LHS.getValueType() == Op.getValueType() &&
+ "Unexpected setcc expansion!");
+ return LHS;
+ }
+ }
+
+ if (LHS.getValueType().isInteger()) {
+ SDValue CCVal;
+ SDValue Cmp =
+ getARM64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
+
+ // Note that we inverted the condition above, so we reverse the order of
+ // the true and false operands here. This will allow the setcc to be
+ // matched to a single CSINC instruction.
+ return DAG.getNode(ARM64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
+ }
+
+ // Now we know we're dealing with FP values.
+ assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+ // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
+ // and do the comparison.
+ SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
+
+ ARM64CC::CondCode CC1, CC2;
+ changeFPCCToARM64CC(CC, CC1, CC2);
+ if (CC2 == ARM64CC::AL) {
+ changeFPCCToARM64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
+ SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+
+ // Note that we inverted the condition above, so we reverse the order of
+ // the true and false operands here. This will allow the setcc to be
+ // matched to a single CSINC instruction.
+ return DAG.getNode(ARM64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
+ } else {
+ // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally
+ // clean. Some of them require two CSELs to implement. As is in this case,
+ // we emit the first CSEL and then emit a second using the output of the
+ // first as the RHS. We're effectively OR'ing the two CC's together.
+
+ // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
+ SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+ SDValue CS1 = DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
+
+ SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+ return DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
+ }
+}
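
For intuition, a hedged compile-only sketch of the integer setcc shape handled above; with the inverted condition and swapped 0/1 operands, the boolean result can come out as a single conditional increment of the zero register (the cset alias of CSINC):

    // A boolean-producing integer compare: conceptually "cmp x0, x1" followed
    // by a conditional select of 0 or 1, which the swap-and-invert trick above
    // lets the selector match as a CSINC of wzr rather than a general CSEL.
    int isEqual(long A, long B) { return A == B; }
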
+
+/// A SELECT_CC operation is really some kind of max or min if both values being
+/// compared are, in some sense, equal to the results in either case. However,
+/// it is permissible to compare f32 values and produce directly extended f64
+/// values.
+///
+/// Extending the comparison operands would also be allowed, but is less likely
+/// to happen in practice since their use is right here. Note that truncate
+/// operations would *not* be semantically equivalent.
+static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
+ if (Cmp == Result)
+ return true;
+
+ ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
+ ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
+ if (CCmp && CResult && Cmp.getValueType() == MVT::f32 &&
+ Result.getValueType() == MVT::f64) {
+ bool Lossy;
+ APFloat CmpVal = CCmp->getValueAPF();
+ CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy);
+ return CResult->getValueAPF().bitwiseIsEqual(CmpVal);
+ }
+
+ return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
+}
+
+SDValue ARM64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+ SDValue CC = Op->getOperand(0);
+ SDValue TVal = Op->getOperand(1);
+ SDValue FVal = Op->getOperand(2);
+ SDLoc DL(Op);
+
+ unsigned Opc = CC.getOpcode();
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
+ // instruction.
+ if (CC.getResNo() == 1 &&
+ (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+ Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
+ // Only lower legal XALUO ops.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0)))
+ return SDValue();
+
+ ARM64CC::CondCode OFCC;
+ SDValue Value, Overflow;
+ std::tie(Value, Overflow) = getARM64XALUOOp(OFCC, CC.getValue(0), DAG);
+ SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
+
+ return DAG.getNode(ARM64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal,
+ Overflow);
+ }
+
+ if (CC.getOpcode() == ISD::SETCC)
+ return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal,
+ cast<CondCodeSDNode>(CC.getOperand(2))->get());
+ else
+ return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal,
+ FVal, ISD::SETNE);
+}
+
+SDValue ARM64TargetLowering::LowerSELECT_CC(SDValue Op,
+ SelectionDAG &DAG) const {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue TVal = Op.getOperand(2);
+ SDValue FVal = Op.getOperand(3);
+ SDLoc dl(Op);
+
+ // Handle f128 first, because it will result in a comparison of some RTLIB
+ // call result against zero.
+ if (LHS.getValueType() == MVT::f128) {
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+ // If softenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (RHS.getNode() == 0) {
+ RHS = DAG.getConstant(0, LHS.getValueType());
+ CC = ISD::SETNE;
+ }
+ }
+
+ // Handle integers first.
+ if (LHS.getValueType().isInteger()) {
+ assert((LHS.getValueType() == RHS.getValueType()) &&
+ (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
+
+ unsigned Opcode = ARM64ISD::CSEL;
+
+ // If both the TVal and the FVal are constants, see if we can swap them in
+ // order to form a CSINV or CSINC out of them.
+ ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+ ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
+
+ if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ } else if (TVal.getOpcode() == ISD::XOR) {
+ // If TVal is a NOT we want to swap TVal and FVal so that we can match
+ // with a CSINV rather than a CSEL.
+ ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));
+
+ if (CVal && CVal->isAllOnesValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+ } else if (TVal.getOpcode() == ISD::SUB) {
+ // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
+ // that we can match with a CSNEG rather than a CSEL.
+ ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));
+
+ if (CVal && CVal->isNullValue()) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+ } else if (CTVal && CFVal) {
+ const int64_t TrueVal = CTVal->getSExtValue();
+ const int64_t FalseVal = CFVal->getSExtValue();
+ bool Swap = false;
+
+ // If both TVal and FVal are constants, see if FVal is the
+ // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
+ // instead of a CSEL in that case.
+ if (TrueVal == ~FalseVal) {
+ Opcode = ARM64ISD::CSINV;
+ } else if (TrueVal == -FalseVal) {
+ Opcode = ARM64ISD::CSNEG;
+ } else if (TVal.getValueType() == MVT::i32) {
+ // If our operands are only 32-bit wide, make sure we use 32-bit
+ // arithmetic for the check whether we can use CSINC. This ensures that
+ // the addition in the check will wrap around properly in case there is
+ // an overflow (which would not be the case if we do the check with
+ // 64-bit arithmetic).
+ const uint32_t TrueVal32 = CTVal->getZExtValue();
+ const uint32_t FalseVal32 = CFVal->getZExtValue();
+
+ if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
+ Opcode = ARM64ISD::CSINC;
+
+ if (TrueVal32 > FalseVal32) {
+ Swap = true;
+ }
+ }
+ // 64-bit check whether we can use CSINC.
+ } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
+ Opcode = ARM64ISD::CSINC;
+
+ if (TrueVal > FalseVal) {
+ Swap = true;
+ }
+ }
+
+ // Swap TVal and FVal if necessary.
+ if (Swap) {
+ std::swap(TVal, FVal);
+ std::swap(CTVal, CFVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+
+ if (Opcode != ARM64ISD::CSEL) {
+ // Drop FVal since we can get its value by simply inverting/negating
+ // TVal.
+ FVal = TVal;
+ }
+ }
+
+ SDValue CCVal;
+ SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+
+ EVT VT = Op.getValueType();
+ return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
+ }
+
+ // Now we know we're dealing with FP values.
+ assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+ assert(LHS.getValueType() == RHS.getValueType());
+ EVT VT = Op.getValueType();
+
+ // Try to match this select into a max/min operation, which has a dedicated
+ // opcode in the instruction set.
+ // NOTE: This is not correct in the presence of NaNs, so we only enable this
+ // in no-NaNs mode.
+ if (getTargetMachine().Options.NoNaNsFPMath) {
+ if (selectCCOpsAreFMaxCompatible(LHS, FVal) &&
+ selectCCOpsAreFMaxCompatible(RHS, TVal)) {
+ CC = ISD::getSetCCSwappedOperands(CC);
+ std::swap(TVal, FVal);
+ }
+
+ if (selectCCOpsAreFMaxCompatible(LHS, TVal) &&
+ selectCCOpsAreFMaxCompatible(RHS, FVal)) {
+ switch (CC) {
+ default:
+ break;
+ case ISD::SETGT:
+ case ISD::SETGE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ case ISD::SETOGT:
+ case ISD::SETOGE:
+ return DAG.getNode(ARM64ISD::FMAX, dl, VT, TVal, FVal);
+ break;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ return DAG.getNode(ARM64ISD::FMIN, dl, VT, TVal, FVal);
+ break;
+ }
+ }
+ }
+
+ // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
+ // and do the comparison.
+ SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
+
+ // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally
+ // clean. Some of them require two CSELs to implement.
+ ARM64CC::CondCode CC1, CC2;
+ changeFPCCToARM64CC(CC, CC1, CC2);
+ SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+ SDValue CS1 = DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
+
+ // If we need a second CSEL, emit it, using the output of the first as the
+ // RHS. We're effectively OR'ing the two CC's together.
+ if (CC2 != ARM64CC::AL) {
+ SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+ return DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
+ }
+
+ // Otherwise, return the output of the first CSEL.
+ return CS1;
+}
+
+SDValue ARM64TargetLowering::LowerJumpTable(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Jump table entries are PC-relative offsets. No additional tweaking
+ // is necessary here. Just get the address of the jump table.
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ EVT PtrVT = getPointerTy();
+ SDLoc DL(Op);
+
+ SDValue Hi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_PAGE);
+ SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+ ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
+ SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi);
+ return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+}
+
+SDValue ARM64TargetLowering::LowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) const {
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+ EVT PtrVT = getPointerTy();
+ SDLoc DL(Op);
+
+ if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+ // Use the GOT for the large code model on iOS.
+ if (Subtarget->isTargetMachO()) {
+ SDValue GotAddr = DAG.getTargetConstantPool(
+ CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
+ ARM64II::MO_GOT);
+ return DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, GotAddr);
+ }
+
+ const unsigned char MO_NC = ARM64II::MO_NC;
+ return DAG.getNode(
+ ARM64ISD::WrapperLarge, DL, PtrVT,
+ DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+ CP->getOffset(), ARM64II::MO_G3),
+ DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+ CP->getOffset(), ARM64II::MO_G2 | MO_NC),
+ DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+ CP->getOffset(), ARM64II::MO_G1 | MO_NC),
+ DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+ CP->getOffset(), ARM64II::MO_G0 | MO_NC));
+ } else {
+ // Use ADRP/ADD or ADRP/LDR for everything else: the small code model on
+ // ELF, which is the only valid model on Darwin.
+ SDValue Hi =
+ DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+ CP->getOffset(), ARM64II::MO_PAGE);
+ SDValue Lo = DAG.getTargetConstantPool(
+ CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
+ ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
+
+ SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi);
+ return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ }
+}
+
+SDValue ARM64TargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ EVT PtrVT = getPointerTy();
+ SDLoc DL(Op);
+ if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+ !Subtarget->isTargetMachO()) {
+ const unsigned char MO_NC = ARM64II::MO_NC;
+ return DAG.getNode(
+ ARM64ISD::WrapperLarge, DL, PtrVT,
+ DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G3),
+ DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G2 | MO_NC),
+ DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G1 | MO_NC),
+ DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G0 | MO_NC));
+ } else {
+ SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_PAGE);
+ SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_PAGEOFF |
+ ARM64II::MO_NC);
+ SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi);
+ return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ }
+}
+
+SDValue ARM64TargetLowering::LowerDarwin_VASTART(SDValue Op,
+ SelectionDAG &DAG) const {
+ ARM64FunctionInfo *FuncInfo =
+ DAG.getMachineFunction().getInfo<ARM64FunctionInfo>();
+
+ SDLoc DL(Op);
+ SDValue FR =
+ DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+ MachinePointerInfo(SV), false, false, 0);
+}
+
+SDValue ARM64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
+ SelectionDAG &DAG) const {
+ // The layout of the va_list struct is specified in the AArch64 Procedure Call
+ // Standard, section B.3.
+ MachineFunction &MF = DAG.getMachineFunction();
+ ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
+ SDLoc DL(Op);
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue VAList = Op.getOperand(1);
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ SmallVector<SDValue, 4> MemOps;
+
+ // void *__stack at offset 0
+ SDValue Stack =
+ DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
+ MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
+ MachinePointerInfo(SV), false, false, 8));
+
+ // void *__gr_top at offset 8
+ int GPRSize = FuncInfo->getVarArgsGPRSize();
+ if (GPRSize > 0) {
+ SDValue GRTop, GRTopAddr;
+
+ GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+ DAG.getConstant(8, getPointerTy()));
+
+ GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy());
+ GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
+ DAG.getConstant(GPRSize, getPointerTy()));
+
+ MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
+ MachinePointerInfo(SV, 8), false, false, 8));
+ }
+
+ // void *__vr_top at offset 16
+ int FPRSize = FuncInfo->getVarArgsFPRSize();
+ if (FPRSize > 0) {
+ SDValue VRTop, VRTopAddr;
+ VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+ DAG.getConstant(16, getPointerTy()));
+
+ VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy());
+ VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
+ DAG.getConstant(FPRSize, getPointerTy()));
+
+ MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
+ MachinePointerInfo(SV, 16), false, false, 8));
+ }
+
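+ // The __gr_offs and __vr_offs fields hold the negated sizes of the saved
+ // GPR/FPR areas, so that va_arg can find the next register-save slot by
+ // adding them to __gr_top/__vr_top.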
+ // int __gr_offs at offset 24
+ SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+ DAG.getConstant(24, getPointerTy()));
+ MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
+ GROffsAddr, MachinePointerInfo(SV, 24), false,
+ false, 4));
+
+ // int __vr_offs at offset 28
+ SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+ DAG.getConstant(28, getPointerTy()));
+ MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
+ VROffsAddr, MachinePointerInfo(SV, 28), false,
+ false, 4));
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
+ MemOps.size());
+}
+
+SDValue ARM64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
+ : LowerAAPCS_VASTART(Op, DAG);
+}
+
+SDValue ARM64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
+ // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
+ // pointer.
+ unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32;
+ const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+ return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1),
+ Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32),
+ 8, false, false, MachinePointerInfo(DestSV),
+ MachinePointerInfo(SrcSV));
+}
+
+SDValue ARM64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetDarwin() &&
+ "automatic va_arg instruction only works on Darwin");
+
+ const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Addr = Op.getOperand(1);
+ unsigned Align = Op.getConstantOperandVal(3);
+
+ SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr,
+ MachinePointerInfo(V), false, false, false, 0);
+ Chain = VAList.getValue(1);
+
+ if (Align > 8) {
+ assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
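+ // Round VAList up to the next multiple of Align, i.e.
+ // VAList = (VAList + Align - 1) & ~(Align - 1).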
+ VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+ DAG.getConstant(Align - 1, getPointerTy()));
+ VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList,
+ DAG.getConstant(-(int64_t)Align, getPointerTy()));
+ }
+
+ Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+ uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+
+ // Scalar integer and FP values smaller than 64 bits are implicitly extended
+ // up to 64 bits. At the very least, we have to increase the striding of the
+ // vaargs list to match this, and for FP values we need to introduce
+ // FP_ROUND nodes as well.
+ if (VT.isInteger() && !VT.isVector())
+ ArgSize = 8;
+ bool NeedFPTrunc = false;
+ if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
+ ArgSize = 8;
+ NeedFPTrunc = true;
+ }
+
+ // Increment the pointer, VAList, to the next vaarg
+ SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+ DAG.getConstant(ArgSize, getPointerTy()));
+ // Store the incremented VAList to the legalized pointer
+ SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
+ false, false, 0);
+
+ // Load the actual argument out of the pointer VAList
+ if (NeedFPTrunc) {
+ // Load the value as an f64.
+ SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
+ MachinePointerInfo(), false, false, false, 0);
+ // Round the value down to an f32.
+ SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
+ DAG.getIntPtrConstant(1));
+ SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
+ // Merge the rounded value with the chain output of the load.
+ return DAG.getMergeValues(Ops, 2, DL);
+ }
+
+ return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
+ false, false, 0);
+}
+
+SDValue ARM64TargetLowering::LowerFRAMEADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, ARM64::FP, VT);
+ while (Depth--)
+ FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
+ MachinePointerInfo(), false, false, false, 0);
+ return FrameAddr;
+}
+
+SDValue ARM64TargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MFI->setReturnAddressIsTaken(true);
+
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ if (Depth) {
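+ // The saved LR sits next to the saved FP in the frame record, so the
+ // return address of an outer frame is loaded from its frame address plus 8.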
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ SDValue Offset = DAG.getConstant(8, getPointerTy());
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
+ MachinePointerInfo(), false, false, false, 0);
+ }
+
+ // Return LR, which contains the return address. Mark it an implicit live-in.
+ unsigned Reg = MF.addLiveIn(ARM64::LR, &ARM64::GPR64RegClass);
+ return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
+/// i64 values and take a 2 x i64 value to shift plus a shift amount.
+SDValue ARM64TargetLowering::LowerShiftRightParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ SDValue ARMcc;
+ unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+ assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+ DAG.getConstant(VTBits, MVT::i64), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+ DAG.getConstant(VTBits, MVT::i64));
+ SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+
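+ // Select between the two candidates: if ShAmt >= 64 (ExtraShAmt >= 0, i.e.
+ // GE), the low result is hi shifted by (ShAmt - 64); otherwise it is the OR
+ // of the two partial shifts computed above.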
+ SDValue Cmp =
+ emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), dl, DAG);
+ SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32);
+
+ SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+ SDValue Lo =
+ DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+
+ // ARM64 shifts larger than the register width are wrapped rather than
+ // clamped, so we can't just emit "hi >> x".
+ SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ SDValue TrueValHi = Opc == ISD::SRA
+ ? DAG.getNode(Opc, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, MVT::i64))
+ : DAG.getConstant(0, VT);
+ SDValue Hi =
+ DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
+/// i64 values and takes a 2 x i64 value to shift plus a shift amount.
+SDValue ARM64TargetLowering::LowerShiftLeftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ SDValue ARMcc;
+
+ assert(Op.getOpcode() == ISD::SHL_PARTS);
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+ DAG.getConstant(VTBits, MVT::i64), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+ DAG.getConstant(VTBits, MVT::i64));
+ SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+ SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+ SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+
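+ // If ShAmt >= 64 (GE), the high result is lo << (ShAmt - 64); otherwise it
+ // is (hi << ShAmt) | (lo >> (64 - ShAmt)) as computed above.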
+ SDValue Cmp =
+ emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), dl, DAG);
+ SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32);
+ SDValue Hi = DAG.getNode(ARM64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
+
+ // ARM64 shifts larger than the register width are wrapped rather than
+ // clamped, so we can't just emit "lo << a" if a is too big.
+ SDValue TrueValLo = DAG.getConstant(0, VT);
+ SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ SDValue Lo =
+ DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
+
+bool
+ARM64TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // The ARM64 target doesn't support folding offsets into global addresses.
+ return false;
+}
+
+bool ARM64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+ // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
+ // FIXME: We should be able to handle f128 as well with a clever lowering.
+ if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32))
+ return true;
+
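+ // Otherwise the immediate is legal only if it can be encoded as an 8-bit
+ // FMOV immediate; the getFP*Imm helpers return -1 when it cannot.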
+ if (VT == MVT::f64)
+ return ARM64_AM::getFP64Imm(Imm) != -1;
+ else if (VT == MVT::f32)
+ return ARM64_AM::getFP32Imm(Imm) != -1;
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// ARM64 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM64 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+// Table of Constraints
+// TODO: This is the current set of constraints supported by ARM for the
+// compiler; not all of them may make sense, e.g. S may be difficult to support.
+//
+// r - A general register
+// w - An FP/SIMD register of some size in the range v0-v31
+// x - An FP/SIMD register of some size in the range v0-v15
+// I - Constant that can be used with an ADD instruction
+// J - Constant that can be used with a SUB instruction
+// K - Constant that can be used with a 32-bit logical instruction
+// L - Constant that can be used with a 64-bit logical instruction
+// M - Constant that can be used as a 32-bit MOV immediate
+// N - Constant that can be used as a 64-bit MOV immediate
+// Q - A memory reference with base register and no offset
+// S - A symbolic address
+// Y - Floating point constant zero
+// Z - Integer constant zero
+//
+// Note that general register operands will be output using their 64-bit x
+// register name, whatever the size of the variable, unless the asm operand
+// is prefixed by the %w modifier. Floating-point and SIMD register operands
+// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
+// %q modifier.
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+ARM64TargetLowering::ConstraintType
+ARM64TargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default:
+ break;
+ case 'z':
+ return C_Other;
+ case 'x':
+ case 'w':
+ return C_RegisterClass;
+ // An address with a single base register. Due to the way we
+ // currently handle addresses it is the same as 'r'.
+ case 'Q':
+ return C_Memory;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ARM64TargetLowering::getSingleConstraintMatchWeight(
+ AsmOperandInfo &info, const char *constraint) const {
+ ConstraintWeight weight = CW_Invalid;
+ Value *CallOperandVal = info.CallOperandVal;
+ // If we don't have a value, we can't do a match,
+ // but allow it at the lowest weight.
+ if (CallOperandVal == NULL)
+ return CW_Default;
+ Type *type = CallOperandVal->getType();
+ // Look at the constraint type.
+ switch (*constraint) {
+ default:
+ weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+ break;
+ case 'x':
+ case 'w':
+ if (type->isFloatingPointTy() || type->isVectorTy())
+ weight = CW_Register;
+ break;
+ case 'z':
+ weight = CW_Constant;
+ break;
+ }
+ return weight;
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+ARM64TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'r':
+ if (VT.getSizeInBits() == 64)
+ return std::make_pair(0U, &ARM64::GPR64commonRegClass);
+ return std::make_pair(0U, &ARM64::GPR32commonRegClass);
+ case 'w':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, &ARM64::FPR32RegClass);
+ if (VT.getSizeInBits() == 64)
+ return std::make_pair(0U, &ARM64::FPR64RegClass);
+ if (VT.getSizeInBits() == 128)
+ return std::make_pair(0U, &ARM64::FPR128RegClass);
+ break;
+ // The instructions that this constraint is designed for can
+ // only take 128-bit registers so just use that regclass.
+ case 'x':
+ if (VT.getSizeInBits() == 128)
+ return std::make_pair(0U, &ARM64::FPR128_loRegClass);
+ break;
+ }
+ }
+ if (StringRef("{cc}").equals_lower(Constraint))
+ return std::make_pair(unsigned(ARM64::CPSR), &ARM64::CCRRegClass);
+
+ // Use the default implementation in TargetLowering to convert the register
+ // constraint into a member of a register class.
+ std::pair<unsigned, const TargetRegisterClass *> Res;
+ Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+ // Not found as a standard register?
+ if (Res.second == 0) {
+ unsigned Size = Constraint.size();
+ if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
+ tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
+ const std::string Reg =
+ std::string(&Constraint[2], &Constraint[Size - 1]);
+ int RegNo = atoi(Reg.c_str());
+ if (RegNo >= 0 && RegNo <= 31) {
+ // v0 - v31 are aliases of q0 - q31.
+ // By default we'll emit v0-v31 for this unless there's a modifier, in which
+ // case we'll emit the correct register as well.
+ Res.first = ARM64::FPR128RegClass.getRegister(RegNo);
+ Res.second = &ARM64::FPR128RegClass;
+ }
+ }
+ }
+
+ return Res;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void ARM64TargetLowering::LowerAsmOperandForConstraint(
+ SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result(0, 0);
+
+ // Currently only support length 1 constraints.
+ if (Constraint.length() != 1)
+ return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ default:
+ break;
+
+ // This set of constraints deals with valid constants for various instructions.
+ // Validate and return a target constant for them if we can.
+ case 'z': {
+ // 'z' maps to xzr or wzr so it needs an input of 0.
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+ if (!C || C->getZExtValue() != 0)
+ return;
+
+ if (Op.getValueType() == MVT::i64)
+ Result = DAG.getRegister(ARM64::XZR, MVT::i64);
+ else
+ Result = DAG.getRegister(ARM64::WZR, MVT::i32);
+ break;
+ }
+
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+ if (!C)
+ return;
+
+ // Grab the value and do some validation.
+ uint64_t CVal = C->getZExtValue();
+ switch (ConstraintLetter) {
+ // The I constraint applies only to simple ADD or SUB immediate operands:
+ // i.e. 0 to 4095 with optional shift by 12
+ // The J constraint applies only to ADD or SUB immediates that would be
+ // valid when negated, i.e. if [an add pattern] were to be output as a SUB
+ // instruction [or vice versa], in other words -1 to -4095 with optional
+ // left shift by 12.
+ case 'I':
+ if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
+ break;
+ return;
+ case 'J': {
+ uint64_t NVal = -C->getSExtValue();
+ if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal))
+ break;
+ return;
+ }
+ // The K and L constraints apply *only* to logical immediates, including
+ // what used to be the MOVI alias for ORR (though the MOVI alias has now
+ // been removed and MOV should be used). So these constraints have to
+ // distinguish between bit patterns that are valid 32-bit or 64-bit
+ // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
+ // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
+ // versa.
+ case 'K':
+ if (ARM64_AM::isLogicalImmediate(CVal, 32))
+ break;
+ return;
+ case 'L':
+ if (ARM64_AM::isLogicalImmediate(CVal, 64))
+ break;
+ return;
+ // The M and N constraints are a superset of K and L respectively, for use
+ // with the MOV (immediate) alias. As well as the logical immediates they
+ // also match 32 or 64-bit immediates that can be loaded using a *single*
+ // MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca (M) or
+ // 64-bit 0x1234000000000000 (N) etc.
+ // As a note, some of this code is liberally stolen from the asm parser.
+ case 'M': {
+ if (!isUInt<32>(CVal))
+ return;
+ if (ARM64_AM::isLogicalImmediate(CVal, 32))
+ break;
+ if ((CVal & 0xFFFF) == CVal)
+ break;
+ if ((CVal & 0xFFFF0000ULL) == CVal)
+ break;
+ uint64_t NCVal = ~(uint32_t)CVal;
+ if ((NCVal & 0xFFFFULL) == NCVal)
+ break;
+ if ((NCVal & 0xFFFF0000ULL) == NCVal)
+ break;
+ return;
+ }
+ case 'N': {
+ if (ARM64_AM::isLogicalImmediate(CVal, 64))
+ break;
+ if ((CVal & 0xFFFFULL) == CVal)
+ break;
+ if ((CVal & 0xFFFF0000ULL) == CVal)
+ break;
+ if ((CVal & 0xFFFF00000000ULL) == CVal)
+ break;
+ if ((CVal & 0xFFFF000000000000ULL) == CVal)
+ break;
+ uint64_t NCVal = ~CVal;
+ if ((NCVal & 0xFFFFULL) == NCVal)
+ break;
+ if ((NCVal & 0xFFFF0000ULL) == NCVal)
+ break;
+ if ((NCVal & 0xFFFF00000000ULL) == NCVal)
+ break;
+ if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
+ break;
+ return;
+ }
+ default:
+ return;
+ }
+
+ // All assembler immediates are 64-bit integers.
+ Result = DAG.getTargetConstant(CVal, MVT::i64);
+ break;
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+
+ return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// ARM64 Advanced SIMD Support
+//===----------------------------------------------------------------------===//
+
+/// WidenVector - Given a value in the V64 register class, produce the
+/// equivalent value in the V128 register class.
+static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
+ EVT VT = V64Reg.getValueType();
+ unsigned NarrowSize = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType().getSimpleVT();
+ MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
+ SDLoc DL(V64Reg);
+
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
+ V64Reg, DAG.getConstant(0, MVT::i32));
+}
+
+/// getExtFactor - Determine the adjustment factor for the position when
+/// generating an "extract from vector registers" instruction.
+static unsigned getExtFactor(SDValue &V) {
+ EVT EltType = V.getValueType().getVectorElementType();
+ return EltType.getSizeInBits() / 8;
+}
+
+/// NarrowVector - Given a value in the V128 register class, produce the
+/// equivalent value in the V64 register class.
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+ EVT VT = V128Reg.getValueType();
+ unsigned WideSize = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType().getSimpleVT();
+ MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
+ SDLoc DL(V128Reg);
+
+ return DAG.getTargetExtractSubreg(ARM64::dsub, DL, NarrowTy, V128Reg);
+}
+
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+SDValue ARM64TargetLowering::ReconstructShuffle(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ SmallVector<SDValue, 2> SourceVecs;
+ SmallVector<unsigned, 2> MinElts;
+ SmallVector<unsigned, 2> MaxElts;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.getOpcode() == ISD::UNDEF)
+ continue;
+ else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+ // A shuffle can only come from building a vector from various
+ // elements of other vectors.
+ return SDValue();
+ }
+
+ // Record this extraction against the appropriate vector if possible...
+ SDValue SourceVec = V.getOperand(0);
+ unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+ bool FoundSource = false;
+ for (unsigned j = 0; j < SourceVecs.size(); ++j) {
+ if (SourceVecs[j] == SourceVec) {
+ if (MinElts[j] > EltNo)
+ MinElts[j] = EltNo;
+ if (MaxElts[j] < EltNo)
+ MaxElts[j] = EltNo;
+ FoundSource = true;
+ break;
+ }
+ }
+
+ // Or record a new source if not...
+ if (!FoundSource) {
+ SourceVecs.push_back(SourceVec);
+ MinElts.push_back(EltNo);
+ MaxElts.push_back(EltNo);
+ }
+ }
+
+ // Currently we only do something sane when at most two source vectors
+ // are involved.
+ if (SourceVecs.size() > 2)
+ return SDValue();
+
+ SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
+ int VEXTOffsets[2] = { 0, 0 };
+
+ // This loop extracts the usage patterns of the source vectors
+ // and prepares appropriate SDValues for a shuffle if possible.
+ for (unsigned i = 0; i < SourceVecs.size(); ++i) {
+ if (SourceVecs[i].getValueType() == VT) {
+ // No VEXT necessary
+ ShuffleSrcs[i] = SourceVecs[i];
+ VEXTOffsets[i] = 0;
+ continue;
+ } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
+ // It probably isn't worth padding out a smaller vector just to
+ // break it down again in a shuffle.
+ return SDValue();
+ }
+
+ // Don't attempt to extract subvectors from BUILD_VECTOR sources
+ // that expand or trunc the original value.
+ // TODO: We can try to bitcast and ANY_EXTEND the result but
+ // we need to consider the cost of vector ANY_EXTEND, and the
+ // legality of all the types.
+ if (SourceVecs[i].getValueType().getVectorElementType() !=
+ VT.getVectorElementType())
+ return SDValue();
+
+ // Since only 64-bit and 128-bit vectors are legal on ARM64 and
+ // we've eliminated the other cases...
+ assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts &&
+ "unexpected vector sizes in ReconstructShuffle");
+
+ if (MaxElts[i] - MinElts[i] >= NumElts) {
+ // Span too large for a VEXT to cope
+ return SDValue();
+ }
+
+ if (MinElts[i] >= NumElts) {
+ // The extraction can just take the second half
+ VEXTOffsets[i] = NumElts;
+ ShuffleSrcs[i] =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
+ DAG.getIntPtrConstant(NumElts));
+ } else if (MaxElts[i] < NumElts) {
+ // The extraction can just take the first half
+ VEXTOffsets[i] = 0;
+ ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ SourceVecs[i], DAG.getIntPtrConstant(0));
+ } else {
+ // An actual VEXT is needed
+ VEXTOffsets[i] = MinElts[i];
+ SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ SourceVecs[i], DAG.getIntPtrConstant(0));
+ SDValue VEXTSrc2 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
+ DAG.getIntPtrConstant(NumElts));
+ unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
+ ShuffleSrcs[i] = DAG.getNode(ARM64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
+ DAG.getConstant(Imm, MVT::i32));
+ }
+ }
+
+ SmallVector<int, 8> Mask;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Entry = Op.getOperand(i);
+ if (Entry.getOpcode() == ISD::UNDEF) {
+ Mask.push_back(-1);
+ continue;
+ }
+
+ SDValue ExtractVec = Entry.getOperand(0);
+ int ExtractElt =
+ cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
+ if (ExtractVec == SourceVecs[0]) {
+ Mask.push_back(ExtractElt - VEXTOffsets[0]);
+ } else {
+ Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
+ }
+ }
+
+ // Final check before we try to produce nonsense...
+ if (isShuffleMaskLegal(Mask, VT))
+ return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
+ &Mask[0]);
+
+ return SDValue();
+}
+
+// Check if an EXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are the same.
+static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Assume that the first shuffle index is not UNDEF. Fail if it is.
+ if (M[0] < 0)
+ return false;
+
+ Imm = M[0];
+
+ // If this is a VEXT shuffle, the immediate value is the index of the first
+ // element. The other shuffle indices must be the successive elements after
+ // the first one.
+ unsigned ExpectedElt = Imm;
+ for (unsigned i = 1; i < NumElts; ++i) {
+ // Increment the expected index. If it wraps around, just follow it
+ // back to index zero and keep going.
+ ++ExpectedElt;
+ if (ExpectedElt == NumElts)
+ ExpectedElt = 0;
+
+ if (M[i] < 0)
+ continue; // ignore UNDEF indices
+ if (ExpectedElt != static_cast<unsigned>(M[i]))
+ return false;
+ }
+
+ return true;
+}
+
+// Check if an EXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are different.
+static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
+ unsigned &Imm) {
+ unsigned NumElts = VT.getVectorNumElements();
+ ReverseEXT = false;
+
+ // Assume that the first shuffle index is not UNDEF. Fail if it is.
+ if (M[0] < 0)
+ return false;
+
+ Imm = M[0];
+
+ // If this is a VEXT shuffle, the immediate value is the index of the first
+ // element. The other shuffle indices must be the successive elements after
+ // the first one.
+ unsigned ExpectedElt = Imm;
+ for (unsigned i = 1; i < NumElts; ++i) {
+ // Increment the expected index. If it wraps around, it may still be
+ // a VEXT but the source vectors must be swapped.
+ ExpectedElt += 1;
+ if (ExpectedElt == NumElts * 2) {
+ ExpectedElt = 0;
+ ReverseEXT = true;
+ }
+
+ if (M[i] < 0)
+ continue; // ignore UNDEF indices
+ if (ExpectedElt != static_cast<unsigned>(M[i]))
+ return false;
+ }
+
+ // Adjust the index value if the source operands will be swapped.
+ if (ReverseEXT)
+ Imm -= NumElts;
+
+ return true;
+}
+
+/// isREVMask - Check if a vector shuffle corresponds to a REV
+/// instruction with the specified blocksize. (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+ assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+ "Only possible block sizes for REV are: 16, 32, 64");
+
+ unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+ if (EltSz == 64)
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned BlockElts = M[0] + 1;
+ // If the first shuffle index is UNDEF, be optimistic.
+ if (M[0] < 0)
+ BlockElts = BlockSize / EltSz;
+
+ if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+ return false;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if (M[i] < 0)
+ continue; // ignore UNDEF indices
+ if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+ return false;
+ }
+
+ return true;
+}
+
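+/// isZIPMask - Check whether the shuffle mask corresponds to a ZIP1/ZIP2
+/// pattern, i.e. it interleaves corresponding elements from the low halves
+/// (e.g. <0, NumElts, 1, NumElts+1, ...>) or the high halves of the inputs.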
+static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ unsigned Idx = WhichResult * NumElts / 2;
+ for (unsigned i = 0; i != NumElts; i += 2) {
+ if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+ (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
+ return false;
+ Idx += 1;
+ }
+
+ return true;
+}
+
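+/// isUZPMask - Check whether the shuffle mask corresponds to a UZP1/UZP2
+/// pattern, i.e. it picks every even (<0, 2, 4, ...>) or every odd element
+/// of the concatenated inputs.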
+static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (M[i] < 0)
+ continue; // ignore UNDEF indices
+ if ((unsigned)M[i] != 2 * i + WhichResult)
+ return false;
+ }
+
+ return true;
+}
+
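+/// isTRNMask - Check whether the shuffle mask corresponds to a TRN1/TRN2
+/// pattern, i.e. it transposes pairs of even (<0, NumElts, 2, NumElts+2, ...>)
+/// or odd elements from the two inputs.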
+static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+ (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
+ return false;
+ }
+ return true;
+}
+
+/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ unsigned Idx = WhichResult * NumElts / 2;
+ for (unsigned i = 0; i != NumElts; i += 2) {
+ if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+ (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
+ return false;
+ Idx += 1;
+ }
+
+ return true;
+}
+
+/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
+static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned Half = VT.getVectorNumElements() / 2;
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned j = 0; j != 2; ++j) {
+ unsigned Idx = WhichResult;
+ for (unsigned i = 0; i != Half; ++i) {
+ int MIdx = M[i + j * Half];
+ if (MIdx >= 0 && (unsigned)MIdx != Idx)
+ return false;
+ Idx += 2;
+ }
+ }
+
+ return true;
+}
+
+/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
+static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+ (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
+ return false;
+ }
+ return true;
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+ SDValue RHS, SelectionDAG &DAG,
+ SDLoc dl) {
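+ // Each perfect-shuffle table entry packs a cost (bits 31-30), an opcode
+ // (bits 29-26) and the base-9 encoded IDs of the two operand shuffles
+ // (bits 25-13 and 12-0).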
+ unsigned OpNum = (PFEntry >> 26) & 0x0F;
+ unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
+ unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
+
+ enum {
+ OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+ OP_VREV,
+ OP_VDUP0,
+ OP_VDUP1,
+ OP_VDUP2,
+ OP_VDUP3,
+ OP_VEXT1,
+ OP_VEXT2,
+ OP_VEXT3,
+ OP_VUZPL, // VUZP, left result
+ OP_VUZPR, // VUZP, right result
+ OP_VZIPL, // VZIP, left result
+ OP_VZIPR, // VZIP, right result
+ OP_VTRNL, // VTRN, left result
+ OP_VTRNR // VTRN, right result
+ };
+
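+ // OP_COPY means the mask is the identity on one input: the base-9 encoded
+ // IDs 0123 and 4567 select the LHS and RHS unchanged, respectively.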
+ if (OpNum == OP_COPY) {
+ if (LHSID == (1 * 9 + 2) * 9 + 3)
+ return LHS;
+ assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
+ return RHS;
+ }
+
+ SDValue OpLHS, OpRHS;
+ OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+ OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+ EVT VT = OpLHS.getValueType();
+
+ switch (OpNum) {
+ default:
+ llvm_unreachable("Unknown shuffle opcode!");
+ case OP_VREV:
+ // VREV divides the vector in half and swaps within the half.
+ if (VT.getVectorElementType() == MVT::i32 ||
+ VT.getVectorElementType() == MVT::f32)
+ return DAG.getNode(ARM64ISD::REV64, dl, VT, OpLHS);
+ // vrev <4 x i16> -> REV32
+ if (VT.getVectorElementType() == MVT::i16)
+ return DAG.getNode(ARM64ISD::REV32, dl, VT, OpLHS);
+ // vrev <4 x i8> -> REV16
+ assert(VT.getVectorElementType() == MVT::i8);
+ return DAG.getNode(ARM64ISD::REV16, dl, VT, OpLHS);
+ case OP_VDUP0:
+ case OP_VDUP1:
+ case OP_VDUP2:
+ case OP_VDUP3: {
+ EVT EltTy = VT.getVectorElementType();
+ unsigned Opcode;
+ if (EltTy == MVT::i8)
+ Opcode = ARM64ISD::DUPLANE8;
+ else if (EltTy == MVT::i16)
+ Opcode = ARM64ISD::DUPLANE16;
+ else if (EltTy == MVT::i32 || EltTy == MVT::f32)
+ Opcode = ARM64ISD::DUPLANE32;
+ else if (EltTy == MVT::i64 || EltTy == MVT::f64)
+ Opcode = ARM64ISD::DUPLANE64;
+ else
+ llvm_unreachable("Invalid vector element type?");
+
+ if (VT.getSizeInBits() == 64)
+ OpLHS = WidenVector(OpLHS, DAG);
+ SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64);
+ return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
+ }
+ case OP_VEXT1:
+ case OP_VEXT2:
+ case OP_VEXT3: {
+ unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
+ return DAG.getNode(ARM64ISD::EXT, dl, VT, OpLHS, OpRHS,
+ DAG.getConstant(Imm, MVT::i32));
+ }
+ case OP_VUZPL:
+ return DAG.getNode(ARM64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
+ case OP_VUZPR:
+ return DAG.getNode(ARM64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
+ case OP_VZIPL:
+ return DAG.getNode(ARM64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
+ case OP_VZIPR:
+ return DAG.getNode(ARM64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
+ case OP_VTRNL:
+ return DAG.getNode(ARM64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
+ case OP_VTRNR:
+ return DAG.getNode(ARM64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
+ }
+}
+
+static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
+ SelectionDAG &DAG) {
+ // Check to see if we can use the TBL instruction.
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ SDLoc DL(Op);
+
+ EVT EltVT = Op.getValueType().getVectorElementType();
+ unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
+
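+ // TBL is a byte-wise table lookup, so expand each entry of the shuffle mask
+ // into the run of byte indices covering the selected element.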
+ SmallVector<SDValue, 8> TBLMask;
+ for (int Val : ShuffleMask) {
+ for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
+ unsigned Offset = Byte + Val * BytesPerElt;
+ TBLMask.push_back(DAG.getConstant(Offset, MVT::i32));
+ }
+ }
+
+ MVT IndexVT = MVT::v8i8;
+ unsigned IndexLen = 8;
+ if (Op.getValueType().getSizeInBits() == 128) {
+ IndexVT = MVT::v16i8;
+ IndexLen = 16;
+ }
+
+ SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
+ SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
+
+ SDValue Shuffle;
+ if (V2.getNode()->getOpcode() == ISD::UNDEF) {
+ if (IndexLen == 8)
+ V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
+ Shuffle = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+ DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, &TBLMask[0], IndexLen));
+ } else {
+ if (IndexLen == 8) {
+ V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
+ Shuffle = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+ DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, &TBLMask[0], IndexLen));
+ } else {
+ // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
+ // cannot currently represent the register constraints on the input
+ // table registers.
+ // Shuffle = DAG.getNode(ARM64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
+ // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+ // &TBLMask[0], IndexLen));
+ Shuffle = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+ DAG.getConstant(Intrinsic::arm64_neon_tbl2, MVT::i32), V1Cst, V2Cst,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, &TBLMask[0], IndexLen));
+ }
+ }
+ return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
+}
+
+static unsigned getDUPLANEOp(EVT EltType) {
+ if (EltType == MVT::i8)
+ return ARM64ISD::DUPLANE8;
+ if (EltType == MVT::i16)
+ return ARM64ISD::DUPLANE16;
+ if (EltType == MVT::i32 || EltType == MVT::f32)
+ return ARM64ISD::DUPLANE32;
+ if (EltType == MVT::i64 || EltType == MVT::f64)
+ return ARM64ISD::DUPLANE64;
+
+ llvm_unreachable("Invalid vector element type?");
+}
+
+SDValue ARM64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+ // Convert shuffles that are directly supported on NEON to target-specific
+ // DAG nodes, instead of keeping them as shuffles and matching them again
+ // during code selection. This is more efficient and avoids the possibility
+ // of inconsistencies between legalization and selection.
+ ArrayRef<int> ShuffleMask = SVN->getMask();
+
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+
+ if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
+ V1.getValueType().getSimpleVT())) {
+ int Lane = SVN->getSplatIndex();
+ // If this is an undef splat, generate it via "just" vdup, if possible.
+ if (Lane == -1)
+ Lane = 0;
+
+ if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return DAG.getNode(ARM64ISD::DUP, dl, V1.getValueType(),
+ V1.getOperand(0));
+ // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
+ // constant. If so, we can just reference the lane's definition directly.
+ if (V1.getOpcode() == ISD::BUILD_VECTOR &&
+ !isa<ConstantSDNode>(V1.getOperand(Lane)))
+ return DAG.getNode(ARM64ISD::DUP, dl, VT, V1.getOperand(Lane));
+
+ // Otherwise, duplicate from the lane of the input vector.
+ unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
+
+ // SelectionDAGBuilder may have "helpfully" already extracted or concatenated
+ // to make a vector of the same size as this SHUFFLE. We can ignore the
+ // extract entirely, and canonicalise the concat using WidenVector.
+ if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+ V1 = V1.getOperand(0);
+ } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
+ unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
+ Lane -= Idx * VT.getVectorNumElements() / 2;
+ V1 = WidenVector(V1.getOperand(Idx), DAG);
+ } else if (VT.getSizeInBits() == 64)
+ V1 = WidenVector(V1, DAG);
+
+ return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64));
+ }
+
+ if (isREVMask(ShuffleMask, VT, 64))
+ return DAG.getNode(ARM64ISD::REV64, dl, V1.getValueType(), V1, V2);
+ if (isREVMask(ShuffleMask, VT, 32))
+ return DAG.getNode(ARM64ISD::REV32, dl, V1.getValueType(), V1, V2);
+ if (isREVMask(ShuffleMask, VT, 16))
+ return DAG.getNode(ARM64ISD::REV16, dl, V1.getValueType(), V1, V2);
+
+ bool ReverseEXT = false;
+ unsigned Imm;
+ if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
+ if (ReverseEXT)
+ std::swap(V1, V2);
+ Imm *= getExtFactor(V1);
+ return DAG.getNode(ARM64ISD::EXT, dl, V1.getValueType(), V1, V2,
+ DAG.getConstant(Imm, MVT::i32));
+ } else if (V2->getOpcode() == ISD::UNDEF &&
+ isSingletonEXTMask(ShuffleMask, VT, Imm)) {
+ Imm *= getExtFactor(V1);
+ return DAG.getNode(ARM64ISD::EXT, dl, V1.getValueType(), V1, V1,
+ DAG.getConstant(Imm, MVT::i32));
+ }
+
+ unsigned WhichResult;
+ if (isZIPMask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? ARM64ISD::ZIP1 : ARM64ISD::ZIP2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+ }
+ if (isUZPMask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? ARM64ISD::UZP1 : ARM64ISD::UZP2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+ }
+ if (isTRNMask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? ARM64ISD::TRN1 : ARM64ISD::TRN2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+ }
+
+ if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? ARM64ISD::ZIP1 : ARM64ISD::ZIP2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+ }
+ if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? ARM64ISD::UZP1 : ARM64ISD::UZP2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+ }
+ if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? ARM64ISD::TRN1 : ARM64ISD::TRN2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+ }
+
+ // If the shuffle is not directly supported and it has 4 elements, use
+ // the PerfectShuffle-generated table to synthesize it from other shuffles.
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 4) {
+ unsigned PFIndexes[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ if (ShuffleMask[i] < 0)
+ PFIndexes[i] = 8;
+ else
+ PFIndexes[i] = ShuffleMask[i];
+ }
+
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+ PFIndexes[2] * 9 + PFIndexes[3];
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ unsigned Cost = (PFEntry >> 30);
+
+ if (Cost <= 4)
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ }
+
+ return GenerateTBL(Op, ShuffleMask, DAG);
+}
+
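+// Collect the replicated constant bits of a splat BUILD_VECTOR into CnstBits,
+// and the same pattern with the undefined bits toggled into UndefBits, so
+// callers can retry immediate matching with either value for the undef lanes.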
+static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
+ APInt &UndefBits) {
+ EVT VT = BVN->getValueType(0);
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+ unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
+
+ for (unsigned i = 0; i < NumSplats; ++i) {
+ CnstBits <<= SplatBitSize;
+ UndefBits <<= SplatBitSize;
+ CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
+ UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+SDValue ARM64TargetLowering::LowerVectorAND(SDValue Op,
+ SelectionDAG &DAG) const {
+ BuildVectorSDNode *BVN =
+ dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+ SDValue LHS = Op.getOperand(0);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ if (!BVN)
+ return Op;
+
+ APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+ // We only have BIC vector immediate instruction, which is and-not.
+ CnstBits = ~CnstBits;
+
+ // We make use of a little bit of goto ickiness in order to avoid having to
+ // duplicate the immediate matching logic for the undef toggled case.
+ bool SecondTry = false;
+ AttemptModImm:
+
+ if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+ CnstBits = CnstBits.zextOrTrunc(64);
+ uint64_t CnstVal = CnstBits.getZExtValue();
+
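+ // AdvSIMD modified-immediate types 1-4 are an 8-bit value at byte position
+ // 0/8/16/24 of a 32-bit element; types 5 and 6 are an 8-bit value at byte
+ // position 0/8 of a 16-bit element (the BIC vector-immediate encodings).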
+ if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(16, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(24, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+ }
+
+ if (SecondTry)
+ goto FailedModImm;
+ SecondTry = true;
+ CnstBits = ~UndefBits;
+ goto AttemptModImm;
+ }
+
+// We can always fall back to a non-immediate AND.
+FailedModImm:
+ return Op;
+}
+
+// Specialized code to quickly find if PotentialBVec is a BuildVector that
+// consists of only the same constant int value, returned in the reference
+// arg ConstVal.
+static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
+ uint64_t &ConstVal) {
+ BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
+ if (!Bvec)
+ return false;
+ ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
+ if (!FirstElt)
+ return false;
+ EVT VT = Bvec->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ for (unsigned i = 1; i < NumElts; ++i)
+ if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
+ return false;
+ ConstVal = FirstElt->getZExtValue();
+ return true;
+}
+
+static unsigned getIntrinsicID(const SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ switch (Opcode) {
+ default:
+ return Intrinsic::not_intrinsic;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ if (IID < Intrinsic::num_intrinsics)
+ return IID;
+ return Intrinsic::not_intrinsic;
+ }
+ }
+}
+
+// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
+// i.e. (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
+// BUILD_VECTOR with constant element C1, C2 is a constant, and C1 == ~C2.
+// Also, a logical shift right -> SRI, with the same structure.
+static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ if (!VT.isVector())
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // Is the first op an AND?
+ const SDValue And = N->getOperand(0);
+ if (And.getOpcode() != ISD::AND)
+ return SDValue();
+
+ // Is the second op a shl or lshr?
+ SDValue Shift = N->getOperand(1);
+ // This will have been turned into: ARM64ISD::VSHL vector, #shift
+ // or ARM64ISD::VLSHR vector, #shift
+ unsigned ShiftOpc = Shift.getOpcode();
+ if ((ShiftOpc != ARM64ISD::VSHL && ShiftOpc != ARM64ISD::VLSHR))
+ return SDValue();
+ bool IsShiftRight = ShiftOpc == ARM64ISD::VLSHR;
+
+ // Is the shift amount constant?
+ ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+ if (!C2node)
+ return SDValue();
+
+ // Is the and mask vector all constant?
+ uint64_t C1;
+ if (!isAllConstantBuildVector(And.getOperand(1), C1))
+ return SDValue();
+
+ // Is C1 == ~C2, taking into account how much one can shift elements of a
+ // particular size?
+ uint64_t C2 = C2node->getZExtValue();
+ unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
+ if (C2 > ElemSizeInBits)
+ return SDValue();
+ unsigned ElemMask = (1 << ElemSizeInBits) - 1;
+ if ((C1 & ElemMask) != (~C2 & ElemMask))
+ return SDValue();
+
+ SDValue X = And.getOperand(0);
+ SDValue Y = Shift.getOperand(0);
+
+ unsigned Intrin =
+ IsShiftRight ? Intrinsic::arm64_neon_vsri : Intrinsic::arm64_neon_vsli;
+ SDValue ResultSLI =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1));
+
+ DEBUG(dbgs() << "arm64-lower: transformed: \n");
+ DEBUG(N->dump(&DAG));
+ DEBUG(dbgs() << "into: \n");
+ DEBUG(ResultSLI->dump(&DAG));
+
+ ++NumShiftInserts;
+ return ResultSLI;
+}
+
+SDValue ARM64TargetLowering::LowerVectorOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
+ if (EnableARM64SlrGeneration) {
+ SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
+ if (Res.getNode())
+ return Res;
+ }
+
+ BuildVectorSDNode *BVN =
+ dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
+ SDValue LHS = Op.getOperand(1);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ // OR commutes, so try swapping the operands.
+ if (!BVN) {
+ LHS = Op.getOperand(0);
+ BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+ }
+ if (!BVN)
+ return Op;
+
+ APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+ // We make use of a little bit of goto ickiness in order to avoid having to
+ // duplicate the immediate matching logic for the undef toggled case.
+ bool SecondTry = false;
+ AttemptModImm:
+
+ if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+ CnstBits = CnstBits.zextOrTrunc(64);
+ uint64_t CnstVal = CnstBits.getZExtValue();
+
+ if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(16, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(24, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+ }
+
+ if (SecondTry)
+ goto FailedModImm;
+ SecondTry = true;
+ CnstBits = UndefBits;
+ goto AttemptModImm;
+ }
+
+// We can always fall back to a non-immediate OR.
+FailedModImm:
+ return Op;
+}
+
+SDValue ARM64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+ // We make use of a little bit of goto ickiness in order to avoid having to
+ // duplicate the immediate matching logic for the undef toggled case.
+ bool SecondTry = false;
+ AttemptModImm:
+
+ if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+ CnstBits = CnstBits.zextOrTrunc(64);
+ uint64_t CnstVal = CnstBits.getZExtValue();
+
+ // Certain magic vector constants (used to express things like NOT
+ // and NEG) are passed through unmodified. This allows codegen patterns
+ // for these operations to match. Special-purpose patterns will lower
+ // these immediates to MOVIs if it proves necessary.
+ if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
+ return Op;
+
+ // The many faces of MOVI...
+ if (ARM64_AM::isAdvSIMDModImmType10(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType10(CnstVal);
+ if (VT.getSizeInBits() == 128) {
+ SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::v2i64,
+ DAG.getConstant(CnstVal, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ // Support the V64 version via subregister insertion.
+ SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::f64,
+ DAG.getConstant(CnstVal, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(16, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(24, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType7(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType7(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::MOVImsl, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(264, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType8(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType8(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::MOVImsl, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(272, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType9(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType9(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
+ SDValue Mov = DAG.getNode(ARM64ISD::MOVI, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ // The few faces of FMOV...
+ if (ARM64_AM::isAdvSIMDModImmType11(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType11(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
+ SDValue Mov = DAG.getNode(ARM64ISD::FMOV, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType12(CnstVal) &&
+ VT.getSizeInBits() == 128) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType12(CnstVal);
+ SDValue Mov = DAG.getNode(ARM64ISD::FMOV, dl, MVT::v2f64,
+ DAG.getConstant(CnstVal, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ // The many faces of MVNI...
+ CnstVal = ~CnstVal;
+ if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(16, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(24, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(0, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+ SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(8, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType7(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType7(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::MVNImsl, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(264, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+
+ if (ARM64_AM::isAdvSIMDModImmType8(CnstVal)) {
+ CnstVal = ARM64_AM::encodeAdvSIMDModImmType8(CnstVal);
+ MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+ SDValue Mov = DAG.getNode(ARM64ISD::MVNImsl, dl, MovTy,
+ DAG.getConstant(CnstVal, MVT::i32),
+ DAG.getConstant(272, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ }
+ }
+
+ if (SecondTry)
+ goto FailedModImm;
+ SecondTry = true;
+ CnstBits = UndefBits;
+ goto AttemptModImm;
+ }
+FailedModImm:
+
+ // Scan through the operands to find some interesting properties we can
+ // exploit:
+ // 1) If only one value is used, we can use a DUP, or
+ // 2) if only the low element is not undef, we can just insert that, or
+ // 3) if only one constant value is used (w/ some non-constant lanes),
+ // we can splat the constant value into the whole vector then fill
+ // in the non-constant lanes.
+ // 4) FIXME: If different constant values are used, but we can intelligently
+ // select the values we'll be overwriting for the non-constant
+ // lanes such that we can directly materialize the vector
+ // some other way (MOVI, e.g.), we can be sneaky.
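+ // Illustrative examples of the cases above (assuming v4i32):
+ //   (1) <%x, %x, %x, %x>          -> (DUP %x)
+ //   (2) <%x, undef, undef, undef> -> (SCALAR_TO_VECTOR %x)
+ //   (3) <7, 7, %x, 7>             -> (DUP 7), then INSERT_VECTOR_ELT of %x
+ //       into lane 2.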
+ unsigned NumElts = VT.getVectorNumElements();
+ bool isOnlyLowElement = true;
+ bool usesOnlyOneValue = true;
+ bool usesOnlyOneConstantValue = true;
+ bool isConstant = true;
+ unsigned NumConstantLanes = 0;
+ SDValue Value;
+ SDValue ConstantValue;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.getOpcode() == ISD::UNDEF)
+ continue;
+ if (i > 0)
+ isOnlyLowElement = false;
+ if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+ isConstant = false;
+
+ if (isa<ConstantSDNode>(V)) {
+ ++NumConstantLanes;
+ if (!ConstantValue.getNode())
+ ConstantValue = V;
+ else if (ConstantValue != V)
+ usesOnlyOneConstantValue = false;
+ }
+
+ if (!Value.getNode())
+ Value = V;
+ else if (V != Value)
+ usesOnlyOneValue = false;
+ }
+
+ if (!Value.getNode())
+ return DAG.getUNDEF(VT);
+
+ if (isOnlyLowElement)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
+
+ // Use DUP for non-constant splats. For f32/f64 constant splats, reduce to
+ // i32/i64 and try again.
+ if (usesOnlyOneValue) {
+ if (!isConstant) {
+ if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Value.getValueType() != VT)
+ return DAG.getNode(ARM64ISD::DUP, dl, VT, Value);
+
+ // This is actually a DUPLANExx operation, which keeps everything vectory.
+
+ // DUPLANE works on 128-bit vectors, widen it if necessary.
+ SDValue Lane = Value.getOperand(1);
+ Value = Value.getOperand(0);
+ if (Value.getValueType().getSizeInBits() == 64)
+ Value = WidenVector(Value, DAG);
+
+ unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
+ return DAG.getNode(Opcode, dl, VT, Value, Lane);
+ }
+
+ if (VT.getVectorElementType().isFloatingPoint()) {
+ SmallVector<SDValue, 8> Ops;
+ MVT NewType =
+ (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+ for (unsigned i = 0; i < NumElts; ++i)
+ Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
+ SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
+ Val = LowerBUILD_VECTOR(Val, DAG);
+ if (Val.getNode())
+ return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+ }
+ }
+
+ // If only one constant value was used, and it was needed for more than one
+ // lane, start by splatting that value, then replace the non-constant lanes.
+ // This is better than the default, which will perform a separate
+ // initialization for each lane.
+ if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
+ SDValue Val = DAG.getNode(ARM64ISD::DUP, dl, VT, ConstantValue);
+ // Now insert the non-constant lanes.
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
+ if (!isa<ConstantSDNode>(V)) {
+ // Note that type legalization likely mucked about with the VT of the
+ // source operand, so we may have to convert it here before inserting.
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
+ }
+ }
+ return Val;
+ }
+
+ // If all elements are constants and the case above didn't get hit, fall back
+ // to the default expansion, which will generate a load from the constant
+ // pool.
+ if (isConstant)
+ return SDValue();
+
+ // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+ if (NumElts >= 4) {
+ SDValue shuffle = ReconstructShuffle(Op, DAG);
+ if (shuffle != SDValue())
+ return shuffle;
+ }
+
+ // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+ // know the default expansion would otherwise fall back on something even
+ // worse. For a vector with one or two non-undef values, that's
+ // scalar_to_vector for the elements followed by a shuffle (provided the
+ // shuffle is valid for the target) and materialization element by element
+ // on the stack followed by a load for everything else.
+ if (!isConstant && !usesOnlyOneValue) {
+ SDValue Vec = DAG.getUNDEF(VT);
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ElemSize = VT.getVectorElementType().getSizeInBits();
+ unsigned i = 0;
+ // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
+ // a) Avoid a RMW dependency on the full vector register, and
+ // b) Allow the register coalescer to fold away the copy if the
+ // value is already in an S or D register.
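+ // A sketch of what this emits for a v2f64 whose lane 0 value already lives
+ // in a D register: (INSERT_SUBREG (v2f64 undef), Op0, dsub), followed by
+ // INSERT_VECTOR_ELT for the remaining lanes.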
+ if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
+ unsigned SubIdx = ElemSize == 32 ? ARM64::ssub : ARM64::dsub;
+ MachineSDNode *N =
+ DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
+ DAG.getTargetConstant(SubIdx, MVT::i32));
+ Vec = SDValue(N, 0);
+ ++i;
+ }
+ for (; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.getOpcode() == ISD::UNDEF)
+ continue;
+ SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+ }
+ return Vec;
+ }
+
+ // Just use the default expansion. We failed to find a better alternative.
+ return SDValue();
+}
+
+SDValue ARM64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
+
+ // Check for non-constant lane.
+ if (!isa<ConstantSDNode>(Op.getOperand(2)))
+ return SDValue();
+
+ EVT VT = Op.getOperand(0).getValueType();
+
+ // Insertion/extraction are legal for V128 types.
+ if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+ return Op;
+
+ if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
+ VT != MVT::v1i64 && VT != MVT::v2f32)
+ return SDValue();
+
+ // For V64 types, we perform insertion by expanding the value
+ // to a V128 type and perform the insertion on that.
+ SDLoc DL(Op);
+ SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
+ EVT WideTy = WideVec.getValueType();
+
+ SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
+ Op.getOperand(1), Op.getOperand(2));
+ // Re-narrow the resultant vector.
+ return NarrowVector(Node, DAG);
+}
+
+SDValue ARM64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
+
+ // Check for non-constant lane.
+ if (!isa<ConstantSDNode>(Op.getOperand(1)))
+ return SDValue();
+
+ EVT VT = Op.getOperand(0).getValueType();
+
+ // Insertion/extraction are legal for V128 types.
+ if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+ return Op;
+
+ if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
+ VT != MVT::v1i64 && VT != MVT::v2f32)
+ return SDValue();
+
+ // For V64 types, we perform extraction by expanding the value
+ // to a V128 type and perform the extraction on that.
+ SDLoc DL(Op);
+ SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
+ EVT WideTy = WideVec.getValueType();
+
+ EVT ExtrTy = WideTy.getVectorElementType();
+ if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
+ ExtrTy = MVT::i32;
+
+ // For extractions, we just return the result directly.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
+ Op.getOperand(1));
+}
+
+SDValue ARM64TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::SCALAR_TO_VECTOR && "Unknown opcode!");
+ // Some AdvSIMD intrinsics leave their results in the scalar B/H/S/D
+ // registers. The default lowering will copy those to a GPR then back
+ // to a vector register. Instead, just recognize those cases and reference
+ // the vector register they're already a subreg of.
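+ // For example, (scalar_to_vector (i32 (int_arm64_neon_uaddv ...))) becomes
+ // an INSERT_SUBREG of the intrinsic result into an undef vector at ssub,
+ // avoiding the round-trip through a GPR.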
+ SDValue Op0 = Op->getOperand(0);
+ if (Op0->getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+ return Op;
+ unsigned IID = getIntrinsicID(Op0.getNode());
+ // The below list of intrinsics isn't exhaustive. Add cases as-needed.
+ // FIXME: Even better would be if there were an attribute on the node
+ // that we could query and set in the intrinsics definition or something.
+ unsigned SubIdx;
+ switch (IID) {
+ default:
+ // Early exit if this isn't one of the intrinsics we handle.
+ return Op;
+ case Intrinsic::arm64_neon_uaddv:
+ case Intrinsic::arm64_neon_saddv:
+ case Intrinsic::arm64_neon_uaddlv:
+ case Intrinsic::arm64_neon_saddlv:
+ switch (Op0.getValueType().getSizeInBits()) {
+ default:
+ llvm_unreachable("Illegal result size from ARM64 vector intrinsic!");
+ case 8:
+ SubIdx = ARM64::bsub;
+ break;
+ case 16:
+ SubIdx = ARM64::hsub;
+ break;
+ case 32:
+ SubIdx = ARM64::ssub;
+ break;
+ case 64:
+ SubIdx = ARM64::dsub;
+ break;
+ }
+ }
+ MachineSDNode *N =
+ DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(Op),
+ Op.getValueType(), DAG.getUNDEF(Op.getValueType()),
+ Op0, DAG.getTargetConstant(SubIdx, MVT::i32));
+ return SDValue(N, 0);
+}
+
+SDValue ARM64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getOperand(0).getValueType();
+ SDLoc dl(Op);
+ // Just in case...
+ if (!VT.isVector())
+ return SDValue();
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!Cst)
+ return SDValue();
+ unsigned Val = Cst->getZExtValue();
+
+ unsigned Size = Op.getValueType().getSizeInBits();
+ if (Val == 0) {
+ switch (Size) {
+ case 8:
+ return DAG.getTargetExtractSubreg(ARM64::bsub, dl, Op.getValueType(),
+ Op.getOperand(0));
+ case 16:
+ return DAG.getTargetExtractSubreg(ARM64::hsub, dl, Op.getValueType(),
+ Op.getOperand(0));
+ case 32:
+ return DAG.getTargetExtractSubreg(ARM64::ssub, dl, Op.getValueType(),
+ Op.getOperand(0));
+ case 64:
+ return DAG.getTargetExtractSubreg(ARM64::dsub, dl, Op.getValueType(),
+ Op.getOperand(0));
+ default:
+ llvm_unreachable("Unexpected vector type in extract_subvector!");
+ }
+ }
+ // If this is extracting the upper 64-bits of a 128-bit vector, we match
+ // that directly.
+ if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
+ return Op;
+
+ return SDValue();
+}
+
+bool ARM64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+ EVT VT) const {
+ if (VT.getVectorNumElements() == 4 &&
+ (VT.is128BitVector() || VT.is64BitVector())) {
+ unsigned PFIndexes[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ if (M[i] < 0)
+ PFIndexes[i] = 8;
+ else
+ PFIndexes[i] = M[i];
+ }
+
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+ PFIndexes[2] * 9 + PFIndexes[3];
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ unsigned Cost = (PFEntry >> 30);
+
+ if (Cost <= 4)
+ return true;
+ }
+
+ bool ReverseVEXT;
+ unsigned Imm, WhichResult;
+
+ return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
+ isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
+ isEXTMask(M, VT, ReverseVEXT, Imm) ||
+ // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
+ isTRNMask(M, VT, WhichResult) || isUZPMask(M, VT, WhichResult) ||
+ isZIPMask(M, VT, WhichResult) ||
+ isTRN_v_undef_Mask(M, VT, WhichResult) ||
+ isUZP_v_undef_Mask(M, VT, WhichResult) ||
+ isZIP_v_undef_Mask(M, VT, WhichResult));
+}
+
+/// getVShiftImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+ // Ignore bit_converts.
+ while (Op.getOpcode() == ISD::BITCAST)
+ Op = Op.getOperand(0);
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+ HasAnyUndefs, ElementBits) ||
+ SplatBitSize > ElementBits)
+ return false;
+ Cnt = SplatBits.getSExtValue();
+ return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation. That value must be in the range:
+/// 0 <= Value < ElementBits for a left shift; or
+/// 0 <= Value <= ElementBits for a long left shift.
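+/// For example, for v8i16 a plain shift left accepts counts 0..15, while a
+/// long shift left additionally accepts 16.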
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ if (!getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation. For a shift opcode, the value
+/// is positive, but for an intrinsic the shift count must be negative. The
+/// absolute value must be in the range:
+/// 1 <= |Value| <= ElementBits for a right shift; or
+/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
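+/// For example, for v4i32 a right shift accepts counts 1..32, while a
+/// narrowing right shift only accepts 1..16.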
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+ int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ if (!getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ if (isIntrinsic)
+ Cnt = -Cnt;
+ return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
+}
+
+SDValue ARM64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ int64_t Cnt;
+
+ if (!Op.getOperand(1).getValueType().isVector())
+ return Op;
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("unexpected shift opcode");
+
+ case ISD::SHL:
+ if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
+ return DAG.getNode(ARM64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0),
+ DAG.getConstant(Cnt, MVT::i32));
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::arm64_neon_ushl, MVT::i32),
+ Op.getOperand(0), Op.getOperand(1));
+ case ISD::SRA:
+ case ISD::SRL:
+ // Right shift immediate
+ if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
+ Cnt < EltSize) {
+ unsigned Opc =
+ (Op.getOpcode() == ISD::SRA) ? ARM64ISD::VASHR : ARM64ISD::VLSHR;
+ return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0),
+ DAG.getConstant(Cnt, MVT::i32));
+ }
+
+ // Right shift register. Note that there is no shift-right-by-register
+ // instruction; instead, the shift-left-by-register instruction takes a
+ // signed value, where negative amounts specify a right shift.
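+ // E.g. (srl X, Y) becomes (ushl X, (neg Y)) and (sra X, Y) becomes
+ // (sshl X, (neg Y)).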
+ unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::arm64_neon_sshl
+ : Intrinsic::arm64_neon_ushl;
+ // negate the shift amount
+ SDValue NegShift = DAG.getNode(ARM64ISD::NEG, DL, VT, Op.getOperand(1));
+ SDValue NegShiftLeft =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift);
+ return NegShiftLeft;
+ }
+
+ return SDValue();
+}
+
+static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
+ ARM64CC::CondCode CC, bool NoNans, EVT VT,
+ SDLoc dl, SelectionDAG &DAG) {
+ EVT SrcVT = LHS.getValueType();
+
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
+ APInt CnstBits(VT.getSizeInBits(), 0);
+ APInt UndefBits(VT.getSizeInBits(), 0);
+ bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
+ bool IsZero = IsCnst && (CnstBits == 0);
+
+ if (SrcVT.getVectorElementType().isFloatingPoint()) {
+ switch (CC) {
+ default:
+ return SDValue();
+ case ARM64CC::NE: {
+ SDValue Fcmeq;
+ if (IsZero)
+ Fcmeq = DAG.getNode(ARM64ISD::FCMEQz, dl, VT, LHS);
+ else
+ Fcmeq = DAG.getNode(ARM64ISD::FCMEQ, dl, VT, LHS, RHS);
+ return DAG.getNode(ARM64ISD::NOT, dl, VT, Fcmeq);
+ }
+ case ARM64CC::EQ:
+ if (IsZero)
+ return DAG.getNode(ARM64ISD::FCMEQz, dl, VT, LHS);
+ return DAG.getNode(ARM64ISD::FCMEQ, dl, VT, LHS, RHS);
+ case ARM64CC::GE:
+ if (IsZero)
+ return DAG.getNode(ARM64ISD::FCMGEz, dl, VT, LHS);
+ return DAG.getNode(ARM64ISD::FCMGE, dl, VT, LHS, RHS);
+ case ARM64CC::GT:
+ if (IsZero)
+ return DAG.getNode(ARM64ISD::FCMGTz, dl, VT, LHS);
+ return DAG.getNode(ARM64ISD::FCMGT, dl, VT, LHS, RHS);
+ case ARM64CC::LS:
+ if (IsZero)
+ return DAG.getNode(ARM64ISD::FCMLEz, dl, VT, LHS);
+ return DAG.getNode(ARM64ISD::FCMGE, dl, VT, RHS, LHS);
+ case ARM64CC::LT:
+ if (!NoNans)
+ return SDValue();
+ // If we ignore NaNs then we can use the MI implementation.
+ // Fallthrough.
+ case ARM64CC::MI:
+ if (IsZero)
+ return DAG.getNode(ARM64ISD::FCMLTz, dl, VT, LHS);
+ return DAG.getNode(ARM64ISD::FCMGT, dl, VT, RHS, LHS);
+ }
+ }
+
+ switch (CC) {
+ default:
+ return SDValue();
+ case ARM64CC::NE: {
+ SDValue Cmeq;
+ if (IsZero)
+ Cmeq = DAG.getNode(ARM64ISD::CMEQz, dl, VT, LHS);
+ else
+ Cmeq = DAG.getNode(ARM64ISD::CMEQ, dl, VT, LHS, RHS);
+ return DAG.getNode(ARM64ISD::NOT, dl, VT, Cmeq);
+ }
+ case ARM64CC::EQ:
+ if (IsZero)
+ return DAG.getNode(ARM64ISD::CMEQz, dl, VT, LHS);
+ return DAG.getNode(ARM64ISD::CMEQ, dl, VT, LHS, RHS);
+ case ARM64CC::GE:
+ if (IsZero)
+ return DAG.getNode(ARM64ISD::CMGEz, dl, VT, LHS);
+ return DAG.getNode(ARM64ISD::CMGE, dl, VT, LHS, RHS);
+ case ARM64CC::GT:
+ if (IsZero)
+ return DAG.getNode(ARM64ISD::CMGTz, dl, VT, LHS);
+ return DAG.getNode(ARM64ISD::CMGT, dl, VT, LHS, RHS);
+ case ARM64CC::LE:
+ if (IsZero)
+ return DAG.getNode(ARM64ISD::CMLEz, dl, VT, LHS);
+ return DAG.getNode(ARM64ISD::CMGE, dl, VT, RHS, LHS);
+ case ARM64CC::LS:
+ return DAG.getNode(ARM64ISD::CMHS, dl, VT, RHS, LHS);
+ case ARM64CC::CC:
+ return DAG.getNode(ARM64ISD::CMHI, dl, VT, RHS, LHS);
+ case ARM64CC::LT:
+ if (IsZero)
+ return DAG.getNode(ARM64ISD::CMLTz, dl, VT, LHS);
+ return DAG.getNode(ARM64ISD::CMGT, dl, VT, RHS, LHS);
+ case ARM64CC::HI:
+ return DAG.getNode(ARM64ISD::CMHI, dl, VT, LHS, RHS);
+ case ARM64CC::CS:
+ return DAG.getNode(ARM64ISD::CMHS, dl, VT, LHS, RHS);
+ }
+}
+
+SDValue ARM64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDLoc dl(Op);
+
+ if (LHS.getValueType().getVectorElementType().isInteger()) {
+ assert(LHS.getValueType() == RHS.getValueType());
+ ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC);
+ return EmitVectorComparison(LHS, RHS, ARM64CC, false, Op.getValueType(), dl,
+ DAG);
+ }
+
+ assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
+ LHS.getValueType().getVectorElementType() == MVT::f64);
+
+ // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally
+ // clean. Some of them require two branches to implement.
+ ARM64CC::CondCode CC1, CC2;
+ changeFPCCToARM64CC(CC, CC1, CC2);
+
+ bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
+ SDValue Cmp1 =
+ EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
+ if (!Cmp1.getNode())
+ return SDValue();
+
+ if (CC2 != ARM64CC::AL) {
+ SDValue Cmp2 =
+ EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
+ if (!Cmp2.getNode())
+ return SDValue();
+
+ return DAG.getNode(ISD::OR, dl, Cmp1.getValueType(), Cmp1, Cmp2);
+ }
+
+ return Cmp1;
+}
+
+/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
+/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
+/// specified in the intrinsic calls.
+bool ARM64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &I,
+ unsigned Intrinsic) const {
+ switch (Intrinsic) {
+ case Intrinsic::arm64_neon_ld2:
+ case Intrinsic::arm64_neon_ld3:
+ case Intrinsic::arm64_neon_ld4:
+ case Intrinsic::arm64_neon_ld2lane:
+ case Intrinsic::arm64_neon_ld3lane:
+ case Intrinsic::arm64_neon_ld4lane:
+ case Intrinsic::arm64_neon_ld2r:
+ case Intrinsic::arm64_neon_ld3r:
+ case Intrinsic::arm64_neon_ld4r: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ // Conservatively set memVT to the entire set of vectors loaded.
+ uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.offset = 0;
+ Info.align = 0;
+ Info.vol = false; // volatile loads with NEON intrinsics not supported
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ case Intrinsic::arm64_neon_st2:
+ case Intrinsic::arm64_neon_st3:
+ case Intrinsic::arm64_neon_st4:
+ case Intrinsic::arm64_neon_st2lane:
+ case Intrinsic::arm64_neon_st3lane:
+ case Intrinsic::arm64_neon_st4lane: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Conservatively set memVT to the entire set of vectors stored.
+ unsigned NumElts = 0;
+ for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+ Type *ArgTy = I.getArgOperand(ArgI)->getType();
+ if (!ArgTy->isVectorTy())
+ break;
+ NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+ }
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.offset = 0;
+ Info.align = 0;
+ Info.vol = false; // volatile stores with NEON intrinsics not supported
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ case Intrinsic::arm64_ldxr: {
+ PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+ Info.vol = true;
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ case Intrinsic::arm64_stxr: {
+ PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.ptrVal = I.getArgOperand(1);
+ Info.offset = 0;
+ Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+ Info.vol = true;
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ case Intrinsic::arm64_ldxp: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i128;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = 16;
+ Info.vol = true;
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ case Intrinsic::arm64_stxp: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i128;
+ Info.ptrVal = I.getArgOperand(2);
+ Info.offset = 0;
+ Info.align = 16;
+ Info.vol = true;
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ default:
+ break;
+ }
+
+ return false;
+}
+
+// Truncations from 64-bit GPR to 32-bit GPR are free.
+bool ARM64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ if (NumBits1 <= NumBits2)
+ return false;
+ return true;
+}
+bool ARM64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+ if (!VT1.isInteger() || !VT2.isInteger())
+ return false;
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ if (NumBits1 <= NumBits2)
+ return false;
+ return true;
+}
+
+// All 32-bit GPR operations implicitly zero the high-half of the corresponding
+// 64-bit GPR.
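+// E.g. "add w0, w1, w2" also clears bits [63:32] of x0, so zero-extending a
+// 32-bit value that was just produced costs nothing.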
+bool ARM64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ if (NumBits1 == 32 && NumBits2 == 64)
+ return true;
+ return false;
+}
+bool ARM64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+ if (!VT1.isInteger() || !VT2.isInteger())
+ return false;
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ if (NumBits1 == 32 && NumBits2 == 64)
+ return true;
+ return false;
+}
+
+bool ARM64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ EVT VT1 = Val.getValueType();
+ if (isZExtFree(VT1, VT2)) {
+ return true;
+ }
+
+ if (Val.getOpcode() != ISD::LOAD)
+ return false;
+
+ // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
+ return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() &&
+ VT2.isInteger() && VT1.getSizeInBits() <= 32);
+}
+
+bool ARM64TargetLowering::hasPairedLoad(Type *LoadedType,
+ unsigned &RequiredAligment) const {
+ if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
+ return false;
+ // Cyclone supports unaligned accesses.
+ RequiredAligment = 0;
+ unsigned NumBits = LoadedType->getPrimitiveSizeInBits();
+ return NumBits == 32 || NumBits == 64;
+}
+
+bool ARM64TargetLowering::hasPairedLoad(EVT LoadedType,
+ unsigned &RequiredAligment) const {
+ if (!LoadedType.isSimple() ||
+ (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
+ return false;
+ // Cyclone supports unaligned accesses.
+ RequiredAligment = 0;
+ unsigned NumBits = LoadedType.getSizeInBits();
+ return NumBits == 32 || NumBits == 64;
+}
+
+static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
+ unsigned AlignCheck) {
+ return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
+ (DstAlign == 0 || DstAlign % AlignCheck == 0));
+}
+
+EVT ARM64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+ unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ MachineFunction &MF) const {
+ // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
+ // instruction to materialize the v2i64 zero and one store (with restrictive
+ // addressing mode). Just do two i64 stores of zero-registers.
+ bool Fast;
+ const Function *F = MF.getFunction();
+ if (!IsMemset && Size >= 16 &&
+ !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoImplicitFloat) &&
+ (memOpAlign(SrcAlign, DstAlign, 16) ||
+ (allowsUnalignedMemoryAccesses(MVT::v2i64, 0, &Fast) && Fast)))
+ return MVT::v2i64;
+
+ return Size >= 8 ? MVT::i64 : MVT::i32;
+}
+
+// 12-bit optionally shifted immediates are legal for adds.
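+// E.g. #4095 and #4096 (encoded as #1, lsl #12) are legal, while #4097 is
+// not, since it is neither a 12-bit value nor a 12-bit value shifted by 12.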
+bool ARM64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
+ if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
+ return true;
+ return false;
+}
+
+// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
+// immediates is the same as for an add or a sub.
+bool ARM64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
+ if (Immed < 0)
+ Immed *= -1;
+ return isLegalAddImmediate(Immed);
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool ARM64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ Type *Ty) const {
+ // ARM64 has five basic addressing modes:
+ // reg
+ // reg + 9-bit signed offset
+ // reg + SIZE_IN_BYTES * 12-bit unsigned offset
+ // reg1 + reg2
+ // reg + SIZE_IN_BYTES * reg
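+ // E.g. for an i64 load: "ldr x0, [x1]", "ldur x0, [x1, #-256]",
+ // "ldr x0, [x1, #32760]", "ldr x0, [x1, x2]", "ldr x0, [x1, x2, lsl #3]".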
+
+ // No global is ever allowed as a base.
+ if (AM.BaseGV)
+ return false;
+
+ // No reg+reg+imm addressing.
+ if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
+ return false;
+
+ // check reg + imm case:
+ // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
+ uint64_t NumBytes = 0;
+ if (Ty->isSized()) {
+ uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
+ NumBytes = NumBits / 8;
+ if (!isPowerOf2_64(NumBits))
+ NumBytes = 0;
+ }
+
+ if (!AM.Scale) {
+ int64_t Offset = AM.BaseOffs;
+
+ // 9-bit signed offset
+ if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
+ return true;
+
+ // 12-bit unsigned offset
+ unsigned shift = Log2_64(NumBytes);
+ if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
+ // Must be a multiple of NumBytes (NumBytes is a power of 2)
+ (Offset >> shift) << shift == Offset)
+ return true;
+ return false;
+ }
+
+ // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
+
+ if (!AM.Scale || AM.Scale == 1 ||
+ (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
+ return true;
+ return false;
+}
+
+int ARM64TargetLowering::getScalingFactorCost(const AddrMode &AM,
+ Type *Ty) const {
+ // Scaling factors are not free at all.
+ // Operands | Rt Latency
+ // -------------------------------------------
+ // Rt, [Xn, Xm] | 4
+ // -------------------------------------------
+ // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
+ // Rt, [Xn, Wm, <extend> #imm] |
+ if (isLegalAddressingMode(AM, Ty))
+ // Scale represents reg2 * scale, so charge a cost of 1 if
+ // it is not equal to 0 or 1.
+ return AM.Scale != 0 && AM.Scale != 1;
+ return -1;
+}
+
+bool ARM64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+ VT = VT.getScalarType();
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ case MVT::f64:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+const uint16_t *
+ARM64TargetLowering::getScratchRegisters(CallingConv::ID) const {
+ // LR is a callee-save register, but we must treat it as clobbered by any call
+ // site. Hence we include LR in the scratch registers, which are in turn added
+ // as implicit-defs for stackmaps and patchpoints.
+ static const uint16_t ScratchRegs[] = {
+ ARM64::X16, ARM64::X17, ARM64::LR, 0
+ };
+ return ScratchRegs;
+}
+
+bool ARM64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return false;
+
+ int64_t Val = Imm.getSExtValue();
+ if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, BitSize))
+ return true;
+
+ if ((int64_t)Val < 0)
+ Val = ~Val;
+ if (BitSize == 32)
+ Val &= (1LL << 32) - 1;
+
+ unsigned LZ = countLeadingZeros((uint64_t)Val);
+ unsigned Shift = (63 - LZ) / 16;
+ // MOVZ is free so return true for one or fewer MOVK.
+ return (Shift < 3) ? true : false;
+}
+
+// Generate SUBS and CSEL for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+ // and change it to SUB and CSEL.
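+ // I.e. for i32, with Y = (sra X, 31), abs(X) = (xor (add X, Y), Y); this is
+ // rewritten as (csel X, (sub 0, X), pl) using the flags of (subs X, #0).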
+ if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+ N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
+ N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
+ if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+ if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
+ SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
+ N0.getOperand(0));
+ // Generate SUBS & CSEL.
+ SDValue Cmp =
+ DAG.getNode(ARM64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
+ N0.getOperand(0), DAG.getConstant(0, VT));
+ return DAG.getNode(ARM64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
+ DAG.getConstant(ARM64CC::PL, MVT::i32),
+ SDValue(Cmp.getNode(), 1));
+ }
+ return SDValue();
+}
+
+// performXorCombine - Attempts to handle integer ABS.
+static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARM64Subtarget *Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ return performIntegerAbsCombine(N, DAG);
+}
+
+static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARM64Subtarget *Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // Multiplication by a power of two plus/minus one can be done more
+ // cheaply as a shift+add/sub. For now, this is true unilaterally. If
+ // future CPUs have a cheaper MADD instruction, this may need to be
+ // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
+ // 64-bit is 5 cycles, so this is always a win.
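+ // For example, "x * 7" becomes "(x << 3) - x" and "x * 9" becomes
+ // "(x << 3) + x".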
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ APInt Value = C->getAPIntValue();
+ EVT VT = N->getValueType(0);
+ APInt VP1 = Value + 1;
+ if (VP1.isPowerOf2()) {
+ // Multiplying by one less than a power of two, replace with a shift
+ // and a subtract.
+ SDValue ShiftedVal =
+ DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+ DAG.getConstant(VP1.logBase2(), MVT::i64));
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+ }
+ APInt VM1 = Value - 1;
+ if (VM1.isPowerOf2()) {
+ // Multiplying by one more than a power of two, replace with a shift
+ // and an add.
+ SDValue ShiftedVal =
+ DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+ DAG.getConstant(VM1.logBase2(), MVT::i64));
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+ }
+ }
+ return SDValue();
+}
+
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::f32 && VT != MVT::f64)
+ return SDValue();
+ // Only optimize when the source and destination types have the same width.
+ if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
+ return SDValue();
+
+ // If the result of an integer load is only used by an integer-to-float
+ // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
+ // This eliminates an "integer-to-vector-move" UOP and improves throughput.
+ SDValue N0 = N->getOperand(0);
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+ // Do not change the width of a volatile load.
+ !cast<LoadSDNode>(N0)->isVolatile()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
+ LN0->getPointerInfo(), LN0->isVolatile(),
+ LN0->isNonTemporal(), LN0->isInvariant(),
+ LN0->getAlignment());
+
+ // Make sure successors of the original load stay after it by updating them
+ // to use the new Chain.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
+
+ unsigned Opcode =
+ (N->getOpcode() == ISD::SINT_TO_FP) ? ARM64ISD::SITOF : ARM64ISD::UITOF;
+ return DAG.getNode(Opcode, SDLoc(N), VT, Load);
+ }
+
+ return SDValue();
+}
+
+/// An EXTR instruction is made up of two shifts, ORed together. This helper
+/// searches for and classifies those shifts.
+static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
+ bool &FromHi) {
+ if (N.getOpcode() == ISD::SHL)
+ FromHi = false;
+ else if (N.getOpcode() == ISD::SRL)
+ FromHi = true;
+ else
+ return false;
+
+ if (!isa<ConstantSDNode>(N.getOperand(1)))
+ return false;
+
+ ShiftAmount = N->getConstantOperandVal(1);
+ Src = N->getOperand(0);
+ return true;
+}
+
+/// EXTR instruction extracts a contiguous chunk of bits from two existing
+/// registers viewed as a high/low pair. This function looks for the pattern:
+/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
+/// EXTR. Can't quite be done in TableGen because the two immediates aren't
+/// independent.
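+/// For example, for i64, (or (shl x, #16), (srl y, #48)) becomes
+/// (ARM64ISD::EXTR x, y, #48): the low 48 bits of x followed by the high 16
+/// bits of y.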
+static SDValue tryCombineToEXTR(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ assert(N->getOpcode() == ISD::OR && "Unexpected root");
+
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ SDValue LHS;
+ uint32_t ShiftLHS = 0;
+ bool LHSFromHi = false;
+ if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
+ return SDValue();
+
+ SDValue RHS;
+ uint32_t ShiftRHS = 0;
+ bool RHSFromHi = false;
+ if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
+ return SDValue();
+
+ // If they're both trying to come from the high part of the register, they're
+ // not really an EXTR.
+ if (LHSFromHi == RHSFromHi)
+ return SDValue();
+
+ if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
+ return SDValue();
+
+ if (LHSFromHi) {
+ std::swap(LHS, RHS);
+ std::swap(ShiftLHS, ShiftRHS);
+ }
+
+ return DAG.getNode(ARM64ISD::EXTR, DL, VT, LHS, RHS,
+ DAG.getConstant(ShiftRHS, MVT::i64));
+}
+
+static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const ARM64Subtarget *Subtarget) {
+ // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
+ if (!EnableARM64ExtrGeneration)
+ return SDValue();
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDValue Res = tryCombineToEXTR(N, DCI);
+ if (Res.getNode())
+ return Res;
+
+ return SDValue();
+}
+
+static SDValue performBitcastCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ // Wait 'til after everything is legalized to try this. That way we have
+ // legal vector types and such.
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // Remove extraneous bitcasts around an extract_subvector.
+ // For example,
+ // (v4i16 (bitconvert
+ // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
+ // becomes
+ // (extract_subvector ((v8i16 ...), (i64 4)))
+
+ // Only interested in 64-bit vectors as the ultimate result.
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+ if (VT.getSimpleVT().getSizeInBits() != 64)
+ return SDValue();
+ // Is the operand an extract_subvector starting at the beginning or halfway
+ // point of the vector? A low half may also come through as an
+ // EXTRACT_SUBREG, so look for that, too.
+ SDValue Op0 = N->getOperand(0);
+ if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
+ !(Op0->isMachineOpcode() &&
+ Op0->getMachineOpcode() == ARM64::EXTRACT_SUBREG))
+ return SDValue();
+ uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
+ if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
+ return SDValue();
+ } else if (Op0->getMachineOpcode() == ARM64::EXTRACT_SUBREG) {
+ if (idx != ARM64::dsub)
+ return SDValue();
+ // The dsub reference is equivalent to a lane zero subvector reference.
+ idx = 0;
+ }
+ // Look through the bitcast of the input to the extract.
+ if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
+ return SDValue();
+ SDValue Source = Op0->getOperand(0)->getOperand(0);
+ // If the source type has twice the number of elements as our destination
+ // type, we know this is an extract of the high or low half of the vector.
+ EVT SVT = Source->getValueType(0);
+ if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
+ return SDValue();
+
+ DEBUG(dbgs() << "arm64-lower: bitcast extract_subvector simplification\n");
+
+ // Create the simplified form to just extract the low or high half of the
+ // vector directly rather than bothering with the bitcasts.
+ SDLoc dl(N);
+ unsigned NumElements = VT.getVectorNumElements();
+ if (idx) {
+ SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
+ } else {
+ SDValue SubReg = DAG.getTargetConstant(ARM64::dsub, MVT::i32);
+ return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
+ Source, SubReg),
+ 0);
+ }
+}
+
+static SDValue performConcatVectorsCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ // Wait 'til after everything is legalized to try this. That way we have
+ // legal vector types and such.
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
+ // splat. The indexed instructions are going to be expecting a DUPLANE64, so
+ // canonicalise to that.
+ if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) {
+ assert(VT.getVectorElementType().getSizeInBits() == 64);
+ return DAG.getNode(ARM64ISD::DUPLANE64, dl, VT,
+ WidenVector(N->getOperand(0), DAG),
+ DAG.getConstant(0, MVT::i64));
+ }
+
+ // Canonicalise concat_vectors so that the right-hand vector has as few
+ // bit-casts as possible before its real operation. The primary matching
+ // destination for these operations will be the narrowing "2" instructions,
+ // which depend on the operation being performed on this right-hand vector.
+ // For example,
+ // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
+ // becomes
+ // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
+
+ SDValue Op1 = N->getOperand(1);
+ if (Op1->getOpcode() != ISD::BITCAST)
+ return SDValue();
+ SDValue RHS = Op1->getOperand(0);
+ MVT RHSTy = RHS.getValueType().getSimpleVT();
+ // If the RHS is not a vector, this is not the pattern we're looking for.
+ if (!RHSTy.isVector())
+ return SDValue();
+
+ DEBUG(dbgs() << "arm64-lower: concat_vectors bitcast simplification\n");
+
+ MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
+ RHSTy.getVectorNumElements() * 2);
+ return DAG.getNode(
+ ISD::BITCAST, dl, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
+ DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS));
+}
+
+static SDValue tryCombineFixedPointConvert(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ // Wait 'til after everything is legalized to try this. That way we have
+ // legal vector types and such.
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+ // Transform a scalar conversion of a value from a lane extract into a
+ // lane extract of a vector conversion. E.g., from foo1 to foo2:
+ // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
+ // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
+ //
+ // The second form interacts better with instruction selection and the
+ // register allocator to avoid cross-class register copies that aren't
+ // coalescable due to a lane reference.
+
+ // Check the operand and see if it originates from a lane extract.
+ SDValue Op1 = N->getOperand(1);
+ if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ // Yep, no additional predication needed. Perform the transform.
+ SDValue IID = N->getOperand(0);
+ SDValue Shift = N->getOperand(2);
+ SDValue Vec = Op1.getOperand(0);
+ SDValue Lane = Op1.getOperand(1);
+ EVT ResTy = N->getValueType(0);
+ EVT VecResTy;
+ SDLoc DL(N);
+
+ // The vector width should be 128 bits by the time we get here, even
+ // if it started as 64 bits (the extract_vector_elt handling will have
+ // widened it already).
+ assert(Vec.getValueType().getSizeInBits() == 128 &&
+ "unexpected vector size on extract_vector_elt!");
+ if (Vec.getValueType() == MVT::v4i32)
+ VecResTy = MVT::v4f32;
+ else if (Vec.getValueType() == MVT::v2i64)
+ VecResTy = MVT::v2f64;
+ else
+ assert(0 && "unexpected vector type!");
+
+ SDValue Convert =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
+ }
+ return SDValue();
+}
+
+// AArch64 high-vector "long" operations are formed by performing the non-high
+// version on an extract_subvector of each operand which gets the high half:
+//
+// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
+//
+// However, there are cases which don't have an extract_high explicitly, but
+// have another operation that can be made compatible with one for free. For
+// example:
+//
+// (dupv64 scalar) --> (extract_high (dup128 scalar))
+//
+// This routine does the actual conversion of such DUPs, once outer routines
+// have determined that everything else is in order.
+static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
+ // We can handle most types of duplicate, but the lane ones have an extra
+ // operand saying *which* lane, so we need to know.
+ bool IsDUPLANE;
+ switch (N.getOpcode()) {
+ case ARM64ISD::DUP:
+ IsDUPLANE = false;
+ break;
+ case ARM64ISD::DUPLANE8:
+ case ARM64ISD::DUPLANE16:
+ case ARM64ISD::DUPLANE32:
+ case ARM64ISD::DUPLANE64:
+ IsDUPLANE = true;
+ break;
+ default:
+ return SDValue();
+ }
+
+ MVT NarrowTy = N.getSimpleValueType();
+ if (!NarrowTy.is64BitVector())
+ return SDValue();
+
+ MVT ElementTy = NarrowTy.getVectorElementType();
+ unsigned NumElems = NarrowTy.getVectorNumElements();
+ MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+
+ SDValue NewDUP;
+ if (IsDUPLANE)
+ NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0),
+ N.getOperand(1));
+ else
+ NewDUP = DAG.getNode(ARM64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0));
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy,
+ NewDUP, DAG.getConstant(NumElems, MVT::i64));
+}
+
+static bool isEssentiallyExtractSubvector(SDValue N) {
+ if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ return true;
+
+ return N.getOpcode() == ISD::BITCAST &&
+ N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
+}
+
+/// \brief Helper structure to keep track of ISD::SET_CC operands.
+struct GenericSetCCInfo {
+ const SDValue *Opnd0;
+ const SDValue *Opnd1;
+ ISD::CondCode CC;
+};
+
+/// \brief Helper structure to keep track of a SET_CC lowered into ARM64 code.
+struct ARM64SetCCInfo {
+ const SDValue *Cmp;
+ ARM64CC::CondCode CC;
+};
+
+/// \brief Helper structure to keep track of SetCC information.
+union SetCCInfo {
+ GenericSetCCInfo Generic;
+ ARM64SetCCInfo ARM64;
+};
+
+/// \brief Helper structure to be able to read SetCC information.
+/// If the IsARM64 field is set to true, Info is an ARM64SetCCInfo; otherwise
+/// Info is a GenericSetCCInfo.
+struct SetCCInfoAndKind {
+ SetCCInfo Info;
+ bool IsARM64;
+};
+
+/// \brief Check whether or not \p Op is a SET_CC operation, either a generic
+/// one or an ARM64 lowered one.
+/// \p SetCCInfo is filled accordingly.
+/// \post SetCCInfo is meaningful only when this function returns true.
+/// \return True when Op is a kind of SET_CC operation.
+static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
+ // If this is a setcc, this is straightforward.
+ if (Op.getOpcode() == ISD::SETCC) {
+ SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
+ SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
+ SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ SetCCInfo.IsARM64 = false;
+ return true;
+ }
+ // Otherwise, check if this is a matching csel instruction.
+ // In other words:
+ // - csel 1, 0, cc
+ // - csel 0, 1, !cc
+ if (Op.getOpcode() != ARM64ISD::CSEL)
+ return false;
+ // Set the information about the operands.
+ // TODO: we want the operands of the Cmp not the csel
+ SetCCInfo.Info.ARM64.Cmp = &Op.getOperand(3);
+ SetCCInfo.IsARM64 = true;
+ SetCCInfo.Info.ARM64.CC = static_cast<ARM64CC::CondCode>(
+ cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+ // Check that the operands match the constraints:
+ // (1) Both operands must be constants.
+ // (2) One must be 1 and the other must be 0.
+ ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+ ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+ // Check (1).
+ if (!TValue || !FValue)
+ return false;
+
+ // Check (2).
+ if (!TValue->isOne()) {
+ // Update the comparison when we are interested in !cc.
+ std::swap(TValue, FValue);
+ SetCCInfo.Info.ARM64.CC =
+ ARM64CC::getInvertedCondCode(SetCCInfo.Info.ARM64.CC);
+ }
+ return TValue->isOne() && FValue->isNullValue();
+}
+
+// The folding we want to perform is:
+// (add x, (setcc cc ...) )
+// -->
+// (csel x, (add x, 1), !cc ...)
+//
+// The latter will get matched to a CSINC instruction.
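+//
+// For example (roughly): (add w0, (setcc eq w1, w2)) ends up as
+//   cmp   w1, w2
+//   csinc w0, w0, w0, ne
+// i.e. w0 is incremented exactly when w1 == w2.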
+static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
+ assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
+ SDValue LHS = Op->getOperand(0);
+ SDValue RHS = Op->getOperand(1);
+ SetCCInfoAndKind InfoAndKind;
+
+ // If neither operand is a SET_CC, give up.
+ if (!isSetCC(LHS, InfoAndKind)) {
+ std::swap(LHS, RHS);
+ if (!isSetCC(LHS, InfoAndKind))
+ return SDValue();
+ }
+
+ // FIXME: This could be generalized to work for FP comparisons.
+ EVT CmpVT = InfoAndKind.IsARM64
+ ? InfoAndKind.Info.ARM64.Cmp->getOperand(0).getValueType()
+ : InfoAndKind.Info.Generic.Opnd0->getValueType();
+ if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
+ return SDValue();
+
+ SDValue CCVal;
+ SDValue Cmp;
+ SDLoc dl(Op);
+ if (InfoAndKind.IsARM64) {
+ CCVal = DAG.getConstant(
+ ARM64CC::getInvertedCondCode(InfoAndKind.Info.ARM64.CC), MVT::i32);
+ Cmp = *InfoAndKind.Info.ARM64.Cmp;
+ } else
+ Cmp = getARM64Cmp(*InfoAndKind.Info.Generic.Opnd0,
+ *InfoAndKind.Info.Generic.Opnd1,
+ ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
+ CCVal, DAG, dl);
+
+ EVT VT = Op->getValueType(0);
+ LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT));
+ return DAG.getNode(ARM64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
+}
+
+// The basic add/sub long vector instructions have variants with "2" on the end
+// which act on the high-half of their inputs. They are normally matched by
+// patterns like:
+//
+// (add (zeroext (extract_high LHS)),
+// (zeroext (extract_high RHS)))
+// -> uaddl2 vD, vN, vM
+//
+// However, if one of the extracts is something like a duplicate, this
+// instruction can still be used profitably. This function puts the DAG into a
+// more appropriate form for those patterns to trigger.
+static SDValue performAddSubLongCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
+ if (!VT.is128BitVector()) {
+ if (N->getOpcode() == ISD::ADD)
+ return performSetccAddFolding(N, DAG);
+ return SDValue();
+ }
+
+ // Make sure both branches are extended in the same way.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
+ LHS.getOpcode() != ISD::SIGN_EXTEND) ||
+ LHS.getOpcode() != RHS.getOpcode())
+ return SDValue();
+
+ unsigned ExtType = LHS.getOpcode();
+
+ // It's only worth doing if at least one of the inputs is already an
+ // extract, but we don't know which one it will be, so we have to try both.
+ if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
+ RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
+ if (!RHS.getNode())
+ return SDValue();
+
+ RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
+ } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
+ LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
+ if (!LHS.getNode())
+ return SDValue();
+
+ LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
+ }
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
+}
+
+// Massage DAGs which we can use the high-half "long" operations on into
+// something isel will recognize better. E.g.
+//
+//   (arm64_neon_umull (extract_high vec) (dupv64 scalar)) -->
+//   (arm64_neon_umull (extract_high (v2i64 vec))
+//                     (extract_high (v2i64 (dup128 scalar))))
+//
+static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ assert(LHS.getValueType().is64BitVector() &&
+ RHS.getValueType().is64BitVector() &&
+ "unexpected shape for long operation");
+
+ // Either node could be a DUP, but it's not worth doing both of them (you
+ // might just as well use the non-high version), so look for a corresponding
+ // extract operation on the other "wing".
+ if (isEssentiallyExtractSubvector(LHS)) {
+ RHS = tryExtendDUPToExtractHigh(RHS, DAG);
+ if (!RHS.getNode())
+ return SDValue();
+ } else if (isEssentiallyExtractSubvector(RHS)) {
+ LHS = tryExtendDUPToExtractHigh(LHS, DAG);
+ if (!LHS.getNode())
+ return SDValue();
+ }
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), LHS, RHS);
+}
+
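+// Try to rewrite a NEON shift intrinsic whose shift amount is a constant (or a
+// constant splat) as the corresponding immediate-shift target node.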
+static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
+ MVT ElemTy = N->getSimpleValueType(0).getScalarType();
+ unsigned ElemBits = ElemTy.getSizeInBits();
+
+ int64_t ShiftAmount;
+ if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+ HasAnyUndefs, ElemBits) ||
+ SplatBitSize != ElemBits)
+ return SDValue();
+
+ ShiftAmount = SplatValue.getSExtValue();
+ } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
+ ShiftAmount = CVN->getSExtValue();
+ } else
+ return SDValue();
+
+ unsigned Opcode;
+ bool IsRightShift;
+ switch (IID) {
+ default:
+ llvm_unreachable("Unknown shift intrinsic");
+ case Intrinsic::arm64_neon_sqshl:
+ Opcode = ARM64ISD::SQSHL_I;
+ IsRightShift = false;
+ break;
+ case Intrinsic::arm64_neon_uqshl:
+ Opcode = ARM64ISD::UQSHL_I;
+ IsRightShift = false;
+ break;
+ case Intrinsic::arm64_neon_srshl:
+ Opcode = ARM64ISD::SRSHR_I;
+ IsRightShift = true;
+ break;
+ case Intrinsic::arm64_neon_urshl:
+ Opcode = ARM64ISD::URSHR_I;
+ IsRightShift = true;
+ break;
+ case Intrinsic::arm64_neon_sqshlu:
+ Opcode = ARM64ISD::SQSHLU_I;
+ IsRightShift = false;
+ break;
+ }
+
+ if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits)
+ return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
+ DAG.getConstant(-ShiftAmount, MVT::i32));
+ else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits)
+ return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
+ DAG.getConstant(ShiftAmount, MVT::i32));
+
+ return SDValue();
+}
+
+// The CRC32[BH] instructions ignore the high bits of their data operand. Since
+// the intrinsics must be legal and take an i32, this means there's almost
+// certainly going to be a zext in the DAG which we can eliminate.
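+//
+// For example (a sketch): (arm64_crc32b crc, (and data, 0xff)) can simply
+// become (arm64_crc32b crc, data), since the instruction only reads the low
+// 8 bits of its data operand anyway.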
+static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
+ SDValue AndN = N->getOperand(2);
+ if (AndN.getOpcode() != ISD::AND)
+ return SDValue();
+
+ ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
+ if (!CMask || CMask->getZExtValue() != Mask)
+ return SDValue();
+
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
+ N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
+}
+
+static SDValue performIntrinsicCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARM64Subtarget *Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+ unsigned IID = getIntrinsicID(N);
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::arm64_neon_vcvtfxs2fp:
+ case Intrinsic::arm64_neon_vcvtfxu2fp:
+ return tryCombineFixedPointConvert(N, DCI, DAG);
+ break;
+ case Intrinsic::arm64_neon_fmax:
+ return DAG.getNode(ARM64ISD::FMAX, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::arm64_neon_fmin:
+ return DAG.getNode(ARM64ISD::FMIN, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::arm64_neon_smull:
+ case Intrinsic::arm64_neon_umull:
+ case Intrinsic::arm64_neon_pmull:
+ case Intrinsic::arm64_neon_sqdmull:
+ return tryCombineLongOpWithDup(IID, N, DCI, DAG);
+ case Intrinsic::arm64_neon_sqshl:
+ case Intrinsic::arm64_neon_uqshl:
+ case Intrinsic::arm64_neon_sqshlu:
+ case Intrinsic::arm64_neon_srshl:
+ case Intrinsic::arm64_neon_urshl:
+ return tryCombineShiftImm(IID, N, DAG);
+ case Intrinsic::arm64_crc32b:
+ case Intrinsic::arm64_crc32cb:
+ return tryCombineCRC32(0xff, N, DAG);
+ case Intrinsic::arm64_crc32h:
+ case Intrinsic::arm64_crc32ch:
+ return tryCombineCRC32(0xffff, N, DAG);
+ }
+ return SDValue();
+}
+
+static SDValue performExtendCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
+ // we can convert that DUP into another extract_high (of a bigger DUP), which
+ // helps the backend to decide that an sabdl2 would be useful, saving a real
+ // extract_high operation.
+ if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
+ N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+ SDNode *ABDNode = N->getOperand(0).getNode();
+ unsigned IID = getIntrinsicID(ABDNode);
+ if (IID == Intrinsic::arm64_neon_sabd ||
+ IID == Intrinsic::arm64_neon_uabd) {
+ SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
+ if (!NewABD.getNode())
+ return SDValue();
+
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
+ NewABD);
+ }
+ }
+
+ // This is effectively a custom type legalization for ARM64.
+ //
+ // Type legalization will split an extend of a small, legal, type to a larger
+ // illegal type by first splitting the destination type, often creating
+ // illegal source types, which then get legalized in isel-confusing ways,
+ // leading to really terrible codegen. E.g.,
+ // %result = v8i32 sext v8i8 %value
+ // becomes
+ // %losrc = extract_subreg %value, ...
+ // %hisrc = extract_subreg %value, ...
+ // %lo = v4i32 sext v4i8 %losrc
+ // %hi = v4i32 sext v4i8 %hisrc
+ // Things go rapidly downhill from there.
+ //
+ // For ARM64, the [sz]ext vector instructions can only go up one element
+ // size, so we can, e.g., extend from i8 to i16, but going from i8 to i32
+ // takes two instructions.
+ //
+ // This implies that the most efficient way to do the extend from v8i8
+ // to two v4i32 values is to first extend the v8i8 to v8i16, then let
+ // the normal splitting happen for the v8i16->v8i32.
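+ //
+ // That is, we aim for something like this (a sketch):
+ //   %ext    = v8i16 sext v8i8 %value
+ //   %lo     = v4i32 sext (v4i16 extract_subvector %ext, 0)
+ //   %hi     = v4i32 sext (v4i16 extract_subvector %ext, 4)
+ //   %result = v8i32 concat_vectors %lo, %hi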
+
+ // This is pre-legalization to catch some cases where the default
+ // type legalization will create ill-tempered code.
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // We're only interested in cleaning things up for non-legal vector types
+ // here. If both the source and destination are legal, things will just
+ // work naturally without any fiddling.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT ResVT = N->getValueType(0);
+ if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
+ return SDValue();
+ // If the vector type isn't a simple VT, it's beyond the scope of what
+ // we're worried about here. Let legalization do its thing and hope for
+ // the best.
+ if (!ResVT.isSimple())
+ return SDValue();
+
+ SDValue Src = N->getOperand(0);
+ MVT SrcVT = Src->getValueType(0).getSimpleVT();
+ // If the source VT is a 64-bit vector, we can play games and get the
+ // better results we want.
+ if (SrcVT.getSizeInBits() != 64)
+ return SDValue();
+
+ unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
+ unsigned ElementCount = SrcVT.getVectorNumElements();
+ SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
+ SDLoc DL(N);
+ Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
+
+ // Now split the rest of the operation into two halves, each with a 64
+ // bit source.
+ EVT LoVT, HiVT;
+ SDValue Lo, Hi;
+ unsigned NumElements = ResVT.getVectorNumElements();
+ assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+ LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
+ ResVT.getVectorElementType(), NumElements / 2);
+
+ EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
+ LoVT.getVectorNumElements());
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
+ DAG.getIntPtrConstant(0));
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
+ DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+ Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
+ Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
+
+ // Now combine the parts back together so we still have a single result
+ // like the combiner expects.
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
+}
+
+/// Replace a store of a vector splat of a scalar with scalar stores of the
+/// scalar value. The load/store optimizer pass will merge them into store-pair
+/// (stp) instructions. This has better performance than a splat of the scalar
+/// followed by a split vector store. Even if the stores are not merged, it is
+/// four stores vs. a dup followed by an ext.b and two stores.
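+///
+/// For example (a sketch, not exact output): a store of a v4i32 splat of w1 to
+/// [x0] becomes four scalar stores of w1 at [x0], [x0, #4], [x0, #8] and
+/// [x0, #12], which are then typically merged into two stp instructions.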
+static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
+ SDValue StVal = St->getValue();
+ EVT VT = StVal.getValueType();
+
+ // Don't replace floating point stores; they possibly won't be transformed to
+ // stp because of the store pair suppress pass.
+ if (VT.isFloatingPoint())
+ return SDValue();
+
+ // Check for insert vector elements.
+ if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
+ return SDValue();
+
+ // We can express a splat as store pair(s) for 2 or 4 elements.
+ unsigned NumVecElts = VT.getVectorNumElements();
+ if (NumVecElts != 4 && NumVecElts != 2)
+ return SDValue();
+ SDValue SplatVal = StVal.getOperand(1);
+ unsigned RemainInsertElts = NumVecElts - 1;
+
+ // Check that this is a splat.
+ while (--RemainInsertElts) {
+ SDValue NextInsertElt = StVal.getOperand(0);
+ if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
+ return SDValue();
+ if (NextInsertElt.getOperand(1) != SplatVal)
+ return SDValue();
+ StVal = NextInsertElt;
+ }
+ unsigned OrigAlignment = St->getAlignment();
+ unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
+ unsigned Alignment = std::min(OrigAlignment, EltOffset);
+
+ // Create scalar stores. This is at least as good as the code sequence for a
+ // split unaligned store which is a dup.s, ext.b, and two stores.
+ // Most of the time the three stores should be replaced by store pair
+ // instructions (stp).
+ SDLoc DL(St);
+ SDValue BasePtr = St->getBasePtr();
+ SDValue NewST1 =
+ DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
+ St->isVolatile(), St->isNonTemporal(), St->getAlignment());
+
+ unsigned Offset = EltOffset;
+ while (--NumVecElts) {
+ SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(Offset, MVT::i64));
+ NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), Alignment);
+ Offset += EltOffset;
+ }
+ return NewST1;
+}
+
+static SDValue performSTORECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const ARM64Subtarget *Subtarget) {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ StoreSDNode *S = cast<StoreSDNode>(N);
+ if (S->isVolatile())
+ return SDValue();
+
+ // Cyclone has bad performance on unaligned 16B stores when crossing
+ // cache-line and page boundaries. We want to split such stores.
+ if (!Subtarget->isCyclone())
+ return SDValue();
+
+ // Don't split at -Oz (when optimizing for minimum size).
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::MinSize);
+ if (IsMinSize)
+ return SDValue();
+
+ SDValue StVal = S->getValue();
+ EVT VT = StVal.getValueType();
+
+ // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
+ // those up regresses performance on micro-benchmarks and olden/bh.
+ if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
+ return SDValue();
+
+ // Split unaligned 16B stores. They are terrible for performance.
+ // Don't split stores with alignment of 1 or 2. Code that uses clang vector
+ // extensions can use this to mark that it does not want splitting to happen
+ // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
+ // eliminating alignment hazards is only 1 in 8 for alignment of 2.
+ if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
+ S->getAlignment() <= 2)
+ return SDValue();
+
+ // If we get a splat of a scalar, convert this vector store to a store of
+ // scalars. They will be merged into store pairs, thereby removing two
+ // instructions.
+ SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S);
+ if (ReplacedSplat != SDValue())
+ return ReplacedSplat;
+
+ SDLoc DL(S);
+ unsigned NumElts = VT.getVectorNumElements() / 2;
+ // Split VT into two.
+ EVT HalfVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
+ SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+ DAG.getIntPtrConstant(0));
+ SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+ DAG.getIntPtrConstant(NumElts));
+ SDValue BasePtr = S->getBasePtr();
+ SDValue NewST1 =
+ DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
+ S->isVolatile(), S->isNonTemporal(), S->getAlignment());
+ SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(8, MVT::i64));
+ return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
+ S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
+ S->getAlignment());
+}
+
+// Optimize compare with zero and branch.
+static SDValue performBRCONDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Dest = N->getOperand(1);
+ SDValue CCVal = N->getOperand(2);
+ SDValue Cmp = N->getOperand(3);
+
+ assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
+ unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
+ if (CC != ARM64CC::EQ && CC != ARM64CC::NE)
+ return SDValue();
+
+ unsigned CmpOpc = Cmp.getOpcode();
+ if (CmpOpc != ARM64ISD::ADDS && CmpOpc != ARM64ISD::SUBS)
+ return SDValue();
+
+ // Only attempt folding if there is only one use of the flag and no use of the
+ // value.
+ if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
+ return SDValue();
+
+ SDValue LHS = Cmp.getOperand(0);
+ SDValue RHS = Cmp.getOperand(1);
+
+ assert(LHS.getValueType() == RHS.getValueType() &&
+ "Expected the value type to be the same for both operands!");
+ if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
+ return SDValue();
+
+ if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue())
+ std::swap(LHS, RHS);
+
+ if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue())
+ return SDValue();
+
+ if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
+ LHS.getOpcode() == ISD::SRL)
+ return SDValue();
+
+ // Fold the compare into the branch instruction.
+ SDValue BR;
+ if (CC == ARM64CC::EQ)
+ BR = DAG.getNode(ARM64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
+ else
+ BR = DAG.getNode(ARM64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
+
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, BR, false);
+
+ return SDValue();
+}
+
+SDValue ARM64TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ switch (N->getOpcode()) {
+ default:
+ break;
+ case ISD::ADD:
+ case ISD::SUB:
+ return performAddSubLongCombine(N, DCI, DAG);
+ case ISD::XOR:
+ return performXorCombine(N, DAG, DCI, Subtarget);
+ case ISD::MUL:
+ return performMulCombine(N, DAG, DCI, Subtarget);
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ return performIntToFpCombine(N, DAG);
+ case ISD::OR:
+ return performORCombine(N, DCI, Subtarget);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return performIntrinsicCombine(N, DCI, Subtarget);
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ return performExtendCombine(N, DCI, DAG);
+ case ISD::BITCAST:
+ return performBitcastCombine(N, DCI, DAG);
+ case ISD::CONCAT_VECTORS:
+ return performConcatVectorsCombine(N, DCI, DAG);
+ case ISD::STORE:
+ return performSTORECombine(N, DCI, DAG, Subtarget);
+ case ARM64ISD::BRCOND:
+ return performBRCONDCombine(N, DCI, DAG);
+ }
+ return SDValue();
+}
+
+// Check whether the return value is only used as a return value, as otherwise
+// we can't perform a tail-call. In particular, we need to check for
+// target ISD nodes that are returns and any other "odd" constructs
+// that the generic analysis code won't necessarily catch.
+bool ARM64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
+ if (N->getNumValues() != 1)
+ return false;
+ if (!N->hasNUsesOfValue(1, 0))
+ return false;
+
+ SDValue TCChain = Chain;
+ SDNode *Copy = *N->use_begin();
+ if (Copy->getOpcode() == ISD::CopyToReg) {
+ // If the copy has a glue operand, we conservatively assume it isn't safe to
+ // perform a tail call.
+ if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
+ MVT::Glue)
+ return false;
+ TCChain = Copy->getOperand(0);
+ } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+ return false;
+
+ bool HasRet = false;
+ for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+ UI != UE; ++UI) {
+ if (UI->getOpcode() != ARM64ISD::RET_FLAG)
+ return false;
+ HasRet = true;
+ }
+
+ if (!HasRet)
+ return false;
+
+ Chain = TCChain;
+ return true;
+}
+
+// Return whether an instruction can potentially be optimized to a tail
+// call. This will cause the optimizers to attempt to move, or duplicate,
+// return instructions to help enable tail call optimizations for this
+// instruction.
+bool ARM64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+ if (!EnableARM64TailCalls)
+ return false;
+
+ if (!CI->isTailCall())
+ return false;
+
+ return true;
+}
+
+bool ARM64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ bool &IsInc,
+ SelectionDAG &DAG) const {
+ if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
+ return false;
+
+ Base = Op->getOperand(0);
+ // All of the indexed addressing mode instructions take a signed
+ // 9 bit immediate offset.
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+ int64_t RHSC = (int64_t)RHS->getZExtValue();
+ if (RHSC >= 256 || RHSC <= -256)
+ return false;
+ IsInc = (Op->getOpcode() == ISD::ADD);
+ Offset = Op->getOperand(1);
+ return true;
+ }
+ return false;
+}
+
+bool ARM64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const {
+ EVT VT;
+ SDValue Ptr;
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ VT = LD->getMemoryVT();
+ Ptr = LD->getBasePtr();
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ VT = ST->getMemoryVT();
+ Ptr = ST->getBasePtr();
+ } else
+ return false;
+
+ bool IsInc;
+ if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
+ return false;
+ AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
+ return true;
+}
+
+bool ARM64TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+ SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const {
+ EVT VT;
+ SDValue Ptr;
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ VT = LD->getMemoryVT();
+ Ptr = LD->getBasePtr();
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ VT = ST->getMemoryVT();
+ Ptr = ST->getBasePtr();
+ } else
+ return false;
+
+ bool IsInc;
+ if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
+ return false;
+ // Post-indexing updates the base, so it's not a valid transform
+ // if that's not the same as the load's pointer.
+ if (Ptr != Base)
+ return false;
+ AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
+ return true;
+}
+
+/// The only 128-bit atomic operation is an stxp that succeeds. In particular
+/// neither ldp nor ldxp are atomic. So the canonical sequence for an atomic
+/// load is:
+/// loop:
+/// ldxp x0, x1, [x8]
+/// stxp w2, x0, x1, [x8]
+/// cbnz w2, loop
+/// If the stxp succeeds then the ldxp managed to get both halves without an
+/// intervening stxp from a different thread and the read was atomic.
+static void ReplaceATOMIC_LOAD_128(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
+ SDLoc DL(N);
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ EVT VT = AN->getMemoryVT();
+ SDValue Zero = DAG.getConstant(0, VT);
+
+ // FIXME: Really want ATOMIC_LOAD_NOP but that doesn't fit into the existing
+ // scheme very well. Given the complexity of what we're already generating, an
+ // extra couple of ORRs probably won't make much difference.
+ SDValue Result = DAG.getAtomic(ISD::ATOMIC_LOAD_OR, DL, AN->getMemoryVT(),
+ N->getOperand(0), N->getOperand(1), Zero,
+ AN->getMemOperand(), AN->getOrdering(),
+ AN->getSynchScope());
+
+ Results.push_back(Result.getValue(0)); // Value
+ Results.push_back(Result.getValue(1)); // Chain
+}
+
+static void ReplaceATOMIC_OP_128(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG, unsigned NewOp) {
+ SDLoc DL(N);
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ assert(N->getValueType(0) == MVT::i128 &&
+ "Only know how to expand i128 atomics");
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(N->getOperand(1)); // Ptr
+ // Low part of Val1
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
+ N->getOperand(2), DAG.getIntPtrConstant(0)));
+ // High part of Val1
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
+ N->getOperand(2), DAG.getIntPtrConstant(1)));
+ if (NewOp == ARM64::ATOMIC_CMP_SWAP_I128) {
+ // Low part of Val2
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
+ N->getOperand(3), DAG.getIntPtrConstant(0)));
+ // High part of Val2
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
+ N->getOperand(3), DAG.getIntPtrConstant(1)));
+ }
+
+ Ops.push_back(DAG.getTargetConstant(Ordering, MVT::i32));
+ Ops.push_back(N->getOperand(0)); // Chain
+
+ SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
+ SDNode *Result = DAG.getMachineNode(NewOp, DL, Tys, Ops);
+ SDValue OpsF[] = { SDValue(Result, 0), SDValue(Result, 1) };
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, OpsF, 2));
+ Results.push_back(SDValue(Result, 2));
+}
+
+void ARM64TargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Don't know how to custom expand this");
+ case ISD::ATOMIC_LOAD:
+ ReplaceATOMIC_LOAD_128(N, Results, DAG);
+ return;
+ case ISD::ATOMIC_LOAD_ADD:
+ ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_ADD_I128);
+ return;
+ case ISD::ATOMIC_LOAD_SUB:
+ ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_SUB_I128);
+ return;
+ case ISD::ATOMIC_LOAD_AND:
+ ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_AND_I128);
+ return;
+ case ISD::ATOMIC_LOAD_OR:
+ ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_OR_I128);
+ return;
+ case ISD::ATOMIC_LOAD_XOR:
+ ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_XOR_I128);
+ return;
+ case ISD::ATOMIC_LOAD_NAND:
+ ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_NAND_I128);
+ return;
+ case ISD::ATOMIC_SWAP:
+ ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_SWAP_I128);
+ return;
+ case ISD::ATOMIC_LOAD_MIN:
+ ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_MIN_I128);
+ return;
+ case ISD::ATOMIC_LOAD_MAX:
+ ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_MAX_I128);
+ return;
+ case ISD::ATOMIC_LOAD_UMIN:
+ ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_UMIN_I128);
+ return;
+ case ISD::ATOMIC_LOAD_UMAX:
+ ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_UMAX_I128);
+ return;
+ case ISD::ATOMIC_CMP_SWAP:
+ ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_CMP_SWAP_I128);
+ return;
+ case ISD::FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
+ // Let normal code take care of it by not adding anything to Results.
+ return;
+ }
+}
diff --git a/lib/Target/ARM64/ARM64ISelLowering.h b/lib/Target/ARM64/ARM64ISelLowering.h
new file mode 100644
index 0000000..a4664ac
--- /dev/null
+++ b/lib/Target/ARM64/ARM64ISelLowering.h
@@ -0,0 +1,422 @@
+//==-- ARM64ISelLowering.h - ARM64 DAG Lowering Interface --------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM64 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM64_ISELLOWERING_H
+#define LLVM_TARGET_ARM64_ISELLOWERING_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+namespace ARM64ISD {
+
+enum {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
+ CALL, // Function call.
+
+ // Almost the same as a normal call node, except that a TLSDesc relocation is
+ // needed so the linker can relax it correctly if possible.
+ TLSDESC_CALL,
+ ADRP, // Page address of a TargetGlobalAddress operand.
+ ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand.
+ LOADgot, // Load from automatically generated descriptor (e.g. Global
+ // Offset Table, TLS record).
+ RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand.
+ BRCOND, // Conditional branch instruction; "b.cond".
+ CSEL,
+ FCSEL, // Conditional move instruction.
+ CSINV, // Conditional select invert.
+ CSNEG, // Conditional select negate.
+ CSINC, // Conditional select increment.
+
+ // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on
+ // ELF.
+ THREAD_POINTER,
+ ADC,
+ SBC, // adc, sbc instructions
+
+ // Arithmetic instructions which write flags.
+ ADDS,
+ SUBS,
+ ADCS,
+ SBCS,
+ ANDS,
+
+ // Floating point comparison
+ FCMP,
+
+ // Floating point max and min instructions.
+ FMAX,
+ FMIN,
+
+ // Scalar extract
+ EXTR,
+
+ // Scalar-to-vector duplication
+ DUP,
+ DUPLANE8,
+ DUPLANE16,
+ DUPLANE32,
+ DUPLANE64,
+
+ // Vector immediate moves
+ MOVI,
+ MOVIshift,
+ MOVIedit,
+ MOVImsl,
+ FMOV,
+ MVNIshift,
+ MVNImsl,
+
+ // Vector immediate ops
+ BICi,
+ ORRi,
+
+ // Vector arithmetic negation
+ NEG,
+
+ // Vector shuffles
+ ZIP1,
+ ZIP2,
+ UZP1,
+ UZP2,
+ TRN1,
+ TRN2,
+ REV16,
+ REV32,
+ REV64,
+ EXT,
+
+ // Vector shift by scalar
+ VSHL,
+ VLSHR,
+ VASHR,
+
+ // Vector shift by scalar (again)
+ SQSHL_I,
+ UQSHL_I,
+ SQSHLU_I,
+ SRSHR_I,
+ URSHR_I,
+
+ // Vector comparisons
+ CMEQ,
+ CMGE,
+ CMGT,
+ CMHI,
+ CMHS,
+ FCMEQ,
+ FCMGE,
+ FCMGT,
+
+ // Vector zero comparisons
+ CMEQz,
+ CMGEz,
+ CMGTz,
+ CMLEz,
+ CMLTz,
+ FCMEQz,
+ FCMGEz,
+ FCMGTz,
+ FCMLEz,
+ FCMLTz,
+
+ // Vector bitwise negation
+ NOT,
+
+ // Vector bitwise selection
+ BIT,
+
+ // Compare-and-branch
+ CBZ,
+ CBNZ,
+ TBZ,
+ TBNZ,
+
+ // Tail calls
+ TC_RETURN,
+
+ // Custom prefetch handling
+ PREFETCH,
+
+ // {s|u}int to FP within a FP register.
+ SITOF,
+ UITOF
+};
+
+} // end namespace ARM64ISD
+
+class ARM64Subtarget;
+class ARM64TargetMachine;
+
+class ARM64TargetLowering : public TargetLowering {
+ bool RequireStrictAlign;
+
+public:
+ explicit ARM64TargetLowering(ARM64TargetMachine &TM);
+
+ /// Selects the correct CCAssignFn for the given CallingConvention
+ /// value.
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
+
+ /// computeMaskedBitsForTargetNode - Determine which of the bits specified in
+ /// Mask are known to be either zero or one and return them in the
+ /// KnownZero/KnownOne bitsets.
+ void computeMaskedBitsForTargetNode(const SDValue Op, APInt &KnownZero,
+ APInt &KnownOne, const SelectionDAG &DAG,
+ unsigned Depth = 0) const;
+
+ MVT getScalarShiftAmountTy(EVT LHSTy) const override;
+
+ /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+ /// unaligned memory accesses of the specified type.
+ bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
+ bool *Fast = 0) const override {
+ if (RequireStrictAlign)
+ return false;
+ // FIXME: True for Cyclone, but not necessarily for others.
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ /// getFunctionAlignment - Return the Log2 alignment of this function.
+ unsigned getFunctionAlignment(const Function *F) const;
+
+ /// getMaximalGlobalOffset - Returns the maximal possible offset which can
+ /// be used for loads / stores from the global.
+ unsigned getMaximalGlobalOffset() const override;
+
+ /// Returns true if a cast between SrcAS and DestAS is a noop.
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+ // Addrspacecasts are always noops.
+ return true;
+ }
+
+ /// createFastISel - This method returns a target specific FastISel object,
+ /// or null if the target does not support "fast" ISel.
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+ /// isShuffleMaskLegal - Return true if the given shuffle mask can be
+ /// codegen'd directly, or if it should be stack expanded.
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
+
+ /// getSetCCResultType - Return the ISD::SETCC ValueType
+ EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+
+ SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
+
+ MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+ unsigned Size, unsigned BinOpcode) const;
+ MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB,
+ unsigned Size) const;
+ MachineBasicBlock *EmitAtomicBinary128(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned BinOpcodeLo,
+ unsigned BinOpcodeHi) const;
+ MachineBasicBlock *EmitAtomicCmpSwap128(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitAtomicMinMax128(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned CondCode) const;
+ MachineBasicBlock *EmitF128CSEL(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const override;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ unsigned Intrinsic) const override;
+
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+ bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+ bool isZExtFree(EVT VT1, EVT VT2) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+ bool hasPairedLoad(Type *LoadedType,
+ unsigned &RequiredAligment) const override;
+ bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
+
+ bool isLegalAddImmediate(int64_t) const override;
+ bool isLegalICmpImmediate(int64_t) const override;
+
+ EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+
+ /// \brief Return the cost of the scaling factor used in the addressing
+ /// mode represented by AM for this target, for a load/store
+ /// of the specified type.
+ /// If the AM is supported, the return value must be >= 0.
+ /// If the AM is not supported, it returns a negative value.
+ int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override;
+
+ /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+ /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+ /// expanded to FMAs when this method returns true, otherwise fmuladd is
+ /// expanded to fmul + fadd.
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+ const uint16_t *getScratchRegisters(CallingConv::ID CC) const override;
+
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
+
+private:
+ /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const ARM64Subtarget *Subtarget;
+
+ void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT);
+ void addDRTypeForNEON(MVT VT);
+ void addQRTypeForNEON(MVT VT);
+
+ SDValue
+ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerCall(CallLoweringInfo & /*CLI*/,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ bool isThisReturn, SDValue ThisVal) const;
+
+ bool isEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ bool isCalleeStructRet, bool isCallerStructRet,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+
+ void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
+ SDValue &Chain) const;
+
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
+ SelectionDAG &DAG) const override;
+
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL,
+ SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
+ RTLIB::Libcall Call) const;
+ SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+
+ /// Examine constraint string and operand type and determine a weight value.
+ /// The operand object must already have been set up with the operand type.
+ ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const;
+
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const;
+
+ bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
+ bool mayBeEmittedAsTailCall(CallInst *CI) const;
+ bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM, bool &IsInc,
+ SelectionDAG &DAG) const;
+ bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const;
+ bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+ SDValue &Offset, ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const;
+
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
+};
+
+namespace ARM64 {
+FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
+} // end namespace ARM64
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_ARM64_ISELLOWERING_H
diff --git a/lib/Target/ARM64/ARM64InstrAtomics.td b/lib/Target/ARM64/ARM64InstrAtomics.td
new file mode 100644
index 0000000..0d36e06
--- /dev/null
+++ b/lib/Target/ARM64/ARM64InstrAtomics.td
@@ -0,0 +1,293 @@
+//===- ARM64InstrAtomics.td - ARM64 Atomic codegen support -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// ARM64 Atomic operand code-gen constructs.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------
+// Atomic fences
+//===----------------------------------
+def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>;
+def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
+
+//===----------------------------------
+// Atomic loads
+//===----------------------------------
+
+// When they're actually atomic, only one addressing mode (GPR64sp) is
+// supported, but when they're relaxed and anything can be used, all the
+// standard modes would be valid and may give efficiency gains.
+
+// An atomic load operation that actually needs acquire semantics.
+class acquiring_load<PatFrag base>
+ : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ assert(Ordering != AcquireRelease && "unexpected load ordering");
+ return Ordering == Acquire || Ordering == SequentiallyConsistent;
+}]>;
+
+// An atomic load operation that does not need either acquire or release
+// semantics.
+class relaxed_load<PatFrag base>
+ : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ return Ordering == Monotonic || Ordering == Unordered;
+}]>;
+
+// 8-bit loads
+def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_8> ro_indexed8:$addr),
+ (LDRBBro ro_indexed8:$addr)>;
+def : Pat<(relaxed_load<atomic_load_8> am_indexed8:$addr),
+ (LDRBBui am_indexed8:$addr)>;
+def : Pat<(relaxed_load<atomic_load_8> am_unscaled8:$addr),
+ (LDURBBi am_unscaled8:$addr)>;
+
+// 16-bit loads
+def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_16> ro_indexed16:$addr),
+ (LDRHHro ro_indexed16:$addr)>;
+def : Pat<(relaxed_load<atomic_load_16> am_indexed16:$addr),
+ (LDRHHui am_indexed16:$addr)>;
+def : Pat<(relaxed_load<atomic_load_16> am_unscaled16:$addr),
+ (LDURHHi am_unscaled16:$addr)>;
+
+// 32-bit loads
+def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_32> ro_indexed32:$addr),
+ (LDRWro ro_indexed32:$addr)>;
+def : Pat<(relaxed_load<atomic_load_32> am_indexed32:$addr),
+ (LDRWui am_indexed32:$addr)>;
+def : Pat<(relaxed_load<atomic_load_32> am_unscaled32:$addr),
+ (LDURWi am_unscaled32:$addr)>;
+
+// 64-bit loads
+def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_64> ro_indexed64:$addr),
+ (LDRXro ro_indexed64:$addr)>;
+def : Pat<(relaxed_load<atomic_load_64> am_indexed64:$addr),
+ (LDRXui am_indexed64:$addr)>;
+def : Pat<(relaxed_load<atomic_load_64> am_unscaled64:$addr),
+ (LDURXi am_unscaled64:$addr)>;
+
+//===----------------------------------
+// Atomic stores
+//===----------------------------------
+
+// When they're actually atomic, only one addressing mode (GPR64sp) is
+// supported, but when they're relaxed and anything can be used, all the
+// standard modes would be valid and may give efficiency gains.
+
+// A store operation that actually needs release semantics.
+class releasing_store<PatFrag base>
+ : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ assert(Ordering != AcquireRelease && "unexpected store ordering");
+ return Ordering == Release || Ordering == SequentiallyConsistent;
+}]>;
+
+// An atomic store operation that doesn't actually need to be atomic on ARM64.
+class relaxed_store<PatFrag base>
+ : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ return Ordering == Monotonic || Ordering == Unordered;
+}]>;
+
+// 8-bit stores
+def : Pat<(releasing_store<atomic_store_8> GPR64sp:$ptr, GPR32:$val),
+ (STLRB GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_8> ro_indexed8:$ptr, GPR32:$val),
+ (STRBBro GPR32:$val, ro_indexed8:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_8> am_indexed8:$ptr, GPR32:$val),
+ (STRBBui GPR32:$val, am_indexed8:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_8> am_unscaled8:$ptr, GPR32:$val),
+ (STURBBi GPR32:$val, am_unscaled8:$ptr)>;
+
+// 16-bit stores
+def : Pat<(releasing_store<atomic_store_16> GPR64sp:$ptr, GPR32:$val),
+ (STLRH GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_16> ro_indexed16:$ptr, GPR32:$val),
+ (STRHHro GPR32:$val, ro_indexed16:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_16> am_indexed16:$ptr, GPR32:$val),
+ (STRHHui GPR32:$val, am_indexed16:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_16> am_unscaled16:$ptr, GPR32:$val),
+ (STURHHi GPR32:$val, am_unscaled16:$ptr)>;
+
+// 32-bit stores
+def : Pat<(releasing_store<atomic_store_32> GPR64sp:$ptr, GPR32:$val),
+ (STLRW GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_32> ro_indexed32:$ptr, GPR32:$val),
+ (STRWro GPR32:$val, ro_indexed32:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_32> am_indexed32:$ptr, GPR32:$val),
+ (STRWui GPR32:$val, am_indexed32:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_32> am_unscaled32:$ptr, GPR32:$val),
+ (STURWi GPR32:$val, am_unscaled32:$ptr)>;
+
+// 64-bit stores
+def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val),
+ (STLRX GPR64:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_64> ro_indexed64:$ptr, GPR64:$val),
+ (STRXro GPR64:$val, ro_indexed64:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_64> am_indexed64:$ptr, GPR64:$val),
+ (STRXui GPR64:$val, am_indexed64:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_64> am_unscaled64:$ptr, GPR64:$val),
+ (STURXi GPR64:$val, am_unscaled64:$ptr)>;
+
+//===----------------------------------
+// Atomic read-modify-write operations
+//===----------------------------------
+
+// More complicated operations need lots of C++ support, so we just create
+// skeletons here for the C++ code to refer to.
+
+let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1 in {
+multiclass AtomicSizes {
+ def _I8 : Pseudo<(outs GPR32:$dst),
+ (ins GPR64sp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
+ def _I16 : Pseudo<(outs GPR32:$dst),
+ (ins GPR64sp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
+ def _I32 : Pseudo<(outs GPR32:$dst),
+ (ins GPR64sp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
+ def _I64 : Pseudo<(outs GPR64:$dst),
+ (ins GPR64sp:$ptr, GPR64:$incr, i32imm:$ordering), []>;
+ def _I128 : Pseudo<(outs GPR64:$dstlo, GPR64:$dsthi),
+ (ins GPR64sp:$ptr, GPR64:$incrlo, GPR64:$incrhi,
+ i32imm:$ordering), []>;
+}
+}
+
+defm ATOMIC_LOAD_ADD : AtomicSizes;
+defm ATOMIC_LOAD_SUB : AtomicSizes;
+defm ATOMIC_LOAD_AND : AtomicSizes;
+defm ATOMIC_LOAD_OR : AtomicSizes;
+defm ATOMIC_LOAD_XOR : AtomicSizes;
+defm ATOMIC_LOAD_NAND : AtomicSizes;
+defm ATOMIC_SWAP : AtomicSizes;
+let Defs = [CPSR] in {
+ // These operations need a CMP to calculate the correct value
+ defm ATOMIC_LOAD_MIN : AtomicSizes;
+ defm ATOMIC_LOAD_MAX : AtomicSizes;
+ defm ATOMIC_LOAD_UMIN : AtomicSizes;
+ defm ATOMIC_LOAD_UMAX : AtomicSizes;
+}
+
+class AtomicCmpSwap<RegisterClass GPRData>
+ : Pseudo<(outs GPRData:$dst),
+ (ins GPR64sp:$ptr, GPRData:$old, GPRData:$new,
+ i32imm:$ordering), []> {
+ let usesCustomInserter = 1;
+ let hasCtrlDep = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let Defs = [CPSR];
+}
+
+def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<GPR32>;
+def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<GPR32>;
+def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<GPR32>;
+def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<GPR64>;
+
+def ATOMIC_CMP_SWAP_I128
+ : Pseudo<(outs GPR64:$dstlo, GPR64:$dsthi),
+ (ins GPR64sp:$ptr, GPR64:$oldlo, GPR64:$oldhi,
+ GPR64:$newlo, GPR64:$newhi, i32imm:$ordering), []> {
+ let usesCustomInserter = 1;
+ let hasCtrlDep = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let Defs = [CPSR];
+}
+
+//===----------------------------------
+// Low-level exclusive operations
+//===----------------------------------
+
+// Load-exclusives.
+
+def ldxr_1 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldxr_2 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldxr_4 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldxr_8 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def : Pat<(ldxr_1 am_noindex:$addr),
+ (SUBREG_TO_REG (i64 0), (LDXRB am_noindex:$addr), sub_32)>;
+def : Pat<(ldxr_2 am_noindex:$addr),
+ (SUBREG_TO_REG (i64 0), (LDXRH am_noindex:$addr), sub_32)>;
+def : Pat<(ldxr_4 am_noindex:$addr),
+ (SUBREG_TO_REG (i64 0), (LDXRW am_noindex:$addr), sub_32)>;
+def : Pat<(ldxr_8 am_noindex:$addr), (LDXRX am_noindex:$addr)>;
+
+def : Pat<(and (ldxr_1 am_noindex:$addr), 0xff),
+ (SUBREG_TO_REG (i64 0), (LDXRB am_noindex:$addr), sub_32)>;
+def : Pat<(and (ldxr_2 am_noindex:$addr), 0xffff),
+ (SUBREG_TO_REG (i64 0), (LDXRH am_noindex:$addr), sub_32)>;
+def : Pat<(and (ldxr_4 am_noindex:$addr), 0xffffffff),
+ (SUBREG_TO_REG (i64 0), (LDXRW am_noindex:$addr), sub_32)>;
+
+// Store-exclusives.
+
+def stxr_1 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm64_stxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stxr_2 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm64_stxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stxr_4 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm64_stxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stxr_8 : PatFrag<(ops node:$val, node:$ptr),
+ (int_arm64_stxr node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def : Pat<(stxr_1 GPR64:$val, am_noindex:$addr),
+ (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
+def : Pat<(stxr_2 GPR64:$val, am_noindex:$addr),
+ (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
+def : Pat<(stxr_4 GPR64:$val, am_noindex:$addr),
+ (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
+def : Pat<(stxr_8 GPR64:$val, am_noindex:$addr),
+ (STXRX GPR64:$val, am_noindex:$addr)>;
+
+def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), am_noindex:$addr),
+ (STXRB GPR32:$val, am_noindex:$addr)>;
+def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), am_noindex:$addr),
+ (STXRH GPR32:$val, am_noindex:$addr)>;
+def : Pat<(stxr_4 (zext GPR32:$val), am_noindex:$addr),
+ (STXRW GPR32:$val, am_noindex:$addr)>;
+
+def : Pat<(stxr_1 (and GPR64:$val, 0xff), am_noindex:$addr),
+ (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
+def : Pat<(stxr_2 (and GPR64:$val, 0xffff), am_noindex:$addr),
+ (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
+def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), am_noindex:$addr),
+ (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
+
+
+// And clear exclusive.
+
+def : Pat<(int_arm64_clrex), (CLREX 0xf)>;
diff --git a/lib/Target/ARM64/ARM64InstrFormats.td b/lib/Target/ARM64/ARM64InstrFormats.td
new file mode 100644
index 0000000..38406f8
--- /dev/null
+++ b/lib/Target/ARM64/ARM64InstrFormats.td
@@ -0,0 +1,8193 @@
+//===- ARM64InstrFormats.td - ARM64 Instruction Formats ------*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe ARM64 instruction formats here
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<2> val> {
+ bits<2> Value = val;
+}
+
+def PseudoFrm : Format<0>;
+def NormalFrm : Format<1>; // Do we need any others?
+
+// ARM64 Instruction Format
+class ARM64Inst<Format f, string cstr> : Instruction {
+ field bits<32> Inst; // Instruction encoding.
+ // Mask of bits that cause an encoding to be UNPREDICTABLE.
+ // If a bit is set, then if the corresponding bit in the
+ // target encoding differs from its value in the "Inst" field,
+ // the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
+ field bits<32> Unpredictable = 0;
+ // SoftFail is the generic name for this field, but we alias it so
+ // as to make it more obvious what it means in ARM-land.
+ field bits<32> SoftFail = Unpredictable;
+ let Namespace = "ARM64";
+ Format F = f;
+ bits<2> Form = F.Value;
+ let Pattern = [];
+ let Constraints = cstr;
+}
+
+// Pseudo instructions (don't have encoding information)
+class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
+ : ARM64Inst<PseudoFrm, cstr> {
+ dag OutOperandList = oops;
+ dag InOperandList = iops;
+ let Pattern = pattern;
+ let isCodeGenOnly = 1;
+}
+
+// Real instructions (have encoding information)
+class EncodedI<string cstr, list<dag> pattern> : ARM64Inst<NormalFrm, cstr> {
+ let Pattern = pattern;
+ let Size = 4;
+}
+
+// Normal instructions
+class I<dag oops, dag iops, string asm, string operands, string cstr,
+ list<dag> pattern>
+ : EncodedI<cstr, pattern> {
+ dag OutOperandList = oops;
+ dag InOperandList = iops;
+ let AsmString = !strconcat(asm, operands);
+}
+
+class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>;
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag<dag res> : PatFrag<(ops node:$LHS), res>;
+
+// Helper fragment for an extract of the high portion of a 128-bit vector.
+def extract_high_v16i8 :
+ UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>;
+def extract_high_v8i16 :
+ UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>;
+def extract_high_v4i32 :
+ UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>;
+def extract_high_v2i64 :
+ UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>;
+
+//===----------------------------------------------------------------------===//
+// Asm Operand Classes.
+//
+
+// Shifter operand for arithmetic shifted encodings.
+def ShifterOperand : AsmOperandClass {
+ let Name = "Shifter";
+}
+
+// Shifter operand for mov immediate encodings.
+def MovImm32ShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "MovImm32Shifter";
+}
+def MovImm64ShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "MovImm64Shifter";
+}
+
+// Shifter operand for arithmetic register shifted encodings.
+def ArithmeticShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "ArithmeticShifter";
+}
+
+// Shifter operand for arithmetic shifted encodings for ADD/SUB instructions.
+def AddSubShifterOperand : AsmOperandClass {
+ let SuperClasses = [ArithmeticShifterOperand];
+ let Name = "AddSubShifter";
+}
+
+// Shifter operand for logical vector 128/64-bit shifted encodings.
+def LogicalVecShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "LogicalVecShifter";
+}
+def LogicalVecHalfWordShifterOperand : AsmOperandClass {
+ let SuperClasses = [LogicalVecShifterOperand];
+ let Name = "LogicalVecHalfWordShifter";
+}
+
+// The "MSL" shifter on the vector MOVI instruction.
+def MoveVecShifterOperand : AsmOperandClass {
+ let SuperClasses = [ShifterOperand];
+ let Name = "MoveVecShifter";
+}
+
+// Extend operand for arithmetic encodings.
+def ExtendOperand : AsmOperandClass { let Name = "Extend"; }
+def ExtendOperand64 : AsmOperandClass {
+ let SuperClasses = [ExtendOperand];
+ let Name = "Extend64";
+}
+// 'extend' that's a lsl of a 64-bit register.
+def ExtendOperandLSL64 : AsmOperandClass {
+ let SuperClasses = [ExtendOperand];
+ let Name = "ExtendLSL64";
+}
+
+// 8-bit floating-point immediate encodings.
+def FPImmOperand : AsmOperandClass {
+ let Name = "FPImm";
+ let ParserMethod = "tryParseFPImm";
+}
+
+// 8-bit immediate for AdvSIMD where 64-bit values of the form:
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// are encoded as the eight bit value 'abcdefgh'.
+def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; }
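+// Worked example (for illustration only): the 8-bit value 0b10010011 (0x93)
+// selects the 64-bit constant 0xFF0000FF0000FFFF, since each of the bits
+// a..h above is replicated across one full byte of the result.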
+
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// ADR[P] instruction labels.
+def AdrpOperand : AsmOperandClass {
+ let Name = "AdrpLabel";
+ let ParserMethod = "tryParseAdrpLabel";
+}
+def adrplabel : Operand<i64> {
+ let EncoderMethod = "getAdrLabelOpValue";
+ let PrintMethod = "printAdrpLabel";
+ let ParserMatchClass = AdrpOperand;
+}
+
+def AdrOperand : AsmOperandClass {
+ let Name = "AdrLabel";
+ let ParserMethod = "tryParseAdrLabel";
+}
+def adrlabel : Operand<i64> {
+ let EncoderMethod = "getAdrLabelOpValue";
+ let ParserMatchClass = AdrOperand;
+}
+
+// simm9 predicate - True if the immediate is in the range [-256, 255].
+def SImm9Operand : AsmOperandClass {
+ let Name = "SImm9";
+ let DiagnosticType = "InvalidMemoryIndexedSImm9";
+}
+def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
+ let ParserMatchClass = SImm9Operand;
+}
+
+// simm7s4 predicate - True if the immediate is a multiple of 4 in the range
+// [-256, 252].
+def SImm7s4Operand : AsmOperandClass {
+ let Name = "SImm7s4";
+ let DiagnosticType = "InvalidMemoryIndexed32SImm7";
+}
+def simm7s4 : Operand<i32> {
+ let ParserMatchClass = SImm7s4Operand;
+ let PrintMethod = "printImmScale4";
+}
+
+// simm7s8 predicate - True if the immediate is a multiple of 8 in the range
+// [-512, 504].
+def SImm7s8Operand : AsmOperandClass {
+ let Name = "SImm7s8";
+ let DiagnosticType = "InvalidMemoryIndexed64SImm7";
+}
+def simm7s8 : Operand<i32> {
+ let ParserMatchClass = SImm7s8Operand;
+ let PrintMethod = "printImmScale8";
+}
+
+// simm7s16 predicate - True if the immediate is a multiple of 16 in the range
+// [-1024, 1008].
+def SImm7s16Operand : AsmOperandClass {
+ let Name = "SImm7s16";
+ let DiagnosticType = "InvalidMemoryIndexed64SImm7";
+}
+def simm7s16 : Operand<i32> {
+ let ParserMatchClass = SImm7s16Operand;
+ let PrintMethod = "printImmScale16";
+}
+
+// imm0_65535 predicate - True if the immediate is in the range [0,65535].
+def Imm0_65535Operand : AsmOperandClass { let Name = "Imm0_65535"; }
+def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint32_t)Imm) < 65536;
+}]> {
+ let ParserMatchClass = Imm0_65535Operand;
+}
+
+def Imm1_8Operand : AsmOperandClass {
+ let Name = "Imm1_8";
+ let DiagnosticType = "InvalidImm1_8";
+}
+def Imm1_16Operand : AsmOperandClass {
+ let Name = "Imm1_16";
+ let DiagnosticType = "InvalidImm1_16";
+}
+def Imm1_32Operand : AsmOperandClass {
+ let Name = "Imm1_32";
+ let DiagnosticType = "InvalidImm1_32";
+}
+def Imm1_64Operand : AsmOperandClass {
+ let Name = "Imm1_64";
+ let DiagnosticType = "InvalidImm1_64";
+}
+
+def MovZSymbolG3AsmOperand : AsmOperandClass {
+ let Name = "MovZSymbolG3";
+ let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g3 : Operand<i32> {
+ let ParserMatchClass = MovZSymbolG3AsmOperand;
+}
+
+def MovZSymbolG2AsmOperand : AsmOperandClass {
+ let Name = "MovZSymbolG2";
+ let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g2 : Operand<i32> {
+ let ParserMatchClass = MovZSymbolG2AsmOperand;
+}
+
+def MovZSymbolG1AsmOperand : AsmOperandClass {
+ let Name = "MovZSymbolG1";
+ let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g1 : Operand<i32> {
+ let ParserMatchClass = MovZSymbolG1AsmOperand;
+}
+
+def MovZSymbolG0AsmOperand : AsmOperandClass {
+ let Name = "MovZSymbolG0";
+ let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g0 : Operand<i32> {
+ let ParserMatchClass = MovZSymbolG0AsmOperand;
+}
+
+def MovKSymbolG2AsmOperand : AsmOperandClass {
+ let Name = "MovKSymbolG2";
+ let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g2 : Operand<i32> {
+ let ParserMatchClass = MovKSymbolG2AsmOperand;
+}
+
+def MovKSymbolG1AsmOperand : AsmOperandClass {
+ let Name = "MovKSymbolG1";
+ let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g1 : Operand<i32> {
+ let ParserMatchClass = MovKSymbolG1AsmOperand;
+}
+
+def MovKSymbolG0AsmOperand : AsmOperandClass {
+ let Name = "MovKSymbolG0";
+ let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g0 : Operand<i32> {
+ let ParserMatchClass = MovKSymbolG0AsmOperand;
+}
+
+def fixedpoint32 : Operand<i32> {
+ let EncoderMethod = "getFixedPointScaleOpValue";
+ let DecoderMethod = "DecodeFixedPointScaleImm";
+ let ParserMatchClass = Imm1_32Operand;
+}
+def fixedpoint64 : Operand<i64> {
+ let EncoderMethod = "getFixedPointScaleOpValue";
+ let DecoderMethod = "DecodeFixedPointScaleImm";
+ let ParserMatchClass = Imm1_64Operand;
+}
+
+def vecshiftR8 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+ let EncoderMethod = "getVecShiftR8OpValue";
+ let DecoderMethod = "DecodeVecShiftR8Imm";
+ let ParserMatchClass = Imm1_8Operand;
+}
+def vecshiftR16 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+ let EncoderMethod = "getVecShiftR16OpValue";
+ let DecoderMethod = "DecodeVecShiftR16Imm";
+ let ParserMatchClass = Imm1_16Operand;
+}
+def vecshiftR16Narrow : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+ let EncoderMethod = "getVecShiftR16OpValue";
+ let DecoderMethod = "DecodeVecShiftR16ImmNarrow";
+ let ParserMatchClass = Imm1_8Operand;
+}
+def vecshiftR32 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+ let EncoderMethod = "getVecShiftR32OpValue";
+ let DecoderMethod = "DecodeVecShiftR32Imm";
+ let ParserMatchClass = Imm1_32Operand;
+}
+def vecshiftR32Narrow : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+ let EncoderMethod = "getVecShiftR32OpValue";
+ let DecoderMethod = "DecodeVecShiftR32ImmNarrow";
+ let ParserMatchClass = Imm1_16Operand;
+}
+def vecshiftR64 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
+}]> {
+ let EncoderMethod = "getVecShiftR64OpValue";
+ let DecoderMethod = "DecodeVecShiftR64Imm";
+ let ParserMatchClass = Imm1_64Operand;
+}
+def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+ let EncoderMethod = "getVecShiftR64OpValue";
+ let DecoderMethod = "DecodeVecShiftR64ImmNarrow";
+ let ParserMatchClass = Imm1_32Operand;
+}
+
+def Imm0_7Operand : AsmOperandClass { let Name = "Imm0_7"; }
+def Imm0_15Operand : AsmOperandClass { let Name = "Imm0_15"; }
+def Imm0_31Operand : AsmOperandClass { let Name = "Imm0_31"; }
+def Imm0_63Operand : AsmOperandClass { let Name = "Imm0_63"; }
+
+def vecshiftL8 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 8);
+}]> {
+ let EncoderMethod = "getVecShiftL8OpValue";
+ let DecoderMethod = "DecodeVecShiftL8Imm";
+ let ParserMatchClass = Imm0_7Operand;
+}
+def vecshiftL16 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 16);
+}]> {
+ let EncoderMethod = "getVecShiftL16OpValue";
+ let DecoderMethod = "DecodeVecShiftL16Imm";
+ let ParserMatchClass = Imm0_15Operand;
+}
+def vecshiftL32 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 32);
+}]> {
+ let EncoderMethod = "getVecShiftL32OpValue";
+ let DecoderMethod = "DecodeVecShiftL32Imm";
+ let ParserMatchClass = Imm0_31Operand;
+}
+def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 64);
+}]> {
+ let EncoderMethod = "getVecShiftL64OpValue";
+ let DecoderMethod = "DecodeVecShiftL64Imm";
+ let ParserMatchClass = Imm0_63Operand;
+}
+
+
+// Crazy immediate formats used by 32-bit and 64-bit logical immediate
+// instructions for splatting repeating bit patterns across the immediate.
+def logical_imm32_XFORM : SDNodeXForm<imm, [{
+ uint64_t enc = ARM64_AM::encodeLogicalImmediate(N->getZExtValue(), 32);
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+}]>;
+def logical_imm64_XFORM : SDNodeXForm<imm, [{
+ uint64_t enc = ARM64_AM::encodeLogicalImmediate(N->getZExtValue(), 64);
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+}]>;
+
+def LogicalImm32Operand : AsmOperandClass { let Name = "LogicalImm32"; }
+def LogicalImm64Operand : AsmOperandClass { let Name = "LogicalImm64"; }
+def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{
+ return ARM64_AM::isLogicalImmediate(N->getZExtValue(), 32);
+}], logical_imm32_XFORM> {
+ let PrintMethod = "printLogicalImm32";
+ let ParserMatchClass = LogicalImm32Operand;
+}
+def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{
+ return ARM64_AM::isLogicalImmediate(N->getZExtValue(), 64);
+}], logical_imm64_XFORM> {
+ let PrintMethod = "printLogicalImm64";
+ let ParserMatchClass = LogicalImm64Operand;
+}
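+// Worked example (for illustration only): 0x00FF00FF00FF00FF is a valid
+// 64-bit logical immediate (a repeating 16-bit element containing a single
+// run of eight ones), so "and x0, x1, #0x00ff00ff00ff00ff" fits in one
+// instruction, whereas an arbitrary constant such as 0x123456 does not
+// qualify and would have to be materialized separately.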
+
+// imm0_255 predicate - True if the immediate is in the range [0,255].
+def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint32_t)Imm) < 256;
+}]> {
+ let ParserMatchClass = Imm0_255Operand;
+}
+
+// imm0_127 predicate - True if the immediate is in the range [0,127]
+def Imm0_127Operand : AsmOperandClass { let Name = "Imm0_127"; }
+def imm0_127 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint32_t)Imm) < 128;
+}]> {
+ let ParserMatchClass = Imm0_127Operand;
+}
+
+// NOTE: These imm0_N operands have to be of type i64 because i64 is the size
+// for all shift-amounts.
+
+// imm0_63 predicate - True if the immediate is in the range [0,63]
+def imm0_63 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 64;
+}]> {
+ let ParserMatchClass = Imm0_63Operand;
+}
+
+// imm0_31 predicate - True if the immediate is in the range [0,31]
+def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 32;
+}]> {
+ let ParserMatchClass = Imm0_31Operand;
+}
+
+// imm0_15 predicate - True if the immediate is in the range [0,15]
+def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 16;
+}]> {
+ let ParserMatchClass = Imm0_15Operand;
+}
+
+// imm0_7 predicate - True if the immediate is in the range [0,7]
+def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 8;
+}]> {
+ let ParserMatchClass = Imm0_7Operand;
+}
+
+// An arithmetic shifter operand:
+// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
+// {5-0} - imm6
+def arith_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let ParserMatchClass = ArithmeticShifterOperand;
+}
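+// Worked example (for illustration only): with the layout above, "asr #3"
+// is represented as shift type 0b10 in bits {7-6} and imm6 = 3 in bits
+// {5-0}, i.e. an operand value of 0b10000011.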
+
+class arith_shifted_reg<ValueType Ty, RegisterClass regclass>
+ : Operand<Ty>,
+ ComplexPattern<Ty, 2, "SelectArithShiftedRegister", []> {
+ let PrintMethod = "printShiftedRegister";
+ let MIOperandInfo = (ops regclass, arith_shift);
+}
+
+def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32>;
+def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64>;
+
+// A logical shifter operand:
+// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
+// {5-0} - imm6
+def logical_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let ParserMatchClass = ShifterOperand;
+}
+
+class logical_shifted_reg<ValueType Ty, RegisterClass regclass>
+ : Operand<Ty>,
+ ComplexPattern<Ty, 2, "SelectLogicalShiftedRegister", []> {
+ let PrintMethod = "printShiftedRegister";
+ let MIOperandInfo = (ops regclass, logical_shift);
+}
+
+def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32>;
+def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64>;
+
+// A logical vector shifter operand:
+// {7-6} - shift type: 00 = lsl
+// {5-0} - imm6: #0, #8, #16, or #24
+def logical_vec_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let EncoderMethod = "getVecShifterOpValue";
+ let ParserMatchClass = LogicalVecShifterOperand;
+}
+
+// A logical vector half-word shifter operand:
+// {7-6} - shift type: 00 = lsl
+// {5-0} - imm6: #0 or #8
+def logical_vec_hw_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let EncoderMethod = "getVecShifterOpValue";
+ let ParserMatchClass = LogicalVecHalfWordShifterOperand;
+}
+
+// A vector move shifter operand:
+// {0} - imm1: #8 or #16
+def move_vec_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let EncoderMethod = "getMoveVecShifterOpValue";
+ let ParserMatchClass = MoveVecShifterOperand;
+}
+
+// An ADD/SUB immediate shifter operand:
+// {7-6} - shift type: 00 = lsl
+// {5-0} - imm6: #0 or #12
+def addsub_shift : Operand<i32> {
+ let ParserMatchClass = AddSubShifterOperand;
+}
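+// Worked example (for illustration only): since only "lsl #0" and "lsl #12"
+// are valid shifts here, "add x0, x1, #0x123, lsl #12" adds the constant
+// 0x123000; immediates must be a 12-bit value optionally shifted left by 12.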
+
+class addsub_shifted_imm<ValueType Ty>
+ : Operand<Ty>, ComplexPattern<Ty, 2, "SelectArithImmed", [imm]> {
+ let PrintMethod = "printAddSubImm";
+ let EncoderMethod = "getAddSubImmOpValue";
+ let MIOperandInfo = (ops i32imm, addsub_shift);
+}
+
+def addsub_shifted_imm32 : addsub_shifted_imm<i32>;
+def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
+
+class neg_addsub_shifted_imm<ValueType Ty>
+ : Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
+ let PrintMethod = "printAddSubImm";
+ let EncoderMethod = "getAddSubImmOpValue";
+ let MIOperandInfo = (ops i32imm, addsub_shift);
+}
+
+def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
+def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
+
+// An extend operand:
+// {5-3} - extend type
+// {2-0} - imm3
+def arith_extend : Operand<i32> {
+ let PrintMethod = "printExtend";
+ let ParserMatchClass = ExtendOperand;
+}
+def arith_extend64 : Operand<i32> {
+ let PrintMethod = "printExtend";
+ let ParserMatchClass = ExtendOperand64;
+}
+
+// 'extend' that's a lsl of a 64-bit register.
+def arith_extendlsl64 : Operand<i32> {
+ let PrintMethod = "printExtend";
+ let ParserMatchClass = ExtendOperandLSL64;
+}
+
+class arith_extended_reg32<ValueType Ty> : Operand<Ty>,
+ ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
+ let PrintMethod = "printExtendedRegister";
+ let MIOperandInfo = (ops GPR32, arith_extend);
+}
+
+class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
+ ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
+ let PrintMethod = "printExtendedRegister";
+ let MIOperandInfo = (ops GPR32, arith_extend64);
+}
+
+// Floating-point immediate.
+def fpimm32 : Operand<f32>,
+ PatLeaf<(f32 fpimm), [{
+ return ARM64_AM::getFP32Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = ARM64_AM::getFP32Imm(InVal);
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+ }]>> {
+ let ParserMatchClass = FPImmOperand;
+ let PrintMethod = "printFPImmOperand";
+}
+def fpimm64 : Operand<f64>,
+ PatLeaf<(f64 fpimm), [{
+ return ARM64_AM::getFP64Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = ARM64_AM::getFP64Imm(InVal);
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+ }]>> {
+ let ParserMatchClass = FPImmOperand;
+ let PrintMethod = "printFPImmOperand";
+}
+
+def fpimm8 : Operand<i32> {
+ let ParserMatchClass = FPImmOperand;
+ let PrintMethod = "printFPImmOperand";
+}
+
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+// 8-bit immediate for AdvSIMD where 64-bit values of the form:
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// are encoded as the eight bit value 'abcdefgh'.
+def simdimmtype10 : Operand<i32>,
+ PatLeaf<(f64 fpimm), [{
+ return ARM64_AM::isAdvSIMDModImmType10(N->getValueAPF()
+ .bitcastToAPInt()
+ .getZExtValue());
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = ARM64_AM::encodeAdvSIMDModImmType10(N->getValueAPF()
+ .bitcastToAPInt()
+ .getZExtValue());
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+ }]>> {
+ let ParserMatchClass = SIMDImmType10Operand;
+ let PrintMethod = "printSIMDType10Operand";
+}
+
+
+//---
+// System management
+//---
+
+// Base encoding for system instruction operands.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands>
+ : I<oops, iops, asm, operands, "", []> {
+ let Inst{31-22} = 0b1101010100;
+ let Inst{21} = L;
+}
+
+// System instructions which do not have an Rt register.
+class SimpleSystemI<bit L, dag iops, string asm, string operands>
+ : BaseSystemI<L, (outs), iops, asm, operands> {
+ let Inst{4-0} = 0b11111;
+}
+
+// System instructions which have an Rt register.
+class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
+ : BaseSystemI<L, oops, iops, asm, operands>,
+ Sched<[WriteSys]> {
+ bits<5> Rt;
+ let Inst{4-0} = Rt;
+}
+
+// Hint instructions. The 7-bit immediate covers both CRm and the 3-bit op2.
+class HintI<string mnemonic>
+ : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">,
+ Sched<[WriteHint]> {
+ bits <7> imm;
+ let Inst{20-12} = 0b000110010;
+ let Inst{11-5} = imm;
+}
+
+// System instructions taking a single literal operand which encodes into
+// CRm. op2 differentiates the opcodes.
+def BarrierAsmOperand : AsmOperandClass {
+ let Name = "Barrier";
+ let ParserMethod = "tryParseBarrierOperand";
+}
+def barrier_op : Operand<i32> {
+ let PrintMethod = "printBarrierOption";
+ let ParserMatchClass = BarrierAsmOperand;
+}
+class CRmSystemI<Operand crmtype, bits<3> opc, string asm>
+ : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">,
+ Sched<[WriteBarrier]> {
+ bits<4> CRm;
+ let Inst{20-12} = 0b000110011;
+ let Inst{11-8} = CRm;
+ let Inst{7-5} = opc;
+}
+
+// MRS/MSR system instructions.
+def SystemRegisterOperand : AsmOperandClass {
+ let Name = "SystemRegister";
+ let ParserMethod = "tryParseSystemRegister";
+}
+// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate.
+def sysreg_op : Operand<i32> {
+ let ParserMatchClass = SystemRegisterOperand;
+ let DecoderMethod = "DecodeSystemRegister";
+ let PrintMethod = "printSystemRegister";
+}
+
+class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins sysreg_op:$systemreg),
+ "mrs", "\t$Rt, $systemreg"> {
+ bits<15> systemreg;
+ let Inst{20} = 1;
+ let Inst{19-5} = systemreg;
+}
+
+// FIXME: Some of these def CPSR, others don't. Best way to model that?
+// Explicitly modeling each of the system register as a register class
+// would do it, but feels like overkill at this point.
+class MSRI : RtSystemI<0, (outs), (ins sysreg_op:$systemreg, GPR64:$Rt),
+ "msr", "\t$systemreg, $Rt"> {
+ bits<15> systemreg;
+ let Inst{20} = 1;
+ let Inst{19-5} = systemreg;
+}
+
+def SystemCPSRFieldOperand : AsmOperandClass {
+ let Name = "SystemCPSRField";
+ let ParserMethod = "tryParseCPSRField";
+}
+def cpsrfield_op : Operand<i32> {
+ let ParserMatchClass = SystemCPSRFieldOperand;
+ let PrintMethod = "printSystemCPSRField";
+}
+
+let Defs = [CPSR] in
+class MSRcpsrI : SimpleSystemI<0, (ins cpsrfield_op:$cpsr_field, imm0_15:$imm),
+ "msr", "\t$cpsr_field, $imm">,
+ Sched<[WriteSys]> {
+ bits<6> cpsrfield;
+ bits<4> imm;
+ let Inst{20-19} = 0b00;
+ let Inst{18-16} = cpsrfield{5-3};
+ let Inst{15-12} = 0b0100;
+ let Inst{11-8} = imm;
+ let Inst{7-5} = cpsrfield{2-0};
+
+ let DecoderMethod = "DecodeSystemCPSRInstruction";
+}
+
+// SYS and SYSL generic system instructions.
+def SysCRAsmOperand : AsmOperandClass {
+ let Name = "SysCR";
+ let ParserMethod = "tryParseSysCROperand";
+}
+
+def sys_cr_op : Operand<i32> {
+ let PrintMethod = "printSysCROperand";
+ let ParserMatchClass = SysCRAsmOperand;
+}
+
+class SystemI<bit L, string asm>
+ : SimpleSystemI<L,
+ (ins imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2),
+ asm, "\t$op1, $Cn, $Cm, $op2">,
+ Sched<[WriteSys]> {
+ bits<3> op1;
+ bits<4> Cn;
+ bits<4> Cm;
+ bits<3> op2;
+ let Inst{20-19} = 0b01;
+ let Inst{18-16} = op1;
+ let Inst{15-12} = Cn;
+ let Inst{11-8} = Cm;
+ let Inst{7-5} = op2;
+}
+
+class SystemXtI<bit L, string asm>
+ : RtSystemI<L, (outs),
+ (ins imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, GPR64:$Rt),
+ asm, "\t$op1, $Cn, $Cm, $op2, $Rt"> {
+ bits<3> op1;
+ bits<4> Cn;
+ bits<4> Cm;
+ bits<3> op2;
+ let Inst{20-19} = 0b01;
+ let Inst{18-16} = op1;
+ let Inst{15-12} = Cn;
+ let Inst{11-8} = Cm;
+ let Inst{7-5} = op2;
+}
+
+class SystemLXtI<bit L, string asm>
+ : RtSystemI<L, (outs),
+ (ins GPR64:$Rt, imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2),
+ asm, "\t$Rt, $op1, $Cn, $Cm, $op2"> {
+ bits<3> op1;
+ bits<4> Cn;
+ bits<4> Cm;
+ bits<3> op2;
+ let Inst{20-19} = 0b01;
+ let Inst{18-16} = op1;
+ let Inst{15-12} = Cn;
+ let Inst{11-8} = Cm;
+ let Inst{7-5} = op2;
+}
+
+
+// Branch (register) instructions:
+//
+// case opc of
+// 0001 blr
+// 0000 br
+// 0101 dret
+// 0100 eret
+// 0010 ret
+// otherwise UNDEFINED
+class BaseBranchReg<bits<4> opc, dag oops, dag iops, string asm,
+ string operands, list<dag> pattern>
+ : I<oops, iops, asm, operands, "", pattern>, Sched<[WriteBrReg]> {
+ let Inst{31-25} = 0b1101011;
+ let Inst{24-21} = opc;
+ let Inst{20-16} = 0b11111;
+ let Inst{15-10} = 0b000000;
+ let Inst{4-0} = 0b00000;
+}
+
+class BranchReg<bits<4> opc, string asm, list<dag> pattern>
+ : BaseBranchReg<opc, (outs), (ins GPR64:$Rn), asm, "\t$Rn", pattern> {
+ bits<5> Rn;
+ let Inst{9-5} = Rn;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in
+class SpecialReturn<bits<4> opc, string asm>
+ : BaseBranchReg<opc, (outs), (ins), asm, "", []> {
+ let Inst{9-5} = 0b11111;
+}
+
+//---
+// Conditional branch instruction.
+//---
+// Branch condition code.
+// 4-bit immediate. Pretty-printed as .<cc>
+def dotCcode : Operand<i32> {
+ let PrintMethod = "printDotCondCode";
+}
+
+// Conditional branch target. 19-bit immediate. The low two bits of the target
+// offset are implied zero and so are not part of the immediate.
+def BranchTarget19Operand : AsmOperandClass {
+ let Name = "BranchTarget19";
+}
+def am_brcond : Operand<OtherVT> {
+ let EncoderMethod = "getCondBranchTargetOpValue";
+ let DecoderMethod = "DecodeCondBranchTarget";
+ let PrintMethod = "printAlignedBranchTarget";
+ let ParserMatchClass = BranchTarget19Operand;
+}
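+// Worked example (for illustration only): because the low two bits are
+// implied zero, a branch to an instruction 8 bytes ahead encodes imm19 = 2,
+// giving a reachable range of roughly +/-1MiB.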
+
+class BranchCond : I<(outs), (ins dotCcode:$cond, am_brcond:$target),
+ "b", "$cond\t$target", "",
+ [(ARM64brcond bb:$target, imm:$cond, CPSR)]>,
+ Sched<[WriteBr]> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let Uses = [CPSR];
+
+ bits<4> cond;
+ bits<19> target;
+ let Inst{31-24} = 0b01010100;
+ let Inst{23-5} = target;
+ let Inst{4} = 0;
+ let Inst{3-0} = cond;
+}
+
+//---
+// Compare-and-branch instructions.
+//---
+class BaseCmpBranch<RegisterClass regtype, bit op, string asm, SDNode node>
+ : I<(outs), (ins regtype:$Rt, am_brcond:$target),
+ asm, "\t$Rt, $target", "",
+ [(node regtype:$Rt, bb:$target)]>,
+ Sched<[WriteBr]> {
+ let isBranch = 1;
+ let isTerminator = 1;
+
+ bits<5> Rt;
+ bits<19> target;
+ let Inst{30-25} = 0b011010;
+ let Inst{24} = op;
+ let Inst{23-5} = target;
+ let Inst{4-0} = Rt;
+}
+
+multiclass CmpBranch<bit op, string asm, SDNode node> {
+ def W : BaseCmpBranch<GPR32, op, asm, node> {
+ let Inst{31} = 0;
+ }
+ def X : BaseCmpBranch<GPR64, op, asm, node> {
+ let Inst{31} = 1;
+ }
+}
+
+//---
+// Test-bit-and-branch instructions.
+//---
+// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
+// the target offset are implied zero and so are not part of the immediate.
+def BranchTarget14Operand : AsmOperandClass {
+ let Name = "BranchTarget14";
+}
+def am_tbrcond : Operand<OtherVT> {
+ let EncoderMethod = "getTestBranchTargetOpValue";
+ let PrintMethod = "printAlignedBranchTarget";
+ let ParserMatchClass = BranchTarget14Operand;
+}
+
+class TestBranch<bit op, string asm, SDNode node>
+ : I<(outs), (ins GPR64:$Rt, imm0_63:$bit_off, am_tbrcond:$target),
+ asm, "\t$Rt, $bit_off, $target", "",
+ [(node GPR64:$Rt, imm0_63:$bit_off, bb:$target)]>,
+ Sched<[WriteBr]> {
+ let isBranch = 1;
+ let isTerminator = 1;
+
+ bits<5> Rt;
+ bits<6> bit_off;
+ bits<14> target;
+
+ let Inst{31} = bit_off{5};
+ let Inst{30-25} = 0b011011;
+ let Inst{24} = op;
+ let Inst{23-19} = bit_off{4-0};
+ let Inst{18-5} = target;
+ let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeTestAndBranch";
+}
+
+//---
+// Unconditional branch (immediate) instructions.
+//---
+def BranchTarget26Operand : AsmOperandClass {
+ let Name = "BranchTarget26";
+}
+def am_b_target : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTargetOpValue";
+ let PrintMethod = "printAlignedBranchTarget";
+ let ParserMatchClass = BranchTarget26Operand;
+}
+def am_bl_target : Operand<i64> {
+ let EncoderMethod = "getBranchTargetOpValue";
+ let PrintMethod = "printAlignedBranchTarget";
+ let ParserMatchClass = BranchTarget26Operand;
+}
+
+class BImm<bit op, dag iops, string asm, list<dag> pattern>
+ : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> {
+ bits<26> addr;
+ let Inst{31} = op;
+ let Inst{30-26} = 0b00101;
+ let Inst{25-0} = addr;
+
+ let DecoderMethod = "DecodeUnconditionalBranch";
+}
+
+class BranchImm<bit op, string asm, list<dag> pattern>
+ : BImm<op, (ins am_b_target:$addr), asm, pattern>;
+class CallImm<bit op, string asm, list<dag> pattern>
+ : BImm<op, (ins am_bl_target:$addr), asm, pattern>;
+
+//---
+// Basic one-operand data processing instructions.
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseOneOperandData<bits<3> opc, RegisterClass regtype, string asm,
+ SDPatternOperator node>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
+ [(set regtype:$Rd, (node regtype:$Rn))]>,
+ Sched<[WriteI]> {
+ bits<5> Rd;
+ bits<5> Rn;
+
+ let Inst{30-13} = 0b101101011000000000;
+ let Inst{12-10} = opc;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass OneOperandData<bits<3> opc, string asm,
+ SDPatternOperator node = null_frag> {
+ def Wr : BaseOneOperandData<opc, GPR32, asm, node> {
+ let Inst{31} = 0;
+ }
+
+ def Xr : BaseOneOperandData<opc, GPR64, asm, node> {
+ let Inst{31} = 1;
+ }
+}
+
+class OneWRegData<bits<3> opc, string asm, SDPatternOperator node>
+ : BaseOneOperandData<opc, GPR32, asm, node> {
+ let Inst{31} = 0;
+}
+
+class OneXRegData<bits<3> opc, string asm, SDPatternOperator node>
+ : BaseOneOperandData<opc, GPR64, asm, node> {
+ let Inst{31} = 1;
+}
+
+//---
+// Basic two-operand data processing instructions.
+//---
+class BaseBaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
+ list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "", pattern>,
+ Sched<[WriteI]> {
+ let Uses = [CPSR];
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{30} = isSub;
+ let Inst{28-21} = 0b11010000;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class BaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
+ SDNode OpNode>
+ : BaseBaseAddSubCarry<isSub, regtype, asm,
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, CPSR))]>;
+
+class BaseAddSubCarrySetFlags<bit isSub, RegisterClass regtype, string asm,
+ SDNode OpNode>
+ : BaseBaseAddSubCarry<isSub, regtype, asm,
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, CPSR)),
+ (implicit CPSR)]> {
+ let Defs = [CPSR];
+}
+
+multiclass AddSubCarry<bit isSub, string asm, string asm_setflags,
+ SDNode OpNode, SDNode OpNode_setflags> {
+ def Wr : BaseAddSubCarry<isSub, GPR32, asm, OpNode> {
+ let Inst{31} = 0;
+ let Inst{29} = 0;
+ }
+ def Xr : BaseAddSubCarry<isSub, GPR64, asm, OpNode> {
+ let Inst{31} = 1;
+ let Inst{29} = 0;
+ }
+
+ // Sets flags.
+ def SWr : BaseAddSubCarrySetFlags<isSub, GPR32, asm_setflags,
+ OpNode_setflags> {
+ let Inst{31} = 0;
+ let Inst{29} = 1;
+ }
+ def SXr : BaseAddSubCarrySetFlags<isSub, GPR64, asm_setflags,
+ OpNode_setflags> {
+ let Inst{31} = 1;
+ let Inst{29} = 1;
+ }
+}
+
+class BaseTwoOperand<bits<4> opc, RegisterClass regtype, string asm,
+ SDPatternOperator OpNode>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "",
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{30-21} = 0b0011010110;
+ let Inst{20-16} = Rm;
+ let Inst{15-14} = 0b00;
+ let Inst{13-10} = opc;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class BaseDiv<bit isSigned, RegisterClass regtype, string asm,
+ SDPatternOperator OpNode>
+ : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> {
+ let Inst{10} = isSigned;
+}
+
+multiclass Div<bit isSigned, string asm, SDPatternOperator OpNode> {
+ def Wr : BaseDiv<isSigned, GPR32, asm, OpNode>,
+ Sched<[WriteID32]> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseDiv<isSigned, GPR64, asm, OpNode>,
+ Sched<[WriteID64]> {
+ let Inst{31} = 1;
+ }
+}
+
+class BaseShift<bits<2> shift_type, RegisterClass regtype, string asm,
+ SDPatternOperator OpNode = null_frag>
+ : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>,
+ Sched<[WriteIS]> {
+ let Inst{11-10} = shift_type;
+}
+
+multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> {
+ def Wr : BaseShift<shift_type, GPR32, asm> {
+ let Inst{31} = 0;
+ }
+
+ def Xr : BaseShift<shift_type, GPR64, asm, OpNode> {
+ let Inst{31} = 1;
+ }
+
+ def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)),
+ (!cast<Instruction>(NAME # "Wr") GPR32:$Rn,
+ (EXTRACT_SUBREG i64:$Rm, sub_32))>;
+
+ def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+ def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+ def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))),
+ (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+}
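+// For illustration only (hypothetical instantiation, not part of this file):
+// a variable-shift family could be created with something like
+//   defm LSL : Shift<0b00, "lsl", shl>;
+// expanding to "Wr"/"Xr" variants plus the shift-amount patterns above.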
+
+class ShiftAlias<string asm, Instruction inst, RegisterClass regtype>
+ : InstAlias<asm#" $dst, $src1, $src2",
+ (inst regtype:$dst, regtype:$src1, regtype:$src2)>;
+
+class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
+ RegisterClass addtype, string asm,
+ list<dag> pattern>
+ : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra),
+ asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<5> Ra;
+ let Inst{30-24} = 0b0011011;
+ let Inst{23-21} = opc;
+ let Inst{20-16} = Rm;
+ let Inst{15} = isSub;
+ let Inst{14-10} = Ra;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
+ def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
+ [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>,
+ Sched<[WriteIM32]> {
+ let Inst{31} = 0;
+ }
+
+ def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
+ [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>,
+ Sched<[WriteIM64]> {
+ let Inst{31} = 1;
+ }
+}
+
+class WideMulAccum<bit isSub, bits<3> opc, string asm,
+ SDNode AccNode, SDNode ExtNode>
+ : BaseMulAccum<isSub, opc, GPR32, GPR64, asm,
+ [(set GPR64:$Rd, (AccNode GPR64:$Ra,
+ (mul (ExtNode GPR32:$Rn), (ExtNode GPR32:$Rm))))]>,
+ Sched<[WriteIM32]> {
+ let Inst{31} = 1;
+}
+
+class MulHi<bits<3> opc, string asm, SDNode OpNode>
+ : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "",
+ [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>,
+ Sched<[WriteIM64]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-24} = 0b10011011;
+ let Inst{23-21} = opc;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = 0b011111;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class MulAccumWAlias<string asm, Instruction inst>
+ : InstAlias<asm#" $dst, $src1, $src2",
+ (inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>;
+class MulAccumXAlias<string asm, Instruction inst>
+ : InstAlias<asm#" $dst, $src1, $src2",
+ (inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>;
+class WideMulAccumAlias<string asm, Instruction inst>
+ : InstAlias<asm#" $dst, $src1, $src2",
+ (inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>;
+
+class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg,
+ SDPatternOperator OpNode, string asm>
+ : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "",
+ [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>,
+ Sched<[WriteISReg]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+
+ let Inst{31} = sf;
+ let Inst{30-21} = 0b0011010110;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0b010;
+ let Inst{12} = C;
+ let Inst{11-10} = sz;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+//---
+// Address generation.
+//---
+
+class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
+ : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "",
+ pattern>,
+ Sched<[WriteI]> {
+ bits<5> Xd;
+ bits<21> label;
+ let Inst{31} = page;
+ let Inst{30-29} = label{1-0};
+ let Inst{28-24} = 0b10000;
+ let Inst{23-5} = label{20-2};
+ let Inst{4-0} = Xd;
+
+ let DecoderMethod = "DecodeAdrInstruction";
+}
+
+//---
+// Move immediate.
+//---
+
+def movimm32_imm : Operand<i32> {
+ let ParserMatchClass = Imm0_65535Operand;
+ let EncoderMethod = "getMoveWideImmOpValue";
+}
+def movimm32_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let ParserMatchClass = MovImm32ShifterOperand;
+}
+def movimm64_shift : Operand<i32> {
+ let PrintMethod = "printShifter";
+ let ParserMatchClass = MovImm64ShifterOperand;
+}
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseMoveImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
+ string asm>
+ : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift),
+ asm, "\t$Rd, $imm$shift", "", []>,
+ Sched<[WriteImm]> {
+ bits<5> Rd;
+ bits<16> imm;
+ bits<6> shift;
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100101;
+ let Inst{22-21} = shift{5-4};
+ let Inst{20-5} = imm;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeMoveImmInstruction";
+}
+
+multiclass MoveImmediate<bits<2> opc, string asm> {
+ def Wi : BaseMoveImmediate<opc, GPR32, movimm32_shift, asm> {
+ let Inst{31} = 0;
+ }
+
+ def Xi : BaseMoveImmediate<opc, GPR64, movimm64_shift, asm> {
+ let Inst{31} = 1;
+ }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseInsertImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
+ string asm>
+ : I<(outs regtype:$Rd),
+ (ins regtype:$src, movimm32_imm:$imm, shifter:$shift),
+ asm, "\t$Rd, $imm$shift", "$src = $Rd", []>,
+ Sched<[WriteI]> {
+ bits<5> Rd;
+ bits<16> imm;
+ bits<6> shift;
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100101;
+ let Inst{22-21} = shift{5-4};
+ let Inst{20-5} = imm;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeMoveImmInstruction";
+}
+
+multiclass InsertImmediate<bits<2> opc, string asm> {
+ def Wi : BaseInsertImmediate<opc, GPR32, movimm32_shift, asm> {
+ let Inst{31} = 0;
+ }
+
+ def Xi : BaseInsertImmediate<opc, GPR64, movimm64_shift, asm> {
+ let Inst{31} = 1;
+ }
+}
+
+//---
+// Add/Subtract
+//---
+
+class BaseAddSubImm<bit isSub, bit setFlags, RegisterClass dstRegtype,
+ RegisterClass srcRegtype, addsub_shifted_imm immtype,
+ string asm, SDPatternOperator OpNode>
+ : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm),
+ asm, "\t$Rd, $Rn, $imm", "",
+ [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>,
+ Sched<[WriteI]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<14> imm;
+ let Inst{30} = isSub;
+ let Inst{29} = setFlags;
+ let Inst{28-24} = 0b10001;
+ let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12
+ let Inst{21-10} = imm{11-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+ let DecoderMethod = "DecodeBaseAddSubImm";
+}
+
+class BaseAddSubRegPseudo<RegisterClass regtype,
+ SDPatternOperator OpNode>
+ : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
+ Sched<[WriteI]>;
+
+class BaseAddSubSReg<bit isSub, bit setFlags, RegisterClass regtype,
+ arith_shifted_reg shifted_regtype, string asm,
+ SDPatternOperator OpNode>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "",
+ [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>,
+ Sched<[WriteISReg]> {
+  // The operands are listed in the order expected by the MI operands, so no
+  // custom encoder method or by-name matching is needed; the default
+  // in-order handling is used. Since matching is by order, make sure the
+  // field names do not coincide with the operand names.
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+ bits<8> shift;
+ let Inst{30} = isSub;
+ let Inst{29} = setFlags;
+ let Inst{28-24} = 0b01011;
+ let Inst{23-22} = shift{7-6};
+ let Inst{21} = 0;
+ let Inst{20-16} = src2;
+ let Inst{15-10} = shift{5-0};
+ let Inst{9-5} = src1;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeThreeAddrSRegInstruction";
+}
+
+class BaseAddSubEReg<bit isSub, bit setFlags, RegisterClass dstRegtype,
+ RegisterClass src1Regtype, Operand src2Regtype,
+ string asm, SDPatternOperator OpNode>
+ : I<(outs dstRegtype:$R1),
+ (ins src1Regtype:$R2, src2Regtype:$R3),
+ asm, "\t$R1, $R2, $R3", "",
+ [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>,
+ Sched<[WriteIEReg]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<6> ext;
+ let Inst{30} = isSub;
+ let Inst{29} = setFlags;
+ let Inst{28-24} = 0b01011;
+ let Inst{23-21} = 0b001;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = ext{5-3};
+ let Inst{12-10} = ext{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeAddSubERegInstruction";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
+ RegisterClass src1Regtype, RegisterClass src2Regtype,
+ Operand ext_op, string asm>
+ : I<(outs dstRegtype:$Rd),
+ (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext),
+ asm, "\t$Rd, $Rn, $Rm$ext", "", []>,
+ Sched<[WriteIEReg]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<6> ext;
+ let Inst{30} = isSub;
+ let Inst{29} = setFlags;
+ let Inst{28-24} = 0b01011;
+ let Inst{23-21} = 0b001;
+ let Inst{20-16} = Rm;
+ let Inst{15} = ext{5};
+ let Inst{12-10} = ext{2-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeAddSubERegInstruction";
+}
+
+// Aliases for register+register add/subtract.
+class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
+ RegisterClass src1Regtype, RegisterClass src2Regtype,
+ int shiftExt>
+ : InstAlias<asm#" $dst, $src1, $src2",
+ (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
+ shiftExt)>;
+
+multiclass AddSub<bit isSub, string mnemonic,
+ SDPatternOperator OpNode = null_frag> {
+ let hasSideEffects = 0 in {
+ // Add/Subtract immediate
+ def Wri : BaseAddSubImm<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32,
+ mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xri : BaseAddSubImm<isSub, 0, GPR64sp, GPR64sp, addsub_shifted_imm64,
+ mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+
+ // Add/Subtract register - Only used for CodeGen
+ def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
+ def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
+
+ // Add/Subtract shifted register
+ def Wrs : BaseAddSubSReg<isSub, 0, GPR32, arith_shifted_reg32, mnemonic,
+ OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xrs : BaseAddSubSReg<isSub, 0, GPR64, arith_shifted_reg64, mnemonic,
+ OpNode> {
+ let Inst{31} = 1;
+ }
+ }
+
+ // Add/Subtract extended register
+ let AddedComplexity = 1, hasSideEffects = 0 in {
+ def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp,
+ arith_extended_reg32<i32>, mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp,
+ arith_extended_reg32to64<i64>, mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+ }
+
+ def Xrx64 : BaseAddSubEReg64<isSub, 0, GPR64sp, GPR64sp, GPR64,
+ arith_extendlsl64, mnemonic> {
+ // UXTX and SXTX only.
+ let Inst{14-13} = 0b11;
+ let Inst{31} = 1;
+ }
+
+ // Register/register aliases with no shift when SP is not used.
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+ GPR32, GPR32, GPR32, 0>;
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+ GPR64, GPR64, GPR64, 0>;
+
+ // Register/register aliases with no shift when either the destination or
+ // first source register is SP. This relies on the shifted register aliases
+ // above matching first in the case when SP is not used.
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+ GPR32sp, GPR32sp, GPR32, 16>; // UXTW #0
+ def : AddSubRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrx64"),
+ GPR64sp, GPR64sp, GPR64, 24>; // UXTX #0
+}
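+// For illustration only (hypothetical instantiation, not part of this file):
+// the multiclass above would typically be used as
+//   defm ADD : AddSub<0, "add", add>;
+// generating the immediate, shifted-register and extended-register forms
+// (ADDWri, ADDXri, ADDWrs, ..., ADDXrx64) along with the aliases.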
+
+multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode> {
+ let isCompare = 1, Defs = [CPSR] in {
+ // Add/Subtract immediate
+ def Wri : BaseAddSubImm<isSub, 1, GPR32, GPR32sp, addsub_shifted_imm32,
+ mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xri : BaseAddSubImm<isSub, 1, GPR64, GPR64sp, addsub_shifted_imm64,
+ mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+
+ // Add/Subtract register
+ def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
+ def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
+
+ // Add/Subtract shifted register
+ def Wrs : BaseAddSubSReg<isSub, 1, GPR32, arith_shifted_reg32, mnemonic,
+ OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xrs : BaseAddSubSReg<isSub, 1, GPR64, arith_shifted_reg64, mnemonic,
+ OpNode> {
+ let Inst{31} = 1;
+ }
+
+ // Add/Subtract extended register
+ let AddedComplexity = 1 in {
+ def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp,
+ arith_extended_reg32<i32>, mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp,
+ arith_extended_reg32<i64>, mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+ }
+
+ def Xrx64 : BaseAddSubEReg64<isSub, 1, GPR64, GPR64sp, GPR64,
+ arith_extendlsl64, mnemonic> {
+ // UXTX and SXTX only.
+ let Inst{14-13} = 0b11;
+ let Inst{31} = 1;
+ }
+ } // Defs = [CPSR]
+
+ // Register/register aliases with no shift when SP is not used.
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+ GPR32, GPR32, GPR32, 0>;
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+ GPR64, GPR64, GPR64, 0>;
+
+ // Register/register aliases with no shift when the first source register
+ // is SP. This relies on the shifted register aliases above matching first
+ // in the case when SP is not used.
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+ GPR32, GPR32sp, GPR32, 16>; // UXTW #0
+ def : AddSubRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrx64"),
+ GPR64, GPR64sp, GPR64, 24>; // UXTX #0
+}
+
+//---
+// Extract
+//---
+def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisPtrTy<3>]>;
+def ARM64Extr : SDNode<"ARM64ISD::EXTR", SDTA64EXTR>;
+
+class BaseExtractImm<RegisterClass regtype, Operand imm_type, string asm,
+ list<dag> patterns>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm),
+ asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>,
+ Sched<[WriteExtr, ReadExtrHi]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<6> imm;
+
+ let Inst{30-23} = 0b00100111;
+ let Inst{21} = 0;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = imm;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass ExtractImm<string asm> {
+ def Wrri : BaseExtractImm<GPR32, imm0_31, asm,
+ [(set GPR32:$Rd,
+ (ARM64Extr GPR32:$Rn, GPR32:$Rm, imm0_31:$imm))]> {
+ let Inst{31} = 0;
+ let Inst{22} = 0;
+ }
+ def Xrri : BaseExtractImm<GPR64, imm0_63, asm,
+ [(set GPR64:$Rd,
+ (ARM64Extr GPR64:$Rn, GPR64:$Rm, imm0_63:$imm))]> {
+
+ let Inst{31} = 1;
+ let Inst{22} = 1;
+ }
+}
+
+//---
+// Bitfield
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseBitfieldImm<bits<2> opc,
+ RegisterClass regtype, Operand imm_type, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms),
+ asm, "\t$Rd, $Rn, $immr, $imms", "", []>,
+ Sched<[WriteIS]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> immr;
+ bits<6> imms;
+
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100110;
+ let Inst{21-16} = immr;
+ let Inst{15-10} = imms;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass BitfieldImm<bits<2> opc, string asm> {
+ def Wri : BaseBitfieldImm<opc, GPR32, imm0_31, asm> {
+ let Inst{31} = 0;
+ let Inst{22} = 0;
+ }
+ def Xri : BaseBitfieldImm<opc, GPR64, imm0_63, asm> {
+ let Inst{31} = 1;
+ let Inst{22} = 1;
+ }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseBitfieldImmWith2RegArgs<bits<2> opc,
+ RegisterClass regtype, Operand imm_type, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr,
+ imm_type:$imms),
+ asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>,
+ Sched<[WriteIS]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> immr;
+ bits<6> imms;
+
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100110;
+ let Inst{21-16} = immr;
+ let Inst{15-10} = imms;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass BitfieldImmWith2RegArgs<bits<2> opc, string asm> {
+ def Wri : BaseBitfieldImmWith2RegArgs<opc, GPR32, imm0_31, asm> {
+ let Inst{31} = 0;
+ let Inst{22} = 0;
+ }
+ def Xri : BaseBitfieldImmWith2RegArgs<opc, GPR64, imm0_63, asm> {
+ let Inst{31} = 1;
+ let Inst{22} = 1;
+ }
+}
+
+//---
+// Logical
+//---
+
+// Logical (immediate)
+class BaseLogicalImm<bits<2> opc, RegisterClass dregtype,
+ RegisterClass sregtype, Operand imm_type, string asm,
+ list<dag> pattern>
+ : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm),
+ asm, "\t$Rd, $Rn, $imm", "", pattern>,
+ Sched<[WriteI]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<13> imm;
+ let Inst{30-29} = opc;
+ let Inst{28-23} = 0b100100;
+ let Inst{22} = imm{12};
+ let Inst{21-16} = imm{11-6};
+ let Inst{15-10} = imm{5-0};
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeLogicalImmInstruction";
+}
+
+// Logical (shifted register)
+class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
+ logical_shifted_reg shifted_regtype, string asm,
+ list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "", pattern>,
+ Sched<[WriteISReg]> {
+  // The operands are listed in the order expected by the MI operands, so no
+  // custom encoder method or by-name matching is needed; the default
+  // in-order handling is used. Since matching is by order, make sure the
+  // field names do not coincide with the operand names.
+ bits<5> dst;
+ bits<5> src1;
+ bits<5> src2;
+ bits<8> shift;
+ let Inst{30-29} = opc;
+ let Inst{28-24} = 0b01010;
+ let Inst{23-22} = shift{7-6};
+ let Inst{21} = N;
+ let Inst{20-16} = src2;
+ let Inst{15-10} = shift{5-0};
+ let Inst{9-5} = src1;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeThreeAddrSRegInstruction";
+}
+
+// Aliases for register+register logical instructions.
+class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
+ : InstAlias<asm#" $dst, $src1, $src2",
+ (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
+
+let AddedComplexity = 6 in
+multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode> {
+ def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
+ [(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
+ logical_imm32:$imm))]> {
+ let Inst{31} = 0;
+ let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
+ }
+ def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
+ [(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
+ logical_imm64:$imm))]> {
+ let Inst{31} = 1;
+ }
+}
+
+multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode> {
+ let isCompare = 1, Defs = [CPSR] in {
+ def Wri : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic,
+ [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> {
+ let Inst{31} = 0;
+ let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
+ }
+ def Xri : BaseLogicalImm<opc, GPR64, GPR64, logical_imm64, mnemonic,
+ [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_imm64:$imm))]> {
+ let Inst{31} = 1;
+ }
+ } // end Defs = [CPSR]
+}
+
+class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
+ : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
+ Sched<[WriteI]>;
+
+// Split from LogicalImm as not all instructions have both.
+multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
+ SDPatternOperator OpNode> {
+ def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
+ def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+
+ def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
+ [(set GPR32:$Rd, (OpNode GPR32:$Rn,
+ logical_shifted_reg32:$Rm))]> {
+ let Inst{31} = 0;
+ }
+ def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
+ [(set GPR64:$Rd, (OpNode GPR64:$Rn,
+ logical_shifted_reg64:$Rm))]> {
+ let Inst{31} = 1;
+ }
+
+ def : LogicalRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Wrs"), GPR32>;
+ def : LogicalRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrs"), GPR64>;
+}
+
+// Split from LogicalReg to allow setting CPSR Defs
+multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic> {
+ let Defs = [CPSR], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic, []>{
+ let Inst{31} = 0;
+ }
+ def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic, []>{
+ let Inst{31} = 1;
+ }
+ } // Defs = [CPSR]
+
+ def : LogicalRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Wrs"), GPR32>;
+ def : LogicalRegAlias<mnemonic,
+ !cast<Instruction>(NAME#"Xrs"), GPR64>;
+}
+
+//---
+// Conditionally set flags
+//---
+
+// Condition code.
+// 4-bit immediate. Pretty-printed as <cc>
+def ccode : Operand<i32> {
+ let PrintMethod = "printCondCode";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm>
+ : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond),
+ asm, "\t$Rn, $imm, $nzcv, $cond", "", []>,
+ Sched<[WriteI]> {
+ let Uses = [CPSR];
+ let Defs = [CPSR];
+
+ bits<5> Rn;
+ bits<5> imm;
+ bits<4> nzcv;
+ bits<4> cond;
+
+ let Inst{30} = op;
+ let Inst{29-21} = 0b111010010;
+ let Inst{20-16} = imm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = nzcv;
+}
+
+multiclass CondSetFlagsImm<bit op, string asm> {
+ def Wi : BaseCondSetFlagsImm<op, GPR32, asm> {
+ let Inst{31} = 0;
+ }
+ def Xi : BaseCondSetFlagsImm<op, GPR64, asm> {
+ let Inst{31} = 1;
+ }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
+ asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+ Sched<[WriteI]> {
+ let Uses = [CPSR];
+ let Defs = [CPSR];
+
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> nzcv;
+ bits<4> cond;
+
+ let Inst{30} = op;
+ let Inst{29-21} = 0b111010010;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4} = 0b0;
+ let Inst{3-0} = nzcv;
+}
+
+multiclass CondSetFlagsReg<bit op, string asm> {
+ def Wr : BaseCondSetFlagsReg<op, GPR32, asm> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseCondSetFlagsReg<op, GPR64, asm> {
+ let Inst{31} = 1;
+ }
+}
+
+//---
+// Conditional select
+//---
+
+class BaseCondSelect<bit op, bits<2> op2, RegisterClass regtype, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+ asm, "\t$Rd, $Rn, $Rm, $cond", "",
+ [(set regtype:$Rd,
+ (ARM64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), CPSR))]>,
+ Sched<[WriteI]> {
+ let Uses = [CPSR];
+
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> cond;
+
+ let Inst{30} = op;
+ let Inst{29-21} = 0b011010100;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = op2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass CondSelect<bit op, bits<2> op2, string asm> {
+ def Wr : BaseCondSelect<op, op2, GPR32, asm> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseCondSelect<op, op2, GPR64, asm> {
+ let Inst{31} = 1;
+ }
+}
+
+class BaseCondSelectOp<bit op, bits<2> op2, RegisterClass regtype, string asm,
+ PatFrag frag>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+ asm, "\t$Rd, $Rn, $Rm, $cond", "",
+ [(set regtype:$Rd,
+ (ARM64csel regtype:$Rn, (frag regtype:$Rm),
+ (i32 imm:$cond), CPSR))]>,
+ Sched<[WriteI]> {
+ let Uses = [CPSR];
+
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> cond;
+
+ let Inst{30} = op;
+ let Inst{29-21} = 0b011010100;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = op2;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass CondSelectOp<bit op, bits<2> op2, string asm, PatFrag frag> {
+ def Wr : BaseCondSelectOp<op, op2, GPR32, asm, frag> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseCondSelectOp<op, op2, GPR64, asm, frag> {
+ let Inst{31} = 1;
+ }
+}
+
+//---
+// Special Mask Value
+//---
+def maski8_or_more : Operand<i32>,
+ ImmLeaf<i32, [{ return (Imm & 0xff) == 0xff; }]> {
+}
+def maski16_or_more : Operand<i32>,
+ ImmLeaf<i32, [{ return (Imm & 0xffff) == 0xffff; }]> {
+}
+
+
+//---
+// Load/store
+//---
+
+// (unsigned immediate)
+// Indexed for 8-bit registers. offset is in range [0,4095].
+def MemoryIndexed8Operand : AsmOperandClass {
+ let Name = "MemoryIndexed8";
+ let DiagnosticType = "InvalidMemoryIndexed8";
+}
+def am_indexed8 : Operand<i64>,
+ ComplexPattern<i64, 2, "SelectAddrModeIndexed8", []> {
+ let PrintMethod = "printAMIndexed8";
+ let EncoderMethod
+ = "getAMIndexed8OpValue<ARM64::fixup_arm64_ldst_imm12_scale1>";
+ let ParserMatchClass = MemoryIndexed8Operand;
+ let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
+}
+
+// Indexed for 16-bit registers. offset is multiple of 2 in range [0,8190],
+// stored as immval/2 (the 12-bit literal that encodes directly into the insn).
+def MemoryIndexed16Operand : AsmOperandClass {
+ let Name = "MemoryIndexed16";
+ let DiagnosticType = "InvalidMemoryIndexed16";
+}
+def am_indexed16 : Operand<i64>,
+ ComplexPattern<i64, 2, "SelectAddrModeIndexed16", []> {
+ let PrintMethod = "printAMIndexed16";
+ let EncoderMethod
+ = "getAMIndexed8OpValue<ARM64::fixup_arm64_ldst_imm12_scale2>";
+ let ParserMatchClass = MemoryIndexed16Operand;
+ let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
+}
+
+// Indexed for 32-bit registers. offset is multiple of 4 in range [0,16380],
+// stored as immval/4 (the 12-bit literal that encodes directly into the insn).
+def MemoryIndexed32Operand : AsmOperandClass {
+ let Name = "MemoryIndexed32";
+ let DiagnosticType = "InvalidMemoryIndexed32";
+}
+def am_indexed32 : Operand<i64>,
+ ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []> {
+ let PrintMethod = "printAMIndexed32";
+ let EncoderMethod
+ = "getAMIndexed8OpValue<ARM64::fixup_arm64_ldst_imm12_scale4>";
+ let ParserMatchClass = MemoryIndexed32Operand;
+ let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
+}
+
+// Indexed for 64-bit registers. offset is multiple of 8 in range [0,32760],
+// stored as immval/8 (the 12-bit literal that encodes directly into the insn).
+def MemoryIndexed64Operand : AsmOperandClass {
+ let Name = "MemoryIndexed64";
+ let DiagnosticType = "InvalidMemoryIndexed64";
+}
+def am_indexed64 : Operand<i64>,
+ ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []> {
+ let PrintMethod = "printAMIndexed64";
+ let EncoderMethod
+ = "getAMIndexed8OpValue<ARM64::fixup_arm64_ldst_imm12_scale8>";
+ let ParserMatchClass = MemoryIndexed64Operand;
+ let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
+}
+
+// Indexed for 128-bit registers. offset is multiple of 16 in range [0,65520],
+// stored as immval/16 (the 12-bit literal that encodes directly into the insn).
+def MemoryIndexed128Operand : AsmOperandClass {
+ let Name = "MemoryIndexed128";
+ let DiagnosticType = "InvalidMemoryIndexed128";
+}
+def am_indexed128 : Operand<i64>,
+ ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []> {
+ let PrintMethod = "printAMIndexed128";
+ let EncoderMethod
+ = "getAMIndexed8OpValue<ARM64::fixup_arm64_ldst_imm12_scale16>";
+ let ParserMatchClass = MemoryIndexed128Operand;
+ let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
+}
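+
+// Worked example of the scaled-offset encoding described above: for a
+// 64-bit access, a byte offset of 48 is stored in the 12-bit immediate
+// field as 48/8 = 6. Offsets that are not a multiple of the access size,
+// or that exceed the scaled maximum (32760 for the 64-bit case), cannot
+// use this form and must fall back to an unscaled or register-offset
+// addressing mode.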
+
+// No offset.
+def MemoryNoIndexOperand : AsmOperandClass { let Name = "MemoryNoIndex"; }
+def am_noindex : Operand<i64>,
+ ComplexPattern<i64, 1, "SelectAddrModeNoIndex", []> {
+ let PrintMethod = "printAMNoIndex";
+ let ParserMatchClass = MemoryNoIndexOperand;
+ let MIOperandInfo = (ops GPR64sp:$base);
+}
+
+class BaseLoadStoreUI<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+ string asm, list<dag> pattern>
+ : I<oops, iops, asm, "\t$Rt, $addr", "", pattern> {
+ bits<5> dst;
+
+ bits<17> addr;
+ bits<5> base = addr{4-0};
+ bits<12> offset = addr{16-5};
+
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b01;
+ let Inst{23-22} = opc;
+ let Inst{21-10} = offset;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeUnsignedLdStInstruction";
+}
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ Operand indextype, string asm, list<dag> pattern>
+ : BaseLoadStoreUI<sz, V, opc,
+ (outs regtype:$Rt), (ins indextype:$addr), asm, pattern>,
+ Sched<[WriteLD]>;
+
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+class StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ Operand indextype, string asm, list<dag> pattern>
+ : BaseLoadStoreUI<sz, V, opc,
+ (outs), (ins regtype:$Rt, indextype:$addr), asm, pattern>,
+ Sched<[WriteST]>;
+
+def PrefetchOperand : AsmOperandClass {
+ let Name = "Prefetch";
+ let ParserMethod = "tryParsePrefetch";
+}
+def prfop : Operand<i32> {
+ let PrintMethod = "printPrefetchOp";
+ let ParserMatchClass = PrefetchOperand;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
+ : BaseLoadStoreUI<sz, V, opc,
+ (outs), (ins prfop:$Rt, am_indexed64:$addr), asm, pat>,
+ Sched<[WriteLD]>;
+
+//---
+// Load literal
+//---
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class LoadLiteral<bits<2> opc, bit V, RegisterClass regtype, string asm>
+ : I<(outs regtype:$Rt), (ins am_brcond:$label),
+ asm, "\t$Rt, $label", "", []>,
+ Sched<[WriteLD]> {
+ bits<5> Rt;
+ bits<19> label;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b011;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-5} = label;
+ let Inst{4-0} = Rt;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchLiteral<bits<2> opc, bit V, string asm, list<dag> pat>
+ : I<(outs), (ins prfop:$Rt, am_brcond:$label),
+ asm, "\t$Rt, $label", "", pat>,
+ Sched<[WriteLD]> {
+ bits<5> Rt;
+ bits<19> label;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b011;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-5} = label;
+ let Inst{4-0} = Rt;
+}
+
+//---
+// Load/store register offset
+//---
+
+class MemROAsmOperand<int sz> : AsmOperandClass {
+ let Name = "MemoryRegisterOffset"#sz;
+}
+
+def MemROAsmOperand8 : MemROAsmOperand<8>;
+def MemROAsmOperand16 : MemROAsmOperand<16>;
+def MemROAsmOperand32 : MemROAsmOperand<32>;
+def MemROAsmOperand64 : MemROAsmOperand<64>;
+def MemROAsmOperand128 : MemROAsmOperand<128>;
+
+class ro_indexed<int sz> : Operand<i64> { // ComplexPattern<...>
+ let PrintMethod = "printMemoryRegOffset"#sz;
+ let MIOperandInfo = (ops GPR64sp:$base, GPR64:$offset, i32imm:$extend);
+}
+
+def ro_indexed8 : ro_indexed<8>, ComplexPattern<i64, 3, "SelectAddrModeRO8", []> {
+ let ParserMatchClass = MemROAsmOperand8;
+}
+
+def ro_indexed16 : ro_indexed<16>, ComplexPattern<i64, 3, "SelectAddrModeRO16", []> {
+ let ParserMatchClass = MemROAsmOperand16;
+}
+
+def ro_indexed32 : ro_indexed<32>, ComplexPattern<i64, 3, "SelectAddrModeRO32", []> {
+ let ParserMatchClass = MemROAsmOperand32;
+}
+
+def ro_indexed64 : ro_indexed<64>, ComplexPattern<i64, 3, "SelectAddrModeRO64", []> {
+ let ParserMatchClass = MemROAsmOperand64;
+}
+
+def ro_indexed128 : ro_indexed<128>, ComplexPattern<i64, 3, "SelectAddrModeRO128", []> {
+ let ParserMatchClass = MemROAsmOperand128;
+}
+
+class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag ins, dag outs, list<dag> pat>
+ : I<ins, outs, asm, "\t$Rt, $addr", "", pat> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> base;
+ bits<5> offset;
+ bits<4> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = offset;
+ let Inst{15-13} = extend{3-1};
+
+ let Inst{12} = extend{0};
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeRegOffsetLdStInstruction";
+}
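+
+// Worked example of the by-order matching noted above (illustration only):
+// for a register-offset load such as "ldrb w0, [x1, x2]", the MI operand
+// list is Rt followed by the $addr triple (base, offset, extend). The
+// default in-order operand handling therefore fills dst<=Rt, base<=x1,
+// offset<=x2 and extend from the extend immediate, which is why no custom
+// encoder method or by-name matching is needed and why the field names
+// above deliberately differ from the asm operand names.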
+
+class Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : LoadStore8RO<sz, V, opc, regtype, asm,
+ (outs regtype:$Rt), (ins ro_indexed8:$addr), pat>,
+ Sched<[WriteLDIdx, ReadAdrBase]>;
+
+class Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : LoadStore8RO<sz, V, opc, regtype, asm,
+ (outs), (ins regtype:$Rt, ro_indexed8:$addr), pat>,
+ Sched<[WriteSTIdx, ReadAdrBase]>;
+
+class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag ins, dag outs, list<dag> pat>
+ : I<ins, outs, asm, "\t$Rt, $addr", "", pat> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> base;
+ bits<5> offset;
+ bits<4> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = offset;
+ let Inst{15-13} = extend{3-1};
+
+ let Inst{12} = extend{0};
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeRegOffsetLdStInstruction";
+}
+
+class Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : LoadStore16RO<sz, V, opc, regtype, asm,
+ (outs regtype:$Rt), (ins ro_indexed16:$addr), pat>,
+ Sched<[WriteLDIdx, ReadAdrBase]>;
+
+class Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : LoadStore16RO<sz, V, opc, regtype, asm,
+ (outs), (ins regtype:$Rt, ro_indexed16:$addr), pat>,
+ Sched<[WriteSTIdx, ReadAdrBase]>;
+
+class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag ins, dag outs, list<dag> pat>
+ : I<ins, outs, asm, "\t$Rt, $addr", "", pat> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> base;
+ bits<5> offset;
+ bits<4> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = offset;
+ let Inst{15-13} = extend{3-1};
+
+ let Inst{12} = extend{0};
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeRegOffsetLdStInstruction";
+}
+
+class Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : LoadStore32RO<sz, V, opc, regtype, asm,
+ (outs regtype:$Rt), (ins ro_indexed32:$addr), pat>,
+ Sched<[WriteLDIdx, ReadAdrBase]>;
+
+class Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : LoadStore32RO<sz, V, opc, regtype, asm,
+ (outs), (ins regtype:$Rt, ro_indexed32:$addr), pat>,
+ Sched<[WriteSTIdx, ReadAdrBase]>;
+
+class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag ins, dag outs, list<dag> pat>
+ : I<ins, outs, asm, "\t$Rt, $addr", "", pat> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> base;
+ bits<5> offset;
+ bits<4> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = offset;
+ let Inst{15-13} = extend{3-1};
+
+ let Inst{12} = extend{0};
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeRegOffsetLdStInstruction";
+}
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : LoadStore64RO<sz, V, opc, regtype, asm,
+ (outs regtype:$Rt), (ins ro_indexed64:$addr), pat>,
+ Sched<[WriteLDIdx, ReadAdrBase]>;
+
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+class Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : LoadStore64RO<sz, V, opc, regtype, asm,
+ (outs), (ins regtype:$Rt, ro_indexed64:$addr), pat>,
+ Sched<[WriteSTIdx, ReadAdrBase]>;
+
+
+class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, dag ins, dag outs, list<dag> pat>
+ : I<ins, outs, asm, "\t$Rt, $addr", "", pat> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> base;
+ bits<5> offset;
+ bits<4> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = offset;
+ let Inst{15-13} = extend{3-1};
+
+ let Inst{12} = extend{0};
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeRegOffsetLdStInstruction";
+}
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : LoadStore128RO<sz, V, opc, regtype, asm,
+ (outs regtype:$Rt), (ins ro_indexed128:$addr), pat>,
+ Sched<[WriteLDIdx, ReadAdrBase]>;
+
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+class Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : LoadStore128RO<sz, V, opc, regtype, asm,
+ (outs), (ins regtype:$Rt, ro_indexed128:$addr), pat>,
+ Sched<[WriteSTIdx, ReadAdrBase]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
+ : I<(outs), (ins prfop:$Rt, ro_indexed64:$addr), asm,
+ "\t$Rt, $addr", "", pat>,
+ Sched<[WriteLD]> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> base;
+ bits<5> offset;
+ bits<4> extend;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 1;
+ let Inst{20-16} = offset;
+ let Inst{15-13} = extend{3-1};
+
+ let Inst{12} = extend{0};
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeRegOffsetLdStInstruction";
+}
+
+//---
+// Load/store unscaled immediate
+//---
+
+def MemoryUnscaledOperand : AsmOperandClass {
+ let Name = "MemoryUnscaled";
+ let DiagnosticType = "InvalidMemoryIndexedSImm9";
+}
+class am_unscaled_operand : Operand<i64> {
+ let PrintMethod = "printAMUnscaled";
+ let ParserMatchClass = MemoryUnscaledOperand;
+ let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
+}
+def am_unscaled : am_unscaled_operand;
+def am_unscaled8 : am_unscaled_operand,
+ ComplexPattern<i64, 2, "SelectAddrModeUnscaled8", []>;
+def am_unscaled16 : am_unscaled_operand,
+ ComplexPattern<i64, 2, "SelectAddrModeUnscaled16", []>;
+def am_unscaled32 : am_unscaled_operand,
+ ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>;
+def am_unscaled64 : am_unscaled_operand,
+ ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>;
+def am_unscaled128 : am_unscaled_operand,
+ ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>;
+
+class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+ string asm, list<dag> pattern>
+ : I<oops, iops, asm, "\t$Rt, $addr", "", pattern> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> base;
+ bits<9> offset;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0;
+ let Inst{20-12} = offset;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+let AddedComplexity = 1 in // try this before LoadUI
+class LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ Operand amtype, string asm, list<dag> pattern>
+ : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt),
+ (ins amtype:$addr), asm, pattern>,
+ Sched<[WriteLD]>;
+
+let AddedComplexity = 1 in // try this before StoreUI
+class StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ Operand amtype, string asm, list<dag> pattern>
+ : BaseLoadStoreUnscale<sz, V, opc, (outs),
+ (ins regtype:$Rt, amtype:$addr), asm, pattern>,
+ Sched<[WriteST]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
+ : BaseLoadStoreUnscale<sz, V, opc, (outs),
+ (ins prfop:$Rt, am_unscaled:$addr), asm, pat>,
+ Sched<[WriteLD]>;
+
+//---
+// Load/store unscaled immediate, unprivileged
+//---
+
+class BaseLoadStoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
+ dag oops, dag iops, string asm>
+ : I<oops, iops, asm, "\t$Rt, $addr", "", []> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> base;
+ bits<9> offset;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0;
+ let Inst{20-12} = offset;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in {
+class LoadUnprivileged<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm>
+ : BaseLoadStoreUnprivileged<sz, V, opc,
+ (outs regtype:$Rt), (ins am_unscaled:$addr), asm>,
+ Sched<[WriteLD]>;
+}
+
+let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
+class StoreUnprivileged<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm>
+ : BaseLoadStoreUnprivileged<sz, V, opc,
+ (outs), (ins regtype:$Rt, am_unscaled:$addr), asm>,
+ Sched<[WriteST]>;
+}
+
+//---
+// Load/store pre-indexed
+//---
+
+class BaseLoadStorePreIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+ string asm, string cstr>
+ : I<oops, iops, asm, "\t$Rt, $addr!", cstr, []> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling.
+ bits<5> dst;
+ bits<5> base;
+ bits<9> offset;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0;
+ let Inst{20-12} = offset;
+ let Inst{11-10} = 0b11;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+// FIXME: Modeling the write-back of these instructions for isel is tricky.
+// we need the complex addressing mode for the memory reference, but
+// we also need the write-back specified as a tied operand to the
+// base register. That combination does not play nicely with
+// the asm matcher and friends.
+class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm>
+ : BaseLoadStorePreIdx<sz, V, opc,
+ (outs regtype:$Rt/*, GPR64sp:$wback*/),
+ (ins am_unscaled:$addr), asm, ""/*"$addr.base = $wback"*/>,
+ Sched<[WriteLD, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm>
+ : BaseLoadStorePreIdx<sz, V, opc,
+ (outs/* GPR64sp:$wback*/),
+ (ins regtype:$Rt, am_unscaled:$addr),
+ asm, ""/*"$addr.base = $wback"*/>,
+ Sched<[WriteAdr, WriteST]>;
+} // hasSideEffects = 0
+
+// ISel pseudo-instructions which have the tied operands. When the MC lowering
+// logic finally gets smart enough to strip off tied operands that are just
+// for isel convenience, we can get rid of these pseudos and just reference
+// the real instructions directly.
+//
+// Ironically, also because of the writeback operands, we can't put the
+// matcher pattern directly on the instruction, but need to define it
+// separately.
+//
+// Loads aren't matched with patterns here at all, but rather in C++
+// custom lowering.
+let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in {
+class LoadPreIdxPseudo<RegisterClass regtype>
+ : Pseudo<(outs regtype:$Rt, GPR64sp:$wback),
+ (ins am_noindex:$addr, simm9:$offset), [],
+ "$addr.base = $wback,@earlyclobber $wback">,
+ Sched<[WriteLD, WriteAdr]>;
+class LoadPostIdxPseudo<RegisterClass regtype>
+ : Pseudo<(outs regtype:$Rt, GPR64sp:$wback),
+ (ins am_noindex:$addr, simm9:$offset), [],
+ "$addr.base = $wback,@earlyclobber $wback">,
+ Sched<[WriteLD, WriteI]>;
+}
+multiclass StorePreIdxPseudo<RegisterClass regtype, ValueType Ty,
+ SDPatternOperator OpNode> {
+ let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
+ def _isel: Pseudo<(outs GPR64sp:$wback),
+ (ins regtype:$Rt, am_noindex:$addr, simm9:$offset), [],
+ "$addr.base = $wback,@earlyclobber $wback">,
+ Sched<[WriteAdr, WriteST]>;
+
+ def : Pat<(OpNode (Ty regtype:$Rt), am_noindex:$addr, simm9:$offset),
+ (!cast<Instruction>(NAME#_isel) regtype:$Rt, am_noindex:$addr,
+ simm9:$offset)>;
+}
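+
+// Illustration (not part of this patch): a pre-indexed store pseudo is
+// expected to be instantiated per register class / value type together
+// with the pre-indexed store SelectionDAG operator, roughly:
+//   defm STRXpre : StorePreIdxPseudo<GPR64, i64, pre_store>;
+// The defm name here is an assumption shown only to sketch how the
+// embedded Pat<> hooks the pseudo up for isel; the pseudo is rewritten
+// into the real pre-indexed store when it is expanded later.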
+
+//---
+// Load/store post-indexed
+//---
+
+// (post-index) load/stores.
+class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+ string asm, string cstr>
+ : I<oops, iops, asm, "\t$Rt, $addr, $idx", cstr, []> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling.
+ bits<5> dst;
+ bits<5> base;
+ bits<9> offset;
+ let Inst{31-30} = sz;
+ let Inst{29-27} = 0b111;
+ let Inst{26} = V;
+ let Inst{25-24} = 0b00;
+ let Inst{23-22} = opc;
+ let Inst{21} = 0b0;
+ let Inst{20-12} = offset;
+ let Inst{11-10} = 0b01;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+// FIXME: Modeling the write-back of these instructions for isel is tricky.
+// we need the complex addressing mode for the memory reference, but
+// we also need the write-back specified as a tied operand to the
+// base register. That combination does not play nicely with
+// the asm matcher and friends.
+class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm>
+ : BaseLoadStorePostIdx<sz, V, opc,
+ (outs regtype:$Rt/*, GPR64sp:$wback*/),
+ (ins am_noindex:$addr, simm9:$idx),
+ asm, ""/*"$addr.base = $wback"*/>,
+ Sched<[WriteLD, WriteI]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+ string asm>
+ : BaseLoadStorePostIdx<sz, V, opc,
+ (outs/* GPR64sp:$wback*/),
+ (ins regtype:$Rt, am_noindex:$addr, simm9:$idx),
+ asm, ""/*"$addr.base = $wback"*/>,
+ Sched<[WriteAdr, WriteST, ReadAdrBase]>;
+} // hasSideEffects = 0
+
+// ISel pseudo-instructions which have the tied operands. When the MC lowering
+// logic finally gets smart enough to strip off tied operands that are just
+// for isel convenience, we can get rid of these pseudos and just reference
+// the real instructions directly.
+//
+// Ironically, also because of the writeback operands, we can't put the
+// matcher pattern directly on the instruction, but need to define it
+// separately.
+multiclass StorePostIdxPseudo<RegisterClass regtype, ValueType Ty,
+ SDPatternOperator OpNode, Instruction Insn> {
+ let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
+ def _isel: Pseudo<(outs GPR64sp:$wback),
+ (ins regtype:$Rt, am_noindex:$addr, simm9:$idx), [],
+ "$addr.base = $wback,@earlyclobber $wback">,
+ PseudoInstExpansion<(Insn regtype:$Rt, am_noindex:$addr, simm9:$idx)>,
+ Sched<[WriteAdr, WriteST, ReadAdrBase]>;
+
+ def : Pat<(OpNode (Ty regtype:$Rt), am_noindex:$addr, simm9:$idx),
+ (!cast<Instruction>(NAME#_isel) regtype:$Rt, am_noindex:$addr,
+ simm9:$idx)>;
+}
+
+//---
+// Load/store pair
+//---
+
+// (indexed, offset)
+
+class BaseLoadStorePairOffset<bits<2> opc, bit V, bit L, dag oops, dag iops,
+ string asm>
+ : I<oops, iops, asm, "\t$Rt, $Rt2, $addr", "", []> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> dst2;
+ bits<5> base;
+ bits<7> offset;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b101;
+ let Inst{26} = V;
+ let Inst{25-23} = 0b010;
+ let Inst{22} = L;
+ let Inst{21-15} = offset;
+ let Inst{14-10} = dst2;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm>
+ : BaseLoadStorePairOffset<opc, V, 1,
+ (outs regtype:$Rt, regtype:$Rt2),
+ (ins indextype:$addr), asm>,
+ Sched<[WriteLD, WriteLDHi]>;
+
+let mayLoad = 0, mayStore = 1 in
+class StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm>
+ : BaseLoadStorePairOffset<opc, V, 0, (outs),
+ (ins regtype:$Rt, regtype:$Rt2, indextype:$addr),
+ asm>,
+ Sched<[WriteSTP]>;
+} // hasSideEffects = 0
+
+// (pre-indexed)
+
+def MemoryIndexed32SImm7 : AsmOperandClass {
+ let Name = "MemoryIndexed32SImm7";
+ let DiagnosticType = "InvalidMemoryIndexed32SImm7";
+}
+def am_indexed32simm7 : Operand<i32> { // ComplexPattern<...>
+ let PrintMethod = "printAMIndexed32";
+ let ParserMatchClass = MemoryIndexed32SImm7;
+ let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset);
+}
+
+def MemoryIndexed64SImm7 : AsmOperandClass {
+ let Name = "MemoryIndexed64SImm7";
+ let DiagnosticType = "InvalidMemoryIndexed64SImm7";
+}
+def am_indexed64simm7 : Operand<i32> { // ComplexPattern<...>
+ let PrintMethod = "printAMIndexed64";
+ let ParserMatchClass = MemoryIndexed64SImm7;
+ let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset);
+}
+
+def MemoryIndexed128SImm7 : AsmOperandClass {
+ let Name = "MemoryIndexed128SImm7";
+ let DiagnosticType = "InvalidMemoryIndexed128SImm7";
+}
+def am_indexed128simm7 : Operand<i32> { // ComplexPattern<...>
+ let PrintMethod = "printAMIndexed128";
+ let ParserMatchClass = MemoryIndexed128SImm7;
+ let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset);
+}
+
+class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
+ string asm>
+ : I<oops, iops, asm, "\t$Rt, $Rt2, $addr!", "", []> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> dst2;
+ bits<5> base;
+ bits<7> offset;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b101;
+ let Inst{26} = V;
+ let Inst{25-23} = 0b011;
+ let Inst{22} = L;
+ let Inst{21-15} = offset;
+ let Inst{14-10} = dst2;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+ Operand addrmode, string asm>
+ : BaseLoadStorePairPreIdx<opc, V, 1,
+ (outs regtype:$Rt, regtype:$Rt2),
+ (ins addrmode:$addr), asm>,
+ Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+ Operand addrmode, string asm>
+ : BaseLoadStorePairPreIdx<opc, V, 0, (outs),
+ (ins regtype:$Rt, regtype:$Rt2, addrmode:$addr),
+ asm>,
+ Sched<[WriteAdr, WriteSTP]>;
+} // hasSideEffects = 0
+
+// (post-indexed)
+
+class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
+ string asm>
+ : I<oops, iops, asm, "\t$Rt, $Rt2, $addr, $idx", "", []> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> dst2;
+ bits<5> base;
+ bits<7> offset;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b101;
+ let Inst{26} = V;
+ let Inst{25-23} = 0b001;
+ let Inst{22} = L;
+ let Inst{21-15} = offset;
+ let Inst{14-10} = dst2;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+ Operand idxtype, string asm>
+ : BaseLoadStorePairPostIdx<opc, V, 1,
+ (outs regtype:$Rt, regtype:$Rt2),
+ (ins am_noindex:$addr, idxtype:$idx), asm>,
+ Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+ Operand idxtype, string asm>
+ : BaseLoadStorePairPostIdx<opc, V, 0, (outs),
+ (ins regtype:$Rt, regtype:$Rt2,
+ am_noindex:$addr, idxtype:$idx),
+ asm>,
+ Sched<[WriteAdr, WriteSTP]>;
+} // hasSideEffects = 0
+
+// (no-allocate)
+
+class BaseLoadStorePairNoAlloc<bits<2> opc, bit V, bit L, dag oops, dag iops,
+ string asm>
+ : I<oops, iops, asm, "\t$Rt, $Rt2, $addr", "", []> {
+ // The operands are in order to match the 'addr' MI operands, so we
+ // don't need an encoder method and by-name matching. Just use the default
+ // in-order handling. Since we're using by-order, make sure the names
+ // do not match.
+ bits<5> dst;
+ bits<5> dst2;
+ bits<5> base;
+ bits<7> offset;
+ let Inst{31-30} = opc;
+ let Inst{29-27} = 0b101;
+ let Inst{26} = V;
+ let Inst{25-23} = 0b000;
+ let Inst{22} = L;
+ let Inst{21-15} = offset;
+ let Inst{14-10} = dst2;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst;
+
+ let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm>
+ : BaseLoadStorePairNoAlloc<opc, V, 1,
+ (outs regtype:$Rt, regtype:$Rt2),
+ (ins indextype:$addr), asm>,
+ Sched<[WriteLD, WriteLDHi]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+ Operand indextype, string asm>
+ : BaseLoadStorePairNoAlloc<opc, V, 0, (outs),
+ (ins regtype:$Rt, regtype:$Rt2, indextype:$addr),
+ asm>,
+ Sched<[WriteSTP]>;
+} // hasSideEffects = 0
+
+//---
+// Load/store exclusive
+//---
+
+// True exclusive operations write to and/or read from the system's exclusive
+// monitors, which as far as a compiler is concerned can be modelled as a
+// random shared memory address. Hence LoadExclusive mayStore.
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
+class BaseLoadStoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ dag oops, dag iops, string asm, string operands>
+ : I<oops, iops, asm, operands, "", []> {
+ let Inst{31-30} = sz;
+ let Inst{29-24} = 0b001000;
+ let Inst{23} = o2;
+ let Inst{22} = L;
+ let Inst{21} = o1;
+ let Inst{15} = o0;
+
+ let DecoderMethod = "DecodeExclusiveLdStInstruction";
+}
+
+// Neither Rs nor Rt2 operands.
+class LoadStoreExclusiveSimple<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ dag oops, dag iops, string asm, string operands>
+ : BaseLoadStoreExclusive<sz, o2, L, o1, o0, oops, iops, asm, operands> {
+ bits<5> reg;
+ bits<5> base;
+ let Inst{20-16} = 0b11111;
+ let Inst{14-10} = 0b11111;
+ let Inst{9-5} = base;
+ let Inst{4-0} = reg;
+}
+
+// Simple load acquires don't set the exclusive monitor.
+let mayLoad = 1, mayStore = 0 in
+class LoadAcquire<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
+ (ins am_noindex:$addr), asm, "\t$Rt, $addr">,
+ Sched<[WriteLD]>;
+
+class LoadExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
+ (ins am_noindex:$addr), asm, "\t$Rt, $addr">,
+ Sched<[WriteLD]>;
+
+class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
+ (outs regtype:$Rt, regtype:$Rt2),
+ (ins am_noindex:$addr), asm,
+ "\t$Rt, $Rt2, $addr">,
+ Sched<[WriteLD, WriteLDHi]> {
+ bits<5> dst1;
+ bits<5> dst2;
+ bits<5> base;
+ let Inst{20-16} = 0b11111;
+ let Inst{14-10} = dst2;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst1;
+}
+
+// Simple store release operations do not check the exclusive monitor.
+let mayLoad = 0, mayStore = 1 in
+class StoreRelease<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs),
+ (ins regtype:$Rt, am_noindex:$addr),
+ asm, "\t$Rt, $addr">,
+ Sched<[WriteST]>;
+
+let mayLoad = 1, mayStore = 1 in
+class StoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : BaseLoadStoreExclusive<sz, o2, L, o1, o0, (outs GPR32:$Ws),
+ (ins regtype:$Rt, am_noindex:$addr),
+ asm, "\t$Ws, $Rt, $addr">,
+ Sched<[WriteSTX]> {
+ bits<5> status;
+ bits<5> reg;
+ bits<5> base;
+ let Inst{20-16} = status;
+ let Inst{14-10} = 0b11111;
+ let Inst{9-5} = base;
+ let Inst{4-0} = reg;
+
+ let Constraints = "@earlyclobber $Ws";
+}
+
+class StoreExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+ RegisterClass regtype, string asm>
+ : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
+ (outs GPR32:$Ws),
+ (ins regtype:$Rt, regtype:$Rt2, am_noindex:$addr),
+ asm, "\t$Ws, $Rt, $Rt2, $addr">,
+ Sched<[WriteSTX]> {
+ bits<5> status;
+ bits<5> dst1;
+ bits<5> dst2;
+ bits<5> base;
+ let Inst{20-16} = status;
+ let Inst{14-10} = dst2;
+ let Inst{9-5} = base;
+ let Inst{4-0} = dst1;
+
+ let Constraints = "@earlyclobber $Ws";
+}
+
+//---
+// Exception generation
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
+ : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>,
+ Sched<[WriteSys]> {
+ bits<16> imm;
+ let Inst{31-24} = 0b11010100;
+ let Inst{23-21} = op1;
+ let Inst{20-5} = imm;
+ let Inst{4-2} = 0b000;
+ let Inst{1-0} = ll;
+}
+
+//---
+// Floating point to integer conversion
+//---
+
+class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode,
+ RegisterClass srcType, RegisterClass dstType,
+ string asm, list<dag> pattern>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn),
+ asm, "\t$Rd, $Rn", "", pattern>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{30} = 0;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = type;
+ let Inst{21} = 1;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ let Inst{15-10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode,
+ RegisterClass srcType, RegisterClass dstType,
+ Operand immType, string asm>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
+ asm, "\t$Rd, $Rn, $scale", "", []>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> scale;
+ let Inst{30} = 0;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = type;
+ let Inst{21} = 0;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ let Inst{15-10} = scale;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass FPToInteger<bits<2> rmode, bits<3> opcode, string asm, SDPatternOperator OpN> {
+ // Unscaled single-precision to 32-bit
+ def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm,
+ [(set GPR32:$Rd, (OpN FPR32:$Rn))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ }
+
+ // Unscaled single-precision to 64-bit
+ def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm,
+ [(set GPR64:$Rd, (OpN FPR32:$Rn))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ }
+
+ // Unscaled double-precision to 32-bit
+ def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm,
+ [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ }
+
+ // Unscaled double-precision to 64-bit
+ def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm,
+ [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ }
+
+ // Scaled single-precision to 32-bit
+ def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32,
+ fixedpoint32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ }
+
+ // Scaled single-precision to 64-bit
+ def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64,
+ fixedpoint64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ }
+
+ // Scaled double-precision to 32-bit
+ def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32,
+ fixedpoint32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ }
+
+ // Scaled double-precision to 64-bit
+ def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64,
+ fixedpoint64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ }
+}
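+
+// Illustration (not part of this patch): this multiclass is meant to be
+// instantiated once per conversion, e.g. (names and bits assumed):
+//   defm FCVTZS : FPToInteger<0b11, 0b000, "fcvtzs", fp_to_sint>;
+//   defm FCVTZU : FPToInteger<0b11, 0b001, "fcvtzu", fp_to_uint>;
+// yielding the four unscaled (UWSr/UXSr/UWDr/UXDr) and four scaled
+// (SWSri/SXSri/SWDri/SXDri) variants per mnemonic.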
+
+//---
+// Integer to floating point conversion
+//---
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseIntegerToFP<bit isUnsigned,
+ RegisterClass srcType, RegisterClass dstType,
+ Operand immType, string asm>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
+ asm, "\t$Rd, $Rn, $scale", "", []>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> scale;
+ let Inst{30-23} = 0b00111100;
+ let Inst{21-17} = 0b00001;
+ let Inst{16} = isUnsigned;
+ let Inst{15-10} = scale;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class BaseIntegerToFPUnscaled<bit isUnsigned,
+ RegisterClass srcType, RegisterClass dstType,
+ ValueType dvt, string asm, SDNode node>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn),
+ asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<6> scale;
+ let Inst{30-23} = 0b00111100;
+ let Inst{21-17} = 0b10001;
+ let Inst{16} = isUnsigned;
+ let Inst{15-10} = 0b000000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
+ // Unscaled
+ def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{22} = 0; // 32-bit FPR flag
+ }
+
+ def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{22} = 1; // 64-bit FPR flag
+ }
+
+ def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{22} = 0; // 32-bit FPR flag
+ }
+
+ def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{22} = 1; // 64-bit FPR flag
+ }
+
+ // Scaled
+ def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{22} = 0; // 32-bit FPR flag
+ }
+
+ def SWDri: BaseIntegerToFP<isUnsigned, GPR32, FPR64, fixedpoint32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{22} = 1; // 64-bit FPR flag
+ }
+
+ def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{22} = 0; // 32-bit FPR flag
+ }
+
+ def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{22} = 1; // 64-bit FPR flag
+ }
+}
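+
+// Illustration (not part of this patch): typical instantiations would be
+// (names assumed):
+//   defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
+//   defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
+// The isUnsigned bit selects the signed or unsigned convert encoding;
+// the scaled forms take an extra fixed-point scale operand.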
+
+//---
+// Unscaled integer <-> floating point conversion (i.e. FMOV)
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode,
+ RegisterClass srcType, RegisterClass dstType,
+ string asm>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "",
+ // We use COPY_TO_REGCLASS for these bitconvert operations.
+ // copyPhysReg() expands the resultant COPY instructions after
+ // regalloc is done. This gives greater freedom for the allocator
+ // and related passes (coalescing, copy propagation, et al.) to
+ // be more effective.
+ [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>,
+ Sched<[WriteFCopy]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{30-23} = 0b00111100;
+ let Inst{21} = 1;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ let Inst{15-10} = 0b000000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
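+
+// Sketch of the COPY_TO_REGCLASS approach mentioned above (illustration
+// only, not added by this patch): instead of giving the class a pattern,
+// the int<->fp bitconverts are matched by standalone Pat<>s along the
+// lines of
+//   def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
+//             (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
+// so the register allocator sees a plain COPY that copyPhysReg() expands
+// (typically to an fmov) only where a cross-bank copy is really needed.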
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversionToHigh<bits<2> rmode, bits<3> opcode,
+ RegisterClass srcType, RegisterOperand dstType, string asm>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd[1], $Rn", "", []>,
+ Sched<[WriteFCopy]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{30-23} = 0b00111101;
+ let Inst{21} = 1;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ let Inst{15-10} = 0b000000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode,
+ RegisterOperand srcType, RegisterClass dstType, string asm>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn[1]", "", []>,
+ Sched<[WriteFCopy]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{30-23} = 0b00111101;
+ let Inst{21} = 1;
+ let Inst{20-19} = rmode;
+ let Inst{18-16} = opcode;
+ let Inst{15-10} = 0b000000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+
+
+multiclass UnscaledConversion<string asm> {
+ def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{22} = 0; // 32-bit FPR flag
+ }
+
+ def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{22} = 1; // 64-bit FPR flag
+ }
+
+ def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{22} = 0; // 32-bit FPR flag
+ }
+
+ def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{22} = 1; // 64-bit FPR flag
+ }
+
+ def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128,
+ asm#".d"> {
+ let Inst{31} = 1;
+ let Inst{22} = 0;
+ }
+
+ def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64,
+ asm#".d"> {
+ let Inst{31} = 1;
+ let Inst{22} = 0;
+ }
+
+ def : InstAlias<asm#"$Vd.d[1], $Rn",
+ (!cast<Instruction>(NAME#XDHighr) V128:$Vd, GPR64:$Rn), 0>;
+ def : InstAlias<asm#"$Rd, $Vn.d[1]",
+ (!cast<Instruction>(NAME#DXHighr) GPR64:$Rd, V128:$Vn), 0>;
+}
+
+//---
+// Floating point conversion
+//---
+
+class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
+ RegisterClass srcType, string asm, list<dag> pattern>
+ : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-24} = 0b00011110;
+ let Inst{23-22} = type;
+ let Inst{21-17} = 0b10001;
+ let Inst{16-15} = opcode;
+ let Inst{14-10} = 0b10000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass FPConversion<string asm> {
+ // Double-precision to Half-precision
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+ def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, []>;
+
+ // Double-precision to Single-precision
+ def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
+ [(set FPR32:$Rd, (fround FPR64:$Rn))]>;
+
+ // Half-precision to Double-precision
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+ def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, []>;
+
+ // Half-precision to Single-precision
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+ def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, []>;
+
+ // Single-precision to Double-precision
+ def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
+ [(set FPR64:$Rd, (fextend FPR32:$Rn))]>;
+
+ // Single-precision to Half-precision
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+ def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, []>;
+}
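+
+// Illustration (not part of this patch): a single instantiation such as
+//   defm FCVT : FPConversion<"fcvt">;
+// (name assumed) produces all six precision-conversion variants; only the
+// single<->double forms carry ISel patterns (fround/fextend), while the
+// half-precision forms are assembler/disassembler-only here.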
+
+//---
+// Single operand floating point data processing
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype,
+ ValueType vt, string asm, SDPatternOperator node>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
+ [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>,
+ Sched<[WriteF]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-23} = 0b000111100;
+ let Inst{21-19} = 0b100;
+ let Inst{18-15} = opcode;
+ let Inst{14-10} = 0b10000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SingleOperandFPData<bits<4> opcode, string asm,
+ SDPatternOperator node = null_frag> {
+ def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> {
+ let Inst{22} = 0; // 32-bit size flag
+ }
+
+ def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> {
+ let Inst{22} = 1; // 64-bit size flag
+ }
+}
+
+//---
+// Two operand floating point data processing
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+ asm, "\t$Rd, $Rn, $Rm", "", pat>,
+ Sched<[WriteF]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-23} = 0b000111100;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass TwoOperandFPData<bits<4> opcode, string asm,
+ SDPatternOperator node = null_frag> {
+ def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
+ [(set (f32 FPR32:$Rd),
+ (node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> {
+ let Inst{22} = 0; // 32-bit size flag
+ }
+
+ def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
+ [(set (f64 FPR64:$Rd),
+ (node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> {
+ let Inst{22} = 1; // 64-bit size flag
+ }
+}
+
+multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
+ def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
+ [(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> {
+ let Inst{22} = 0; // 32-bit size flag
+ }
+
+ def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
+ [(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> {
+ let Inst{22} = 1; // 64-bit size flag
+ }
+}
+
+
+//---
+// Three operand floating point data processing
+//---
+
+class BaseThreeOperandFPData<bit isNegated, bit isSub,
+ RegisterClass regtype, string asm, list<dag> pat>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra),
+ asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>,
+ Sched<[WriteFMul]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<5> Ra;
+ let Inst{31-23} = 0b000111110;
+ let Inst{21} = isNegated;
+ let Inst{20-16} = Rm;
+ let Inst{15} = isSub;
+ let Inst{14-10} = Ra;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
+ SDPatternOperator node> {
+ def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm,
+ [(set FPR32:$Rd,
+ (node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> {
+ let Inst{22} = 0; // 32-bit size flag
+ }
+
+ def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm,
+ [(set FPR64:$Rd,
+ (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
+ let Inst{22} = 1; // 64-bit size flag
+ }
+}
+
+//---
+// Floating point data comparisons
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseOneOperandFPComparison<bit signalAllNans,
+ RegisterClass regtype, string asm,
+ list<dag> pat>
+ : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>,
+ Sched<[WriteFCmp]> {
+ bits<5> Rn;
+ let Inst{31-23} = 0b000111100;
+ let Inst{21} = 1;
+
+ let Inst{20-16} = 0b00000;
+ let Inst{15-10} = 0b001000;
+ let Inst{9-5} = Rn;
+ let Inst{4} = signalAllNans;
+ let Inst{3-0} = 0b1000;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
+ string asm, list<dag> pat>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>,
+ Sched<[WriteFCmp]> {
+ bits<5> Rm;
+ bits<5> Rn;
+ let Inst{31-23} = 0b000111100;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-10} = 0b001000;
+ let Inst{9-5} = Rn;
+ let Inst{4} = signalAllNans;
+ let Inst{3-0} = 0b0000;
+}
+
+multiclass FPComparison<bit signalAllNans, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ let Defs = [CPSR] in {
+ def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm,
+ [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit CPSR)]> {
+ let Inst{22} = 0;
+ }
+
+ def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm,
+ [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit CPSR)]> {
+ let Inst{22} = 0;
+ }
+
+ def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm,
+ [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit CPSR)]> {
+ let Inst{22} = 1;
+ }
+
+ def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm,
+ [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit CPSR)]> {
+ let Inst{22} = 1;
+ }
+ } // Defs = [CPSR]
+}
+
+//---
+// Floating point conditional comparisons
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseFPCondComparison<bit signalAllNans,
+ RegisterClass regtype, string asm>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
+ asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+ Sched<[WriteFCmp]> {
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> nzcv;
+ bits<4> cond;
+
+ let Inst{31-23} = 0b000111100;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = 0b01;
+ let Inst{9-5} = Rn;
+ let Inst{4} = signalAllNans;
+ let Inst{3-0} = nzcv;
+}
+
+multiclass FPCondComparison<bit signalAllNans, string asm> {
+ let Defs = [CPSR], Uses = [CPSR] in {
+ def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> {
+ let Inst{22} = 0;
+ }
+
+ def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> {
+ let Inst{22} = 1;
+ }
+ } // Defs = [CPSR], Uses = [CPSR]
+}
+
+//---
+// Floating point conditional select
+//---
+
+class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+ asm, "\t$Rd, $Rn, $Rm, $cond", "",
+ [(set regtype:$Rd,
+ (ARM64csel (vt regtype:$Rn), regtype:$Rm,
+ (i32 imm:$cond), CPSR))]>,
+ Sched<[WriteF]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> cond;
+
+ let Inst{31-23} = 0b000111100;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = cond;
+ let Inst{11-10} = 0b11;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass FPCondSelect<string asm> {
+ let Uses = [CPSR] in {
+ def Srrr : BaseFPCondSelect<FPR32, f32, asm> {
+ let Inst{22} = 0;
+ }
+
+ def Drrr : BaseFPCondSelect<FPR64, f64, asm> {
+ let Inst{22} = 1;
+ }
+ } // Uses = [CPSR]
+}
+
+//---
+// Floating move immediate
+//---
+
+class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
+ : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "",
+ [(set regtype:$Rd, fpimmtype:$imm)]>,
+ Sched<[WriteFImm]> {
+ bits<5> Rd;
+ bits<8> imm;
+ let Inst{31-23} = 0b000111100;
+ let Inst{21} = 1;
+ let Inst{20-13} = imm;
+ let Inst{12-5} = 0b10000000;
+ let Inst{4-0} = Rd;
+}
+
+multiclass FPMoveImmediate<string asm> {
+ def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> {
+ let Inst{22} = 0;
+ }
+
+ def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> {
+ let Inst{22} = 1;
+ }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD
+//----------------------------------------------------------------------------
+
+def VectorIndexBOperand : AsmOperandClass { let Name = "VectorIndexB"; }
+def VectorIndexHOperand : AsmOperandClass { let Name = "VectorIndexH"; }
+def VectorIndexSOperand : AsmOperandClass { let Name = "VectorIndexS"; }
+def VectorIndexDOperand : AsmOperandClass { let Name = "VectorIndexD"; }
+def VectorIndexB : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 16;
+}]> {
+ let ParserMatchClass = VectorIndexBOperand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexH : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 8;
+}]> {
+ let ParserMatchClass = VectorIndexHOperand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexS : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 4;
+}]> {
+ let ParserMatchClass = VectorIndexSOperand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexD : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 2;
+}]> {
+ let ParserMatchClass = VectorIndexDOperand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i64imm);
+}
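+
+// For reference (not part of this patch): the ImmLeaf bounds above match
+// the lane counts of a 128-bit vector. For example, VectorIndexB accepts
+// lane numbers 0-15 of a .16b vector and VectorIndexD accepts 0-1 of a
+// .2d vector, so an index like b[15] is legal while b[16] is rejected.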
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register vector instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand regtype, string asm, string kind,
+ list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+ "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand regtype, string asm, string kind,
+ list<dag> pattern>
+ : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+ "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// All operand sizes distinguished in the encoding.
+multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+ def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128,
+ asm, ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+}
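+
+// Illustration (not part of this patch): a typical instantiation is
+//   defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
+// (name and opcode bits assumed), which expands to the .8b/.16b/.4h/.8h/
+// .2s/.4s/.2d variants with matching vector-typed patterns; the BHS and
+// B-only multiclasses below restrict the element sizes similarly.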
+
+// As above, but D sized elements unsupported.
+multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+ asm, ".8b",
+ [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>;
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+ asm, ".16b",
+ [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>;
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+ asm, ".4h",
+ [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+ asm, ".8h",
+ [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+ asm, ".2s",
+ [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+ asm, ".4s",
+ [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
+}
+
+multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+ def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$dst),
+ (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+ def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$dst),
+ (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// As above, but only B sized elements supported.
+multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$Rd),
+ (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+}
+
+// As above, but only S and D sized floating point elements supported.
+multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<5> opc,
+ string asm, SDPatternOperator OpNode> {
+ def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+ asm, ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+ def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+ asm, ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+ def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+ asm, ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<5> opc,
+ string asm,
+ SDPatternOperator OpNode> {
+ def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+ def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+ def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+ asm, ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc,
+ string asm, SDPatternOperator OpNode> {
+ def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64,
+ asm, ".2s",
+ [(set (v2f32 V64:$dst),
+ (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+ def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128,
+ asm, ".4s",
+ [(set (v4f32 V128:$dst),
+ (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+ def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128,
+ asm, ".2d",
+ [(set (v2f64 V128:$dst),
+ (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+// As above, but D and B sized elements unsupported.
+multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// Logical three vector ops share opcode bits, and only use B sized elements.
+multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>;
+ def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>;
+
+ def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
+ (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+ def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
+ (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+ def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)),
+ (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+
+ def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+ def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+ def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+}
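+
+// For illustration only: since bitwise ops ignore element size, only the
+// .8b/.16b instructions are defined and the Pat records above retarget every
+// other 64-/128-bit integer vector type onto them. A hypothetical
+// instantiation:
+//
+//   defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
+//
+// where the size field passed in (0b00 here) selects among the logical ops
+// that share the fixed 0b00011 opcode.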
+
+multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
+ string asm, SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64,
+ asm, ".8b",
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128,
+ asm, ".16b",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]>;
+
+ def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
+ (v4i16 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+ def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
+ (v2i32 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+ def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
+ (v1i64 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+
+ def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
+ (v8i16 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+ def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
+ (v4i32 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+ def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
+ (v2i64 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register vector instructions.
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand regtype, string asm, string dstkind,
+ string srckind, list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+ "{\t$Rd" # dstkind # ", $Rn" # srckind #
+ "|" # dstkind # "\t$Rd, $Rn}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand regtype, string asm, string dstkind,
+ string srckind, list<dag> pattern>
+ : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm,
+ "{\t$Rd" # dstkind # ", $Rn" # srckind #
+ "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Supports B, H, and S element sizes.
+multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
+ RegisterOperand regtype, string asm, string dstkind,
+ string srckind, string amount>
+ : I<(outs V128:$Rd), (ins regtype:$Rn), asm,
+ "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount #
+ "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-24} = 0b101110;
+ let Inst{23-22} = size;
+ let Inst{21-10} = 0b100001001110;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDVectorLShiftLongBySizeBHS {
+ let neverHasSideEffects = 1 in {
+ def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
+ "shll", ".8h", ".8b", "8">;
+ def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
+ "shll2", ".8h", ".16b", "8">;
+ def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64,
+ "shll", ".4s", ".4h", "16">;
+ def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128,
+ "shll2", ".4s", ".8h", "16">;
+ def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64,
+ "shll", ".2d", ".2s", "32">;
+ def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128,
+ "shll2", ".2d", ".4s", "32">;
+ }
+}
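+
+// For illustration only: these are the shll/shll2 forms whose shift amount is
+// fixed at the source element width (8, 16 or 32), which is why the amount is
+// baked into the asm string rather than taken as an operand. The expected use
+// is simply
+//
+//   defm SHLL : SIMDVectorLShiftLongBySizeBHS;
+//
+// (the defm name is assumed here for illustration).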
+
+// Supports all element sizes.
+multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ asm, ".4h", ".8b",
+ [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+ def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ asm, ".8h", ".16b",
+ [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+ def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ asm, ".2s", ".4h",
+ [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+ def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ asm, ".4s", ".8h",
+ [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+ asm, ".1d", ".2s",
+ [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+ asm, ".2d", ".4s",
+ [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
+ asm, ".4h", ".8b",
+ [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd),
+ (v8i8 V64:$Rn)))]>;
+ def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
+ asm, ".8h", ".16b",
+ [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd),
+ (v16i8 V128:$Rn)))]>;
+ def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
+ asm, ".2s", ".4h",
+ [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd),
+ (v4i16 V64:$Rn)))]>;
+ def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
+ asm, ".4s", ".8h",
+ [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
+ (v8i16 V128:$Rn)))]>;
+ def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
+ asm, ".1d", ".2s",
+ [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd),
+ (v2i32 V64:$Rn)))]>;
+ def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
+ asm, ".2d", ".4s",
+ [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd),
+ (v4i32 V128:$Rn)))]>;
+}
+
+// Supports all element sizes, except 1xD.
+multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>;
+ def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
+ def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>;
+ def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>;
+ def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>;
+ def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
+ def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>;
+}
+
+multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+ def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+}
+
+
+// Supports only B element sizes.
+multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+ def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+}
+
+// Supports only B and H element sizes.
+multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
+}
+
+// Supports only S and D element sizes; the high bit of the size field is used
+// as an extra opcode bit.
+multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+ asm, ".2d", ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+}
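+
+// For illustration only: the {S,0}/{S,1} arguments use TableGen bit-list
+// syntax, so S lands in Inst{23} and the low bit in Inst{22}. Inst{22}
+// therefore picks single vs. double precision while Inst{23} acts as the
+// extra opcode bit. For example (placeholder bits, not a verified encoding):
+//
+//   defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
+//
+// would give FNEGv2f32/FNEGv4f32 with size = 0b10 and FNEGv2f64 with
+// size = 0b11.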
+
+// Supports only S element size.
+multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+
+multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+}
+
+multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ asm, ".2s", ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ asm, ".4s", ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+ asm, ".2d", ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+}
+
+
+class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand inreg, RegisterOperand outreg,
+ string asm, string outkind, string inkind,
+ list<dag> pattern>
+ : I<(outs outreg:$Rd), (ins inreg:$Rn), asm,
+ "{\t$Rd" # outkind # ", $Rn" # inkind #
+ "|" # outkind # "\t$Rd, $Rn}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class BaseSIMDMixedTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand inreg, RegisterOperand outreg,
+ string asm, string outkind, string inkind,
+ list<dag> pattern>
+ : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm,
+ "{\t$Rd" # outkind # ", $Rn" # inkind #
+ "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64,
+ asm, ".8b", ".8h",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128,
+ asm#"2", ".16b", ".8h", []>;
+ def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64,
+ asm, ".4h", ".4s",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+ def v8i16 : BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128,
+ asm#"2", ".8h", ".4s", []>;
+ def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64,
+ asm, ".2s", ".2d",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+ def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128,
+ asm#"2", ".4s", ".2d", []>;
+
+ def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))),
+ (!cast<Instruction>(NAME # "v16i8")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+ def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))),
+ (!cast<Instruction>(NAME # "v8i16")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+ def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))),
+ (!cast<Instruction>(NAME # "v4i32")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+}
+
+class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand regtype, string asm, string kind,
+ ValueType dty, ValueType sty, SDNode OpNode>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", #0" #
+ "|" # kind # "\t$Rd, $Rn, #0}", "",
+ [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// Comparisons support all element sizes, except 1xD.
+multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm,
+ SDNode OpNode> {
+ def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64,
+ asm, ".8b",
+ v8i8, v8i8, OpNode>;
+ def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128,
+ asm, ".16b",
+ v16i8, v16i8, OpNode>;
+ def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64,
+ asm, ".4h",
+ v4i16, v4i16, OpNode>;
+ def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128,
+ asm, ".8h",
+ v8i16, v8i16, OpNode>;
+ def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64,
+ asm, ".2s",
+ v2i32, v2i32, OpNode>;
+ def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128,
+ asm, ".4s",
+ v4i32, v4i32, OpNode>;
+ def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128,
+ asm, ".2d",
+ v2i64, v2i64, OpNode>;
+}
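+
+// For illustration only: the "rz" suffix marks these as the
+// compare-against-zero forms; the "#0" is part of the asm string and the
+// comparison node itself encodes the zero, so the pattern takes a single
+// source operand. A hypothetical instantiation, assuming a compare-to-zero
+// target node and placeholder opcode bits:
+//
+//   defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", ARM64cmeqz>;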
+
+// FP Comparisons support only S and D element sizes.
+multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
+ string asm, SDNode OpNode> {
+ def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64,
+ asm, ".2s",
+ v2i32, v2f32, OpNode>;
+ def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128,
+ asm, ".4s",
+ v4i32, v4f32, OpNode>;
+ def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128,
+ asm, ".2d",
+ v2i64, v2f64, OpNode>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand outtype, RegisterOperand intype,
+ string asm, string VdTy, string VnTy,
+ list<dag> pattern>
+ : I<(outs outtype:$Rd), (ins intype:$Rn), asm,
+ !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand outtype, RegisterOperand intype,
+ string asm, string VdTy, string VnTy,
+ list<dag> pattern>
+ : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm,
+ !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDFPWidenTwoVector<bit U, bit S, bits<5> opc, string asm> {
+ def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64,
+ asm, ".4s", ".4h", []>;
+ def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128,
+ asm#"2", ".4s", ".8h", []>;
+ def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64,
+ asm, ".2d", ".2s", []>;
+ def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128,
+ asm#"2", ".2d", ".4s", []>;
+}
+
+multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> {
+ def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128,
+ asm, ".4h", ".4s", []>;
+ def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128,
+ asm#"2", ".8h", ".4s", []>;
+ def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
+ asm, ".2s", ".2d", []>;
+ def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
+ asm#"2", ".4s", ".2d", []>;
+}
+
+multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm,
+ Intrinsic OpNode> {
+ def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
+ asm, ".2s", ".2d",
+ [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+ def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
+ asm#"2", ".4s", ".2d", []>;
+
+ def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))),
+ (!cast<Instruction>(NAME # "v4f32")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register different-size vector instructions.
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDDifferentThreeVector<bit U, bits<3> size, bits<4> opcode,
+ RegisterOperand outtype, RegisterOperand intype1,
+ RegisterOperand intype2, string asm,
+ string outkind, string inkind1, string inkind2,
+ list<dag> pattern>
+ : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm,
+ "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
+ "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = size{0};
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size{2-1};
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDDifferentThreeVectorTied<bit U, bits<3> size, bits<4> opcode,
+ RegisterOperand outtype, RegisterOperand intype1,
+ RegisterOperand intype2, string asm,
+ string outkind, string inkind1, string inkind2,
+ list<dag> pattern>
+ : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm,
+ "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
+ "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = size{0};
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size{2-1};
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+// FIXME: TableGen doesn't know how to deal with expanded types that also
+// change the element count (in this case, placing the results in
+// the high elements of the result register rather than the low
+// elements). Until that's fixed, we can't code-gen those.
+multiclass SIMDNarrowThreeVectorBHS<bit U, bits<4> opc, string asm,
+ Intrinsic IntOp> {
+ def v8i16_v8i8 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V64, V128, V128,
+ asm, ".8b", ".8h", ".8h",
+ [(set (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+ def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".16b", ".8h", ".8h",
+ []>;
+ def v4i32_v4i16 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V64, V128, V128,
+ asm, ".4h", ".4s", ".4s",
+ [(set (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+ def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".4s", ".4s",
+ []>;
+ def v2i64_v2i32 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V64, V128, V128,
+ asm, ".2s", ".2d", ".2d",
+ [(set (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+ def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".2d", ".2d",
+ []>;
+
+ // Patterns for the '2' variants involve INSERT_SUBREG, which can't be used
+ // in a pattern attached directly to an instruction definition, so they are
+ // written as standalone Pat records instead.
+ def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn),
+ (v8i16 V128:$Rm))),
+ (!cast<Instruction>(NAME # "v8i16_v16i8")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+ def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn),
+ (v4i32 V128:$Rm))),
+ (!cast<Instruction>(NAME # "v4i32_v8i16")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+ def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn),
+ (v2i64 V128:$Rm))),
+ (!cast<Instruction>(NAME # "v2i64_v4i32")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+}
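+
+// For illustration only: the standalone Pat records above match source DAGs of
+// the shape
+//
+//   (concat_vectors (v8i8 Vlow), (IntOp (v8i16 Vn), (v8i16 Vm)))
+//
+// i.e. "keep the existing low 64 bits and write the narrowed result into the
+// high 64 bits", which is what the "2" (second-half) instructions do. The
+// INSERT_SUBREG in the result first places Vlow into the dsub subregister of
+// an otherwise undefined 128-bit register so the tied instruction can fill in
+// the top half.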
+
+multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
+ Intrinsic IntOp> {
+ def v8i8 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$Rd), (IntOp (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b", []>;
+ def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc,
+ V128, V64, V64,
+ asm, ".1q", ".1d", ".1d", []>;
+ def v2i64 : BaseSIMDDifferentThreeVector<U, 0b111, opc,
+ V128, V128, V128,
+ asm#"2", ".1q", ".2d", ".2d", []>;
+
+ def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)),
+ (v8i8 (extract_high_v16i8 V128:$Rm)))),
+ (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
+}
+
+multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$Rd),
+ (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$Rd),
+ (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm)))))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$Rd),
+ (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$Rd),
+ (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$Rd),
+ (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$Rd),
+ (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
+ string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$dst),
+ (add (v8i16 V128:$Rd),
+ (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$dst),
+ (add (v8i16 V128:$Rd),
+ (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm))))))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$dst),
+ (add (v4i32 V128:$Rd),
+ (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$dst),
+ (add (v4i32 V128:$Rd),
+ (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm))))))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$dst),
+ (add (v2i64 V128:$Rd),
+ (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$dst),
+ (add (v2i64 V128:$Rd),
+ (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm))))))]>;
+}
+
+multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm)))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
+ string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
+ V128, V64, V64,
+ asm, ".8h", ".8b", ".8b",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".16b", ".16b",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd),
+ (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm)))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd),
+ (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd),
+ (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator Accum> {
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+ V128, V64, V64,
+ asm, ".4s", ".4h", ".4h",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_arm64_neon_sqdmull (v4i16 V64:$Rn),
+ (v4i16 V64:$Rm)))))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".8h", ".8h",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_arm64_neon_sqdmull (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+ V128, V64, V64,
+ asm, ".2d", ".2s", ".2s",
+ [(set (v2i64 V128:$dst),
+ (Accum (v2i64 V128:$Rd),
+ (v2i64 (int_arm64_neon_sqdmull (v2i32 V64:$Rn),
+ (v2i32 V64:$Rm)))))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".4s", ".4s",
+ [(set (v2i64 V128:$dst),
+ (Accum (v2i64 V128:$Rd),
+ (v2i64 (int_arm64_neon_sqdmull (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))))]>;
+}
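+
+// For illustration only: the Accum operator abstracts the accumulate step, so
+// one multiclass covers both the saturating multiply-accumulate and
+// multiply-subtract forms, roughly:
+//
+//   defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
+//                                                  int_arm64_neon_sqadd>;
+//   defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
+//                                                  int_arm64_neon_sqsub>;
+//
+// (opcode bits are placeholders and the sqadd/sqsub intrinsic names are
+// assumed by analogy with the sqdmull intrinsic used above).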
+
+multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+ V128, V128, V64,
+ asm, ".8h", ".8h", ".8b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+ V128, V128, V128,
+ asm#"2", ".8h", ".8h", ".16b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm)))]>;
+ def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+ V128, V128, V64,
+ asm, ".4s", ".4s", ".4h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i16 V64:$Rm)))]>;
+ def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+ V128, V128, V128,
+ asm#"2", ".4s", ".4s", ".8h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm)))]>;
+ def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+ V128, V128, V64,
+ asm, ".2d", ".2d", ".2s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i32 V64:$Rm)))]>;
+ def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+ V128, V128, V128,
+ asm#"2", ".2d", ".2d", ".4s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD bitwise extract from vector
+//----------------------------------------------------------------------------
+
+class BaseSIMDBitwiseExtract<bit size, RegisterOperand regtype, ValueType vty,
+ string asm, string kind>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" #
+ "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "",
+ [(set (vty regtype:$Rd),
+ (ARM64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ bits<4> imm;
+ let Inst{31} = 0;
+ let Inst{30} = size;
+ let Inst{29-21} = 0b101110000;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0;
+ let Inst{14-11} = imm;
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+
+multiclass SIMDBitwiseExtract<string asm> {
+ def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b">;
+ def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD zip vector
+//----------------------------------------------------------------------------
+
+class BaseSIMDZipVector<bits<3> size, bits<3> opc, RegisterOperand regtype,
+ string asm, string kind, SDNode OpNode, ValueType valty>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+ "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+ "|" # kind # "\t$Rd, $Rn, $Rm}", "",
+ [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31} = 0;
+ let Inst{30} = size{0};
+ let Inst{29-24} = 0b001110;
+ let Inst{23-22} = size{2-1};
+ let Inst{21} = 0;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0;
+ let Inst{14-12} = opc;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDZipVector<bits<3> opc, string asm,
+ SDNode OpNode> {
+ def v8i8 : BaseSIMDZipVector<0b000, opc, V64,
+ asm, ".8b", OpNode, v8i8>;
+ def v16i8 : BaseSIMDZipVector<0b001, opc, V128,
+ asm, ".16b", OpNode, v16i8>;
+ def v4i16 : BaseSIMDZipVector<0b010, opc, V64,
+ asm, ".4h", OpNode, v4i16>;
+ def v8i16 : BaseSIMDZipVector<0b011, opc, V128,
+ asm, ".8h", OpNode, v8i16>;
+ def v2i32 : BaseSIMDZipVector<0b100, opc, V64,
+ asm, ".2s", OpNode, v2i32>;
+ def v4i32 : BaseSIMDZipVector<0b101, opc, V128,
+ asm, ".4s", OpNode, v4i32>;
+ def v2i64 : BaseSIMDZipVector<0b111, opc, V128,
+ asm, ".2d", OpNode, v2i64>;
+
+ def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
+ (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
+ def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
+ (!cast<Instruction>(NAME#"v4i32") V128:$Rn, V128:$Rm)>;
+ def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)),
+ (!cast<Instruction>(NAME#"v2i64") V128:$Rn, V128:$Rm)>;
+}
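+
+// For illustration only: the trailing Pat records let FP vector types reuse
+// the integer zip/unzip/transpose instructions, since these are pure lane
+// permutes. Note there is no 0b110 (.1d) entry: a one-lane 64-bit vector has
+// nothing to permute. A hypothetical instantiation (opcode and node name
+// assumed):
+//
+//   defm ZIP1 : SIMDZipVector<0b011, "zip1", ARM64zip1>;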
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register scalar instructions
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
+ RegisterClass regtype, string asm,
+ list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+ "\t$Rd, $Rn, $Rm", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
+ [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
+}
+
+multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
+ [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
+ def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, []>;
+ def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
+ def v1i8 : BaseSIMDThreeScalar<U, 0b00, opc, FPR8 , asm, []>;
+
+ def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
+ def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
+ (!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
+}
+
+multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm,
+ [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+ def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
+}
+
+multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
+ [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
+ def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
+ [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+ }
+
+ def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
+}
+
+multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
+ [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
+ def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
+ }
+
+ def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
+}
+
+class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode,
+ dag oops, dag iops, string asm, string cstr, list<dag> pat>
+ : I<oops, iops, asm,
+ "\t$Rd, $Rn, $Rm", cstr, pat>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21} = 1;
+ let Inst{20-16} = Rm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
+ (outs FPR32:$Rd),
+ (ins FPR16:$Rn, FPR16:$Rm), asm, "", []>;
+ def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc,
+ (outs FPR64:$Rd),
+ (ins FPR32:$Rn, FPR32:$Rm), asm, "",
+ [(set (i64 FPR64:$Rd), (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
+ (outs FPR32:$dst),
+ (ins FPR32:$Rd, FPR16:$Rn, FPR16:$Rm),
+ asm, "$Rd = $dst", []>;
+ def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc,
+ (outs FPR64:$dst),
+ (ins FPR64:$Rd, FPR32:$Rn, FPR32:$Rm),
+ asm, "$Rd = $dst",
+ [(set (i64 FPR64:$dst),
+ (OpNode (i64 FPR64:$Rd), (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register scalar instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode,
+ RegisterClass regtype, RegisterClass regtype2,
+ string asm, list<dag> pat>
+ : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
+ "\t$Rd, $Rn", "", pat>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode,
+ RegisterClass regtype, RegisterClass regtype2,
+ string asm, list<dag> pat>
+ : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm,
+ "\t$Rd, $Rn", "$Rd = $dst", pat>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode,
+ RegisterClass regtype, string asm>
+ : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+ "\t$Rd, $Rn, #0", "", []>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
+ : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
+ [(set (f32 FPR32:$Rd), (int_arm64_sisd_fcvtxn (f64 FPR64:$Rn)))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-17} = 0b011111100110000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, opc, FPR64, asm>;
+
+ def : Pat<(v1i64 (OpNode FPR64:$Rn)),
+ (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
+}
+
+multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, opc, FPR64, asm>;
+ def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, opc, FPR32, asm>;
+
+ def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))),
+ (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
+ [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>;
+
+ def : Pat<(i64 (OpNode (i64 FPR64:$Rn))),
+ (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarSD<bit U, bit S, bits<5> opc, string asm> {
+ def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,[]>;
+ def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,[]>;
+}
+
+multiclass SIMDTwoScalarCVTSD<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,
+ [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
+ def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,
+ [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
+}
+
+multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
+ [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
+ def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR32, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
+ def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR16, asm, []>;
+ def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))),
+ (!cast<Instruction>(NAME # v1i64) FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm,
+ Intrinsic OpNode> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def v1i64 : BaseSIMDTwoScalarTied<U, 0b11, opc, FPR64, FPR64, asm,
+ [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn)))]>;
+ def v1i32 : BaseSIMDTwoScalarTied<U, 0b10, opc, FPR32, FPR32, asm,
+ [(set (i32 FPR32:$dst), (OpNode (i32 FPR32:$Rd), (i32 FPR32:$Rn)))]>;
+ def v1i16 : BaseSIMDTwoScalarTied<U, 0b01, opc, FPR16, FPR16, asm, []>;
+ def v1i8 : BaseSIMDTwoScalarTied<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))),
+ (!cast<Instruction>(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR64, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
+ def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR32, asm, []>;
+ def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR16, asm, []>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar pairwise instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDPairwiseScalar<bit U, bits<2> size, bits<5> opcode,
+ RegisterOperand regtype, RegisterOperand vectype,
+ string asm, string kind>
+ : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
+ "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b11000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> {
+ def v2i64p : BaseSIMDPairwiseScalar<U, 0b11, opc, FPR64Op, V128,
+ asm, ".2d">;
+}
+
+multiclass SIMDPairwiseScalarSD<bit U, bit S, bits<5> opc, string asm> {
+ def v2i32p : BaseSIMDPairwiseScalar<U, {S,0}, opc, FPR32Op, V64,
+ asm, ".2s">;
+ def v2i64p : BaseSIMDPairwiseScalar<U, {S,1}, opc, FPR64Op, V128,
+ asm, ".2d">;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD across lanes instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDAcrossLanes<bit Q, bit U, bits<2> size, bits<5> opcode,
+ RegisterClass regtype, RegisterOperand vectype,
+ string asm, string kind, list<dag> pattern>
+ : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
+ "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b11000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDAcrossLanesBHS<bit U, bits<5> opcode,
+ string asm> {
+ def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64,
+ asm, ".8b", []>;
+ def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128,
+ asm, ".16b", []>;
+ def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64,
+ asm, ".4h", []>;
+ def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128,
+ asm, ".8h", []>;
+ def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128,
+ asm, ".4s", []>;
+}
+
+multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> {
+ def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64,
+ asm, ".8b", []>;
+ def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128,
+ asm, ".16b", []>;
+ def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64,
+ asm, ".4h", []>;
+ def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128,
+ asm, ".8h", []>;
+ def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128,
+ asm, ".4s", []>;
+}
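+
+// For illustration only: in the HSD variant the scalar destination is one step
+// wider than the source elements (FPR16 for .8b/.16b, FPR32 for .4h/.8h,
+// FPR64 for .4s), matching the widening "long" across-lanes reductions. A
+// hypothetical instantiation (placeholder bits):
+//
+//   defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;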
+
+multiclass SIMDAcrossLanesS<bits<5> opcode, bit sz1, string asm,
+ Intrinsic intOp> {
+ def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
+ asm, ".4s",
+ [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD INS/DUP instructions
+//----------------------------------------------------------------------------
+
+// FIXME: There has got to be a better way to factor these. ugh.
+
+class BaseSIMDInsDup<bit Q, bit op, dag outs, dag ins, string asm,
+ string operands, string constraints, list<dag> pattern>
+ : I<outs, ins, asm, operands, constraints, pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = op;
+ let Inst{28-21} = 0b01110000;
+ let Inst{15} = 0;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class SIMDDupFromMain<bit Q, bits<5> imm5, string size, ValueType vectype,
+ RegisterOperand vecreg, RegisterClass regtype>
+ : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins regtype:$Rn), "dup",
+ "{\t$Rd" # size # ", $Rn" #
+ "|" # size # "\t$Rd, $Rn}", "",
+ [(set (vectype vecreg:$Rd), (ARM64dup regtype:$Rn))]> {
+ let Inst{20-16} = imm5;
+ let Inst{14-11} = 0b0001;
+}
+
+class SIMDDupFromElement<bit Q, string dstkind, string srckind,
+ ValueType vectype, ValueType insreg,
+ RegisterOperand vecreg, Operand idxtype,
+ ValueType elttype, SDNode OpNode>
+ : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins V128:$Rn, idxtype:$idx), "dup",
+ "{\t$Rd" # dstkind # ", $Rn" # srckind # "$idx" #
+ "|" # dstkind # "\t$Rd, $Rn$idx}", "",
+ [(set (vectype vecreg:$Rd),
+ (OpNode (insreg V128:$Rn), idxtype:$idx))]> {
+ let Inst{14-11} = 0b0000;
+}
+
+class SIMDDup64FromElement
+ : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128,
+ VectorIndexD, i64, ARM64duplane64> {
+ bits<1> idx;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+}
+
+class SIMDDup32FromElement<bit Q, string size, ValueType vectype,
+ RegisterOperand vecreg>
+ : SIMDDupFromElement<Q, size, ".s", vectype, v4i32, vecreg,
+ VectorIndexS, i64, ARM64duplane32> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+}
+
+class SIMDDup16FromElement<bit Q, string size, ValueType vectype,
+ RegisterOperand vecreg>
+ : SIMDDupFromElement<Q, size, ".h", vectype, v8i16, vecreg,
+ VectorIndexH, i64, ARM64duplane16> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+}
+
+class SIMDDup8FromElement<bit Q, string size, ValueType vectype,
+ RegisterOperand vecreg>
+ : SIMDDupFromElement<Q, size, ".b", vectype, v16i8, vecreg,
+ VectorIndexB, i64, ARM64duplane8> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+}
+
+class BaseSIMDMov<bit Q, string size, bits<4> imm4, RegisterClass regtype,
+ Operand idxtype, string asm, list<dag> pattern>
+ : BaseSIMDInsDup<Q, 0, (outs regtype:$Rd), (ins V128:$Rn, idxtype:$idx), asm,
+ "{\t$Rd, $Rn" # size # "$idx" #
+ "|" # size # "\t$Rd, $Rn$idx}", "", pattern> {
+ let Inst{14-11} = imm4;
+}
+
+class SIMDSMov<bit Q, string size, RegisterClass regtype,
+ Operand idxtype>
+ : BaseSIMDMov<Q, size, 0b0101, regtype, idxtype, "smov", []>;
+class SIMDUMov<bit Q, string size, ValueType vectype, RegisterClass regtype,
+ Operand idxtype>
+ : BaseSIMDMov<Q, size, 0b0111, regtype, idxtype, "umov",
+ [(set regtype:$Rd, (vector_extract (vectype V128:$Rn), idxtype:$idx))]>;
+
+class SIMDMovAlias<string asm, string size, Instruction inst,
+ RegisterClass regtype, Operand idxtype>
+ : InstAlias<asm#"{\t$dst, $src"#size#"$idx" #
+ "|" # size # "\t$dst, $src$idx}",
+ (inst regtype:$dst, V128:$src, idxtype:$idx)>;
+
+multiclass SMov {
+ def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ }
+}
+
+multiclass UMov {
+ def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ }
+ def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> {
+ bits<1> idx;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+ }
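+  // For the 32- and 64-bit forms "mov" is the architectural alias of "umov",
+  // e.g. "mov w0, v0.s[1]" is equivalent to "umov w0, v0.s[1]".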
+ def : SIMDMovAlias<"mov", ".s",
+ !cast<Instruction>(NAME#"vi32"),
+ GPR32, VectorIndexS>;
+ def : SIMDMovAlias<"mov", ".d",
+ !cast<Instruction>(NAME#"vi64"),
+ GPR64, VectorIndexD>;
+}
+
+class SIMDInsFromMain<string size, ValueType vectype,
+ RegisterClass regtype, Operand idxtype>
+ : BaseSIMDInsDup<1, 0, (outs V128:$dst),
+ (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins",
+ "{\t$Rd" # size # "$idx, $Rn" #
+ "|" # size # "\t$Rd$idx, $Rn}",
+ "$Rd = $dst",
+ [(set V128:$dst,
+ (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> {
+ let Inst{14-11} = 0b0011;
+}
+
+class SIMDInsFromElement<string size, ValueType vectype,
+ ValueType elttype, Operand idxtype>
+ : BaseSIMDInsDup<1, 1, (outs V128:$dst),
+ (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins",
+ "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" #
+ "|" # size # "\t$Rd$idx, $Rn$idx2}",
+ "$Rd = $dst",
+ [(set V128:$dst,
+ (vector_insert
+ (vectype V128:$Rd),
+ (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)),
+ idxtype:$idx))]>;
+
+class SIMDInsMainMovAlias<string size, Instruction inst,
+ RegisterClass regtype, Operand idxtype>
+ : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" #
+ "|" # size #"\t$dst$idx, $src}",
+ (inst V128:$dst, idxtype:$idx, regtype:$src)>;
+class SIMDInsElementMovAlias<string size, Instruction inst,
+ Operand idxtype>
+ : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" #
+ # "|" # size #" $dst$idx, $src$idx2}",
+ (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>;
+
+
+multiclass SIMDIns {
+ def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ }
+ def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> {
+ bits<1> idx;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+ }
+
+ def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> {
+ bits<4> idx;
+ bits<4> idx2;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ let Inst{14-11} = idx2;
+ }
+ def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> {
+ bits<3> idx;
+ bits<3> idx2;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ let Inst{14-12} = idx2;
+ let Inst{11} = 0;
+ }
+ def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> {
+ bits<2> idx;
+ bits<2> idx2;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ let Inst{14-13} = idx2;
+ let Inst{12-11} = 0;
+ }
+ def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> {
+ bits<1> idx;
+ bits<1> idx2;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+ let Inst{14} = idx2;
+ let Inst{13-11} = 0;
+ }
+
+ // For all forms of the INS instruction, the "mov" mnemonic is the
+ // preferred alias. Why they didn't just call the instruction "mov" in
+ // the first place is a very good question indeed...
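+  // For example, "ins v0.s[1], w1" and "mov v0.s[1], w1" assemble to the
+  // same encoding.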
+ def : SIMDInsMainMovAlias<".b", !cast<Instruction>(NAME#"vi8gpr"),
+ GPR32, VectorIndexB>;
+ def : SIMDInsMainMovAlias<".h", !cast<Instruction>(NAME#"vi16gpr"),
+ GPR32, VectorIndexH>;
+ def : SIMDInsMainMovAlias<".s", !cast<Instruction>(NAME#"vi32gpr"),
+ GPR32, VectorIndexS>;
+ def : SIMDInsMainMovAlias<".d", !cast<Instruction>(NAME#"vi64gpr"),
+ GPR64, VectorIndexD>;
+
+ def : SIMDInsElementMovAlias<".b", !cast<Instruction>(NAME#"vi8lane"),
+ VectorIndexB>;
+ def : SIMDInsElementMovAlias<".h", !cast<Instruction>(NAME#"vi16lane"),
+ VectorIndexH>;
+ def : SIMDInsElementMovAlias<".s", !cast<Instruction>(NAME#"vi32lane"),
+ VectorIndexS>;
+ def : SIMDInsElementMovAlias<".d", !cast<Instruction>(NAME#"vi64lane"),
+ VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookup<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+ RegisterOperand listtype, string asm, string kind>
+ : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm,
+ "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>,
+ Sched<[WriteV]> {
+ bits<5> Vd;
+ bits<5> Vn;
+ bits<5> Vm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-21} = 0b001110000;
+ let Inst{20-16} = Vm;
+ let Inst{15} = 0;
+ let Inst{14-13} = len;
+ let Inst{12} = op;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Vn;
+ let Inst{4-0} = Vd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookupTied<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+ RegisterOperand listtype, string asm, string kind>
+ : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm,
+ "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>,
+ Sched<[WriteV]> {
+ bits<5> Vd;
+ bits<5> Vn;
+ bits<5> Vm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-21} = 0b001110000;
+ let Inst{20-16} = Vm;
+ let Inst{15} = 0;
+ let Inst{14-13} = len;
+ let Inst{12} = op;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Vn;
+ let Inst{4-0} = Vd;
+}
+
+class SIMDTableLookupAlias<string asm, Instruction inst,
+ RegisterOperand vectype, RegisterOperand listtype>
+ : InstAlias<!strconcat(asm, "\t$dst, $lst, $index"),
+ (inst vectype:$dst, listtype:$lst, vectype:$index), 0>;
+
+multiclass SIMDTableLookup<bit op, string asm> {
+ def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b,
+ asm, ".8b">;
+ def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b,
+ asm, ".8b">;
+ def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b,
+ asm, ".8b">;
+ def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b,
+ asm, ".8b">;
+ def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b,
+ asm, ".16b">;
+ def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b,
+ asm, ".16b">;
+ def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b,
+ asm, ".16b">;
+ def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b,
+ asm, ".16b">;
+
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8One"),
+ V64, VecListOne128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Two"),
+ V64, VecListTwo128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Three"),
+ V64, VecListThree128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Four"),
+ V64, VecListFour128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8One"),
+ V128, VecListOne128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Two"),
+ V128, VecListTwo128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Three"),
+ V128, VecListThree128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Four"),
+ V128, VecListFour128>;
+}
+
+multiclass SIMDTableLookupTied<bit op, string asm> {
+ def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b,
+ asm, ".8b">;
+ def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, VecListTwo16b,
+ asm, ".8b">;
+ def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b,
+ asm, ".8b">;
+ def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b,
+ asm, ".8b">;
+ def v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b,
+ asm, ".16b">;
+ def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b,
+ asm, ".16b">;
+ def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b,
+ asm, ".16b">;
+ def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b,
+ asm, ".16b">;
+
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8One"),
+ V64, VecListOne128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Two"),
+ V64, VecListTwo128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Three"),
+ V64, VecListThree128>;
+ def : SIMDTableLookupAlias<asm # ".8b",
+ !cast<Instruction>(NAME#"v8i8Four"),
+ V64, VecListFour128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8One"),
+ V128, VecListOne128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Two"),
+ V128, VecListTwo128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Three"),
+ V128, VecListThree128>;
+ def : SIMDTableLookupAlias<asm # ".16b",
+ !cast<Instruction>(NAME#"v16i8Four"),
+ V128, VecListFour128>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY
+//----------------------------------------------------------------------------
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
+ string kind, Operand idxtype>
+ : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov",
+ "{\t$dst, $src" # kind # "$idx" #
+ "|\t$dst, $src$idx}", "", []>,
+ Sched<[WriteV]> {
+ bits<5> dst;
+ bits<5> src;
+ let Inst{31-21} = 0b01011110000;
+ let Inst{15-10} = 0b000001;
+ let Inst{9-5} = src;
+ let Inst{4-0} = dst;
+}
+
+class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
+ RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
+  : InstAlias<asm # "{\t$dst, $src" # size # "$index" #
+              "|\t$dst, $src$index}",
+ (inst regtype:$dst, vectype:$src, idxtype:$index)>;
+
+
+multiclass SIMDScalarCPY<string asm> {
+ def i8 : BaseSIMDScalarCPY<FPR8, V128, ".b", VectorIndexB> {
+ bits<4> idx;
+ let Inst{20-17} = idx;
+ let Inst{16} = 1;
+ }
+ def i16 : BaseSIMDScalarCPY<FPR16, V128, ".h", VectorIndexH> {
+ bits<3> idx;
+ let Inst{20-18} = idx;
+ let Inst{17-16} = 0b10;
+ }
+ def i32 : BaseSIMDScalarCPY<FPR32, V128, ".s", VectorIndexS> {
+ bits<2> idx;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = 0b100;
+ }
+ def i64 : BaseSIMDScalarCPY<FPR64, V128, ".d", VectorIndexD> {
+ bits<1> idx;
+ let Inst{20} = idx;
+ let Inst{19-16} = 0b1000;
+ }
+
+ // 'DUP' mnemonic aliases.
+ def : SIMDScalarCPYAlias<"dup", ".b",
+ !cast<Instruction>(NAME#"i8"),
+ FPR8, V128, VectorIndexB>;
+ def : SIMDScalarCPYAlias<"dup", ".h",
+ !cast<Instruction>(NAME#"i16"),
+ FPR16, V128, VectorIndexH>;
+ def : SIMDScalarCPYAlias<"dup", ".s",
+ !cast<Instruction>(NAME#"i32"),
+ FPR32, V128, VectorIndexS>;
+ def : SIMDScalarCPYAlias<"dup", ".d",
+ !cast<Instruction>(NAME#"i64"),
+ FPR64, V128, VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD modified immediate instructions
+//----------------------------------------------------------------------------
+
+class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops,
+ string asm, string op_string,
+ string cstr, list<dag> pattern>
+ : I<oops, iops, asm, op_string, cstr, pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<8> imm8;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = op;
+ let Inst{28-19} = 0b0111100000;
+ let Inst{18-16} = imm8{7-5};
+ let Inst{11-10} = 0b01;
+ let Inst{9-5} = imm8{4-0};
+ let Inst{4-0} = Rd;
+}
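+// The 8-bit immediate is split across the encoding: a:b:c land in Inst{18-16}
+// and d:e:f:g:h in Inst{9-5}, matching the abcdefgh layout of the ARMv8
+// AdvSIMD modified-immediate encodings.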
+
+class BaseSIMDModifiedImmVector<bit Q, bit op, RegisterOperand vectype,
+ Operand immtype, dag opt_shift_iop,
+ string opt_shift, string asm, string kind,
+ list<dag> pattern>
+ : BaseSIMDModifiedImm<Q, op, (outs vectype:$Rd),
+ !con((ins immtype:$imm8), opt_shift_iop), asm,
+ "{\t$Rd" # kind # ", $imm8" # opt_shift #
+ "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
+ "", pattern> {
+ let DecoderMethod = "DecodeModImmInstruction";
+}
+
+class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
+ Operand immtype, dag opt_shift_iop,
+ string opt_shift, string asm, string kind,
+ list<dag> pattern>
+ : BaseSIMDModifiedImm<Q, op, (outs vectype:$dst),
+ !con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop),
+ asm, "{\t$Rd" # kind # ", $imm8" # opt_shift #
+ "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
+ "$Rd = $dst", pattern> {
+ let DecoderMethod = "DecodeModImmTiedInstruction";
+}
+
+class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+ (ins logical_vec_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<2> shift;
+ let Inst{15} = b15_b12{1};
+ let Inst{14-13} = shift;
+ let Inst{12} = b15_b12{0};
+}
+
+class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
+ (ins logical_vec_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<2> shift;
+ let Inst{15} = b15_b12{1};
+ let Inst{14-13} = shift;
+ let Inst{12} = b15_b12{0};
+}
+
+
+class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+ (ins logical_vec_hw_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<2> shift;
+ let Inst{15} = b15_b12{1};
+ let Inst{14} = 0;
+ let Inst{13} = shift{0};
+ let Inst{12} = b15_b12{0};
+}
+
+class BaseSIMDModifiedImmVectorShiftHalfTied<bit Q, bit op, bits<2> b15_b12,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
+ (ins logical_vec_hw_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<2> shift;
+ let Inst{15} = b15_b12{1};
+ let Inst{14} = 0;
+ let Inst{13} = shift{0};
+ let Inst{12} = b15_b12{0};
+}
+
+multiclass SIMDModifiedImmVectorShift<bit op, bits<2> hw_cmode, bits<2> w_cmode,
+ string asm> {
+ def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64,
+ asm, ".4h", []>;
+ def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128,
+ asm, ".8h", []>;
+
+ def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64,
+ asm, ".2s", []>;
+ def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128,
+ asm, ".4s", []>;
+}
+
+multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode,
+ bits<2> w_cmode, string asm,
+ SDNode OpNode> {
+ def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$dst), (OpNode V64:$Rd,
+ imm0_255:$imm8,
+ (i32 imm:$shift)))]>;
+ def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$dst), (OpNode V128:$Rd,
+ imm0_255:$imm8,
+ (i32 imm:$shift)))]>;
+
+ def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64,
+ asm, ".2s",
+ [(set (v2i32 V64:$dst), (OpNode V64:$Rd,
+ imm0_255:$imm8,
+ (i32 imm:$shift)))]>;
+ def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128,
+ asm, ".4s",
+ [(set (v4i32 V128:$dst), (OpNode V128:$Rd,
+ imm0_255:$imm8,
+ (i32 imm:$shift)))]>;
+}
+
+class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
+ RegisterOperand vectype, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+ (ins move_vec_shift:$shift),
+ "$shift", asm, kind, pattern> {
+ bits<1> shift;
+ let Inst{15-13} = cmode{3-1};
+ let Inst{12} = shift;
+}
+
+class SIMDModifiedImmVectorNoShift<bit Q, bit op, bits<4> cmode,
+ RegisterOperand vectype,
+ Operand imm_type, string asm,
+ string kind, list<dag> pattern>
+ : BaseSIMDModifiedImmVector<Q, op, vectype, imm_type, (ins), "",
+ asm, kind, pattern> {
+ let Inst{15-12} = cmode;
+}
+
+class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm,
+ list<dag> pattern>
+ : BaseSIMDModifiedImm<Q, op, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
+ "\t$Rd, $imm8", "", pattern> {
+ let Inst{15-12} = cmode;
+ let DecoderMethod = "DecodeModImmInstruction";
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD indexed element
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDIndexed<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
+ RegisterOperand dst_reg, RegisterOperand lhs_reg,
+ RegisterOperand rhs_reg, Operand vec_idx, string asm,
+ string apple_kind, string dst_kind, string lhs_kind,
+ string rhs_kind, list<dag> pattern>
+ : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx),
+ asm,
+ "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
+ "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28} = Scalar;
+ let Inst{27-24} = 0b1111;
+ let Inst{23-22} = size;
+ // Bit 21 must be set by the derived class.
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opc;
+ // Bit 11 must be set by the derived class.
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
+ RegisterOperand dst_reg, RegisterOperand lhs_reg,
+ RegisterOperand rhs_reg, Operand vec_idx, string asm,
+ string apple_kind, string dst_kind, string lhs_kind,
+ string rhs_kind, list<dag> pattern>
+ : I<(outs dst_reg:$dst),
+ (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm,
+ "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
+ "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28} = Scalar;
+ let Inst{27-24} = 0b1111;
+ let Inst{23-22} = size;
+ // Bit 21 must be set by the derived class.
+ let Inst{20-16} = Rm;
+ let Inst{15-12} = opc;
+ // Bit 11 must be set by the derived class.
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
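+// In the by-element multiclasses below, bits 11 and 21 (plus bit 20 for
+// halfword elements) carry the lane index (the "H:L:M" fields), so each
+// variant fills them from its own idx operand according to the element size.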
+multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2f32 V64:$Rd),
+ (OpNode (v2f32 V64:$Rn),
+ (v2f32 (ARM64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4f32 V128:$Rd),
+ (OpNode (v4f32 V128:$Rn),
+ (v4f32 (ARM64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc,
+ V128, V128,
+ V128, VectorIndexD,
+ asm, ".2d", ".2d", ".2d", ".d",
+ [(set (v2f64 V128:$Rd),
+ (OpNode (v2f64 V128:$Rn),
+ (v2f64 (ARM64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> {
+ bits<1> idx;
+ let Inst{11} = idx{0};
+ let Inst{21} = 0;
+ }
+
+ def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+ FPR32Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s",
+ [(set (f32 FPR32Op:$Rd),
+ (OpNode (f32 FPR32Op:$Rn),
+ (f32 (vector_extract (v4f32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc,
+ FPR64Op, FPR64Op, V128, VectorIndexD,
+ asm, ".d", "", "", ".d",
+ [(set (f64 FPR64Op:$Rd),
+ (OpNode (f64 FPR64Op:$Rn),
+ (f64 (vector_extract (v2f64 V128:$Rm),
+ VectorIndexD:$idx))))]> {
+ bits<1> idx;
+ let Inst{11} = idx{0};
+ let Inst{21} = 0;
+ }
+}
+
+multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> {
+ // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
+ def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+ (ARM64duplane32 (v4f32 V128:$Rm),
+ VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v2i32_indexed")
+ V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+ (ARM64dup (f32 FPR32Op:$Rm)))),
+ (!cast<Instruction>(INST # "v2i32_indexed") V64:$Rd, V64:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+
+ // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar.
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+ (ARM64duplane32 (v4f32 V128:$Rm),
+ VectorIndexS:$idx))),
+ (!cast<Instruction>(INST # "v4i32_indexed")
+ V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+ (ARM64dup (f32 FPR32Op:$Rm)))),
+ (!cast<Instruction>(INST # "v4i32_indexed") V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+ // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar.
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+ (ARM64duplane64 (v2f64 V128:$Rm),
+ VectorIndexD:$idx))),
+ (!cast<Instruction>(INST # "v2i64_indexed")
+                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexD:$idx)>;
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+ (ARM64dup (f64 FPR64Op:$Rm)))),
+ (!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+ // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+ def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+ (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
+ (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+ V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+ (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))),
+ (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+ (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+ // 1 variant for 64-bit scalar version: extract from .1d or from .2d
+ def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+ (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
+ (!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
+ V128:$Rm, VectorIndexD:$idx)>;
+}
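+// This multiclass only supplies selection patterns; the instructions come
+// from SIMDFPIndexedSDTied, and the two are instantiated together at the use
+// site (e.g. for the fused multiply-add family).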
+
+multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> {
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s", []> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s", []> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc,
+ V128, V128,
+ V128, VectorIndexD,
+ asm, ".2d", ".2d", ".2d", ".d", []> {
+ bits<1> idx;
+ let Inst{11} = idx{0};
+ let Inst{21} = 0;
+ }
+
+
+ def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+ FPR32Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s", []> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc,
+ FPR64Op, FPR64Op, V128, VectorIndexD,
+ asm, ".d", "", "", ".d", []> {
+ bits<1> idx;
+ let Inst{11} = idx{0};
+ let Inst{21} = 0;
+ }
+}
+
+multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4i16 V64:$Rd),
+ (OpNode (v4i16 V64:$Rn),
+ (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8i16 V128:$Rd),
+ (OpNode (v8i16 V128:$Rn),
+ (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2i32 V64:$Rd),
+ (OpNode (v2i32 V64:$Rn),
+ (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (v4i32 V128:$Rn),
+ (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
+ FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+ FPR32Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s",
+ [(set (i32 FPR32Op:$Rd),
+ (OpNode FPR32Op:$Rn,
+ (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+multiclass SIMDVectorIndexedHS<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+ V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4i16 V64:$Rd),
+ (OpNode (v4i16 V64:$Rn),
+ (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8i16 V128:$Rd),
+ (OpNode (v8i16 V128:$Rn),
+ (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2i32 V64:$Rd),
+ (OpNode (v2i32 V64:$Rn),
+ (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (v4i32 V128:$Rn),
+ (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+multiclass SIMDVectorIndexedHSTied<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4i16 V64:$dst),
+ (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn),
+ (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+ (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+ V64, V64,
+ V128, VectorIndexS,
+ asm, ".2s", ".2s", ".2s", ".s",
+ [(set (v2i32 V64:$dst),
+ (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+ (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm, ".4s", ".4s", ".4s", ".s",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+ V128, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4s", ".4s", ".4h", ".h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (v4i16 V64:$Rn),
+ (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm#"2", ".4s", ".4s", ".8h", ".h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))]> {
+
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V128, V64,
+ V128, VectorIndexS,
+ asm, ".2d", ".2d", ".2s", ".s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (v2i32 V64:$Rn),
+ (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm#"2", ".2d", ".2d", ".4s", ".s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
+ FPR32Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+ FPR64Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s", []> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
+ SDPatternOperator Accum> {
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+ V128, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4s", ".4s", ".4h", ".h",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_arm64_neon_sqdmull
+ (v4i16 V64:$Rn),
+ (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an
+ // intermediate EXTRACT_SUBREG would be untyped.
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+ (i32 (vector_extract (v4i32
+ (int_arm64_neon_sqdmull (v4i16 V64:$Rn),
+ (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx)))),
+ (i64 0))))),
+ (EXTRACT_SUBREG
+              (!cast<Instruction>(NAME # "v4i16_indexed")
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn,
+ V128_lo:$Rm, VectorIndexH:$idx),
+ ssub)>;
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm#"2", ".4s", ".4s", ".8h", ".h",
+ [(set (v4i32 V128:$dst),
+ (Accum (v4i32 V128:$Rd),
+ (v4i32 (int_arm64_neon_sqdmull
+ (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16
+ (ARM64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+ V128, V64,
+ V128, VectorIndexS,
+ asm, ".2d", ".2d", ".2s", ".s",
+ [(set (v2i64 V128:$dst),
+ (Accum (v2i64 V128:$Rd),
+ (v2i64 (int_arm64_neon_sqdmull
+ (v2i32 V64:$Rn),
+ (v2i32 (ARM64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm#"2", ".2d", ".2d", ".4s", ".s",
+ [(set (v2i64 V128:$dst),
+ (Accum (v2i64 V128:$Rd),
+ (v2i64 (int_arm64_neon_sqdmull
+ (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32
+ (ARM64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
+ FPR32Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+
+ def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+ FPR64Op, FPR32Op, V128, VectorIndexS,
+ asm, ".s", "", "", ".s",
+ [(set (i64 FPR64Op:$dst),
+ (Accum (i64 FPR64Op:$Rd),
+ (i64 (int_arm64_neon_sqdmulls_scalar
+ (i32 FPR32Op:$Rn),
+ (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
+
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+}
+
+multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+ V128, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4s", ".4s", ".4h", ".h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (v4i16 V64:$Rn),
+ (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm#"2", ".4s", ".4s", ".8h", ".h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))]> {
+
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+ V128, V64,
+ V128, VectorIndexS,
+ asm, ".2d", ".2d", ".2s", ".s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (v2i32 V64:$Rn),
+ (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm#"2", ".2d", ".2d", ".4s", ".s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+ }
+}
+
+multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+ V128, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4s", ".4s", ".4h", ".h",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn),
+ (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm#"2", ".4s", ".4s", ".8h", ".h",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd),
+ (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+ V128, V64,
+ V128, VectorIndexS,
+ asm, ".2d", ".2d", ".2s", ".s",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn),
+ (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+
+ def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+ V128, V128,
+ V128, VectorIndexS,
+ asm#"2", ".2d", ".2d", ".4s", ".s",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd),
+ (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))]> {
+ bits<2> idx;
+ let Inst{11} = idx{1};
+ let Inst{21} = idx{0};
+ }
+ }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar shift by immediate
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDScalarShift<bit U, bits<5> opc, bits<7> fixed_imm,
+ RegisterClass regtype1, RegisterClass regtype2,
+ Operand immtype, string asm, list<dag> pattern>
+ : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm),
+ asm, "\t$Rd, $Rn, $imm", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<7> imm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-23} = 0b111110;
+ let Inst{22-16} = fixed_imm;
+ let Inst{15-11} = opc;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,
+ RegisterClass regtype1, RegisterClass regtype2,
+ Operand immtype, string asm, list<dag> pattern>
+ : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm),
+ asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<7> imm;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = U;
+ let Inst{28-23} = 0b111110;
+ let Inst{22-16} = fixed_imm;
+ let Inst{15-11} = opc;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+
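+// In all of the shift-by-immediate forms the element size and the shift
+// amount share Inst{22-16} (immh:immb): the position of the leading 1 in
+// fixed_imm selects the element size, and the '?' bits are filled from the
+// immediate operand, which is why each variant assigns a different slice of
+// imm to the low end of that field.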
+multiclass SIMDScalarRShiftSD<bit U, bits<5> opc, string asm> {
+ def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+ FPR32, FPR32, vecshiftR32, asm, []> {
+ let Inst{20-16} = imm{4-0};
+ }
+
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftR64, asm, []> {
+ let Inst{21-16} = imm{5-0};
+ }
+}
+
+multiclass SIMDScalarRShiftD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftR64, asm,
+ [(set (i64 FPR64:$Rd),
+ (OpNode (i64 FPR64:$Rn), (i32 vecshiftR64:$imm)))]> {
+ let Inst{21-16} = imm{5-0};
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))),
+ (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>;
+}
+
+multiclass SIMDScalarRShiftDTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftR64, asm,
+ [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn),
+ (i32 vecshiftR64:$imm)))]> {
+ let Inst{21-16} = imm{5-0};
+ }
+
+ def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+ (i32 vecshiftR64:$imm))),
+ (!cast<Instruction>(NAME # "d") FPR64:$Rd, FPR64:$Rn,
+ vecshiftR64:$imm)>;
+}
+
+multiclass SIMDScalarLShiftD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftL64, asm,
+ [(set (v1i64 FPR64:$Rd),
+ (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
+ let Inst{21-16} = imm{5-0};
+ }
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+multiclass SIMDScalarLShiftDTied<bit U, bits<5> opc, string asm> {
+ def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftL64, asm, []> {
+ let Inst{21-16} = imm{5-0};
+ }
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+multiclass SIMDScalarRShiftBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+ FPR8, FPR16, vecshiftR8, asm, []> {
+ let Inst{18-16} = imm{2-0};
+ }
+
+ def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+ FPR16, FPR32, vecshiftR16, asm, []> {
+ let Inst{19-16} = imm{3-0};
+ }
+
+ def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+ FPR32, FPR64, vecshiftR32, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn), vecshiftR32:$imm))]> {
+ let Inst{20-16} = imm{4-0};
+ }
+}
+
+multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+ FPR8, FPR8, vecshiftL8, asm, []> {
+ let Inst{18-16} = imm{2-0};
+ }
+
+ def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+ FPR16, FPR16, vecshiftL16, asm, []> {
+ let Inst{19-16} = imm{3-0};
+ }
+
+ def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+ FPR32, FPR32, vecshiftL32, asm,
+ [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn), (i32 vecshiftL32:$imm)))]> {
+ let Inst{20-16} = imm{4-0};
+ }
+
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftL64, asm,
+ [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn),
+ (i32 vecshiftL64:$imm)))]> {
+ let Inst{21-16} = imm{5-0};
+ }
+}
+
+multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> {
+ def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+ FPR8, FPR8, vecshiftR8, asm, []> {
+ let Inst{18-16} = imm{2-0};
+ }
+
+ def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+ FPR16, FPR16, vecshiftR16, asm, []> {
+ let Inst{19-16} = imm{3-0};
+ }
+
+ def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+ FPR32, FPR32, vecshiftR32, asm, []> {
+ let Inst{20-16} = imm{4-0};
+ }
+
+ def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+ FPR64, FPR64, vecshiftR64, asm, []> {
+ let Inst{21-16} = imm{5-0};
+ }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD vector x indexed element
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDVectorShift<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
+ RegisterOperand dst_reg, RegisterOperand src_reg,
+ Operand immtype,
+ string asm, string dst_kind, string src_kind,
+ list<dag> pattern>
+ : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm),
+ asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
+ "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-23} = 0b011110;
+ let Inst{22-16} = fixed_imm;
+ let Inst{15-11} = opc;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
+ RegisterOperand vectype1, RegisterOperand vectype2,
+ Operand immtype,
+ string asm, string dst_kind, string src_kind,
+ list<dag> pattern>
+ : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm),
+ asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
+ "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29} = U;
+ let Inst{28-23} = 0b011110;
+ let Inst{22-16} = fixed_imm;
+ let Inst{15-11} = opc;
+ let Inst{10} = 1;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
+ Intrinsic OpNode> {
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftR32,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR32,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorRShiftSDToFP<bit U, bits<5> opc, string asm,
+ Intrinsic OpNode> {
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftR32,
+ asm, ".2s", ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR32,
+ asm, ".4s", ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorRShiftNarrowBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+ V64, V128, vecshiftR16Narrow,
+ asm, ".8b", ".8h",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftR16Narrow,
+ asm#"2", ".16b", ".8h", []> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ let hasSideEffects = 0;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V128, vecshiftR32Narrow,
+ asm, ".4h", ".4s",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR32Narrow,
+ asm#"2", ".8h", ".4s", []> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ let hasSideEffects = 0;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V128, vecshiftR64Narrow,
+ asm, ".2s", ".2d",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR64Narrow,
+ asm#"2", ".4s", ".2d", []> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ let hasSideEffects = 0;
+ }
+
+  // TableGen doesn't like patterns w/ INSERT_SUBREG on the instructions
+ // themselves, so put them here instead.
+
+ // Patterns involving what's effectively an insert high and a normal
+ // intrinsic, represented by CONCAT_VECTORS.
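+  // e.g. a CONCAT_VECTORS of an existing 64-bit low half with a narrowing
+  // shift result selects the "<asm>2" (write-high) variant defined above.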
+ def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn),
+ vecshiftR16Narrow:$imm)),
+ (!cast<Instruction>(NAME # "v16i8_shift")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR16Narrow:$imm)>;
+ def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn),
+ vecshiftR32Narrow:$imm)),
+ (!cast<Instruction>(NAME # "v8i16_shift")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR32Narrow:$imm)>;
+ def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn),
+ vecshiftR64Narrow:$imm)),
+ (!cast<Instruction>(NAME # "v4i32_shift")
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR64Narrow:$imm)>;
+}
+
+multiclass SIMDVectorLShiftBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+ V64, V64, vecshiftL8,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
+ (i32 vecshiftL8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftL8,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
+ (i32 vecshiftL8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftL16,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
+ (i32 vecshiftL16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftL16,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+ (i32 vecshiftL16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftL32,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
+ (i32 vecshiftL32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftL32,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+ (i32 vecshiftL32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftL64,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+ (i32 vecshiftL64:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorRShiftBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+ V64, V64, vecshiftR8,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
+ (i32 vecshiftR8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftR8,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
+ (i32 vecshiftR8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftR16,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
+ (i32 vecshiftR16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR16,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+ (i32 vecshiftR16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftR32,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
+ (i32 vecshiftR32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR32,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+ (i32 vecshiftR32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+ (i32 vecshiftR64:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDVectorRShiftBHSDTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
+ V64, V64, vecshiftR8, asm, ".8b", ".8b",
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
+ (i32 vecshiftR8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftR8, asm, ".16b", ".16b",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+ (i32 vecshiftR8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftR16, asm, ".4h", ".4h",
+ [(set (v4i16 V64:$dst),
+ (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+ (i32 vecshiftR16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR16, asm, ".8h", ".8h",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+ (i32 vecshiftR16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftR32, asm, ".2s", ".2s",
+ [(set (v2i32 V64:$dst),
+ (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+ (i32 vecshiftR32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR32, asm, ".4s", ".4s",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (i32 vecshiftR32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d", [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+ (i32 vecshiftR64:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorLShiftBHSDTied<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode = null_frag> {
+ def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
+ V64, V64, vecshiftL8,
+ asm, ".8b", ".8b",
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
+ (i32 vecshiftL8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftL8,
+ asm, ".16b", ".16b",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+ (i32 vecshiftL8:$imm)))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftL16,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$dst),
+ (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+ (i32 vecshiftL16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftL16,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$dst),
+ (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+ (i32 vecshiftL16:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftL32,
+ asm, ".2s", ".2s",
+ [(set (v2i32 V64:$dst),
+ (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+ (i32 vecshiftL32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftL32,
+ asm, ".4s", ".4s",
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (i32 vecshiftL32:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftL64,
+ asm, ".2d", ".2d",
+ [(set (v2i64 V128:$dst),
+ (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+ (i32 vecshiftL64:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+ V128, V64, vecshiftL8, asm, ".8h", ".8b",
+ [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
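+  // The "2" asm variants below (asm#"2") operate on the high 64-bit half of
+  // the 128-bit source register, which is why their patterns use the
+  // extract_high_* fragments.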
+ def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftL8,
+ asm#"2", ".8h", ".16b",
+ [(set (v8i16 V128:$Rd),
+ (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V128, V64, vecshiftL16, asm, ".4s", ".4h",
+ [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftL16,
+ asm#"2", ".4s", ".8h",
+ [(set (v4i32 V128:$Rd),
+ (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V128, V64, vecshiftL32, asm, ".2d", ".2s",
+ [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftL32,
+ asm#"2", ".2d", ".4s",
+ [(set (v2i64 V128:$Rd),
+ (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+}
+
+
+//---
+// Vector load/store
+//---
+// SIMD ldX/stX no-index memory references don't allow the optional
+// ", #0" constant and handle post-indexing explicitly, so we use
+// a more specialized parse method for them. Otherwise, it's the same as
+// the general am_noindex handling.
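+// E.g. "ld1 { v0.8b }, [x0]" is accepted, but "ld1 { v0.8b }, [x0, #0]" is
+// not; post-indexed forms such as "ld1 { v0.8b }, [x0], #8" are handled by
+// the separate post-indexed (_POST) instructions and aliases below.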
+def MemorySIMDNoIndexOperand : AsmOperandClass {
+ let Name = "MemorySIMDNoIndex";
+ let ParserMethod = "tryParseNoIndexMemory";
+}
+def am_simdnoindex : Operand<i64>,
+ ComplexPattern<i64, 1, "SelectAddrModeNoIndex", []> {
+ let PrintMethod = "printAMNoIndex";
+ let ParserMatchClass = MemorySIMDNoIndexOperand;
+ let MIOperandInfo = (ops GPR64sp:$base);
+ let DecoderMethod = "DecodeGPR64spRegisterClass";
+}
+
+class BaseSIMDLdSt<bit Q, bit L, bits<4> opcode, bits<2> size,
+ string asm, dag oops, dag iops, list<dag> pattern>
+ : I<oops, iops, asm, "\t$Vt, $vaddr", "", pattern> {
+ bits<5> Vt;
+ bits<5> vaddr;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-23} = 0b0011000;
+ let Inst{22} = L;
+ let Inst{21-16} = 0b000000;
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = size;
+ let Inst{9-5} = vaddr;
+ let Inst{4-0} = Vt;
+}
+
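+// The post-indexed variant differs from BaseSIMDLdSt in the encoding only by
+// Inst{23} (1 rather than 0) and by carrying the post-index register Xm in
+// Inst{20-16}.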
+class BaseSIMDLdStPost<bit Q, bit L, bits<4> opcode, bits<2> size,
+ string asm, dag oops, dag iops>
+ : I<oops, iops, asm, "\t$Vt, $vaddr, $Xm", "", []> {
+ bits<5> Vt;
+ bits<5> vaddr;
+ bits<5> Xm;
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-23} = 0b0011001;
+ let Inst{22} = L;
+ let Inst{21} = 0;
+ let Inst{20-16} = Xm;
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = size;
+ let Inst{9-5} = vaddr;
+ let Inst{4-0} = Vt;
+ let DecoderMethod = "DecodeSIMDLdStPost";
+}
+
+// The immediate form of AdvSIMD post-indexed addressing is encoded with
+// register post-index addressing from the zero register.
+multiclass SIMDLdStAliases<string asm, string layout, string Count,
+ int Offset, int Size> {
+ // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16"
+ // "ld1\t$Vt, $vaddr, #16"
+ // may get mapped to
+ // (LD1Twov8b_POST VecListTwo8b:$Vt, am_simdnoindex:$vaddr, XZR)
+ def : InstAlias<asm # "\t$Vt, $vaddr, #" # Offset,
+ (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+ am_simdnoindex:$vaddr, XZR), 1>;
+
+ // E.g. "ld1.8b { v0, v1 }, [x1], #16"
+ // "ld1.8b\t$Vt, $vaddr, #16"
+ // may get mapped to
+ // (LD1Twov8b_POST VecListTwo64:$Vt, am_simdnoindex:$vaddr, XZR)
+ def : InstAlias<asm # "." # layout # "\t$Vt, $vaddr, #" # Offset,
+ (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ am_simdnoindex:$vaddr, XZR), 0>;
+
+ // E.g. "ld1.8b { v0, v1 }, [x1]"
+ // "ld1\t$Vt, $vaddr"
+ // may get mapped to
+ // (LD1Twov8b VecListTwo64:$Vt, am_simdnoindex:$vaddr)
+ def : InstAlias<asm # "." # layout # "\t$Vt, $vaddr",
+ (!cast<Instruction>(NAME # Count # "v" # layout)
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ am_simdnoindex:$vaddr), 0>;
+
+ // E.g. "ld1.8b { v0, v1 }, [x1], x2"
+ // "ld1\t$Vt, $vaddr, $Xm"
+ // may get mapped to
+ // (LD1Twov8b_POST VecListTwo64:$Vt, am_simdnoindex:$vaddr, GPR64pi8:$Xm)
+ def : InstAlias<asm # "." # layout # "\t$Vt, $vaddr, $Xm",
+ (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass BaseSIMDLdN<string Count, string asm, string veclist, int Offset128,
+ int Offset64, bits<4> opcode> {
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm,
+ (outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
+ (ins am_simdnoindex:$vaddr), []>;
+ def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm,
+ (outs !cast<RegisterOperand>(veclist # "8h"):$Vt),
+ (ins am_simdnoindex:$vaddr), []>;
+ def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm,
+ (outs !cast<RegisterOperand>(veclist # "4s"):$Vt),
+ (ins am_simdnoindex:$vaddr), []>;
+ def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm,
+ (outs !cast<RegisterOperand>(veclist # "2d"):$Vt),
+ (ins am_simdnoindex:$vaddr), []>;
+ def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm,
+ (outs !cast<RegisterOperand>(veclist # "8b"):$Vt),
+ (ins am_simdnoindex:$vaddr), []>;
+ def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm,
+ (outs !cast<RegisterOperand>(veclist # "4h"):$Vt),
+ (ins am_simdnoindex:$vaddr), []>;
+ def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm,
+ (outs !cast<RegisterOperand>(veclist # "2s"):$Vt),
+ (ins am_simdnoindex:$vaddr), []>;
+
+ def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm,
+ (outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
+ (ins am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm,
+ (outs !cast<RegisterOperand>(veclist # "8h"):$Vt),
+ (ins am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm,
+ (outs !cast<RegisterOperand>(veclist # "4s"):$Vt),
+ (ins am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm,
+ (outs !cast<RegisterOperand>(veclist # "2d"):$Vt),
+ (ins am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm,
+ (outs !cast<RegisterOperand>(veclist # "8b"):$Vt),
+ (ins am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm,
+ (outs !cast<RegisterOperand>(veclist # "4h"):$Vt),
+ (ins am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm,
+ (outs !cast<RegisterOperand>(veclist # "2s"):$Vt),
+ (ins am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ }
+
+ defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+}
+
+// Only ld1/st1 has a v1d version.
+multiclass BaseSIMDStN<string Count, string asm, string veclist, int Offset128,
+ int Offset64, bits<4> opcode> {
+ let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in {
+ def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
+ am_simdnoindex:$vaddr), []>;
+ def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
+ am_simdnoindex:$vaddr), []>;
+ def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
+ am_simdnoindex:$vaddr), []>;
+ def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
+ am_simdnoindex:$vaddr), []>;
+ def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
+ am_simdnoindex:$vaddr), []>;
+ def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
+ am_simdnoindex:$vaddr), []>;
+ def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
+ am_simdnoindex:$vaddr), []>;
+
+ def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
+ am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
+ am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
+ am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
+ am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+ def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
+ am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
+ am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
+ am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ }
+
+ defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
+ defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
+ defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+}
+
+multiclass BaseSIMDLd1<string Count, string asm, string veclist,
+ int Offset128, int Offset64, bits<4> opcode>
+ : BaseSIMDLdN<Count, asm, veclist, Offset128, Offset64, opcode> {
+
+ // LD1 instructions have extra "1d" variants.
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm,
+ (outs !cast<RegisterOperand>(veclist # "1d"):$Vt),
+ (ins am_simdnoindex:$vaddr), []>;
+
+ def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm,
+ (outs !cast<RegisterOperand>(veclist # "1d"):$Vt),
+ (ins am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ }
+
+ defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+}
+
+multiclass BaseSIMDSt1<string Count, string asm, string veclist,
+ int Offset128, int Offset64, bits<4> opcode>
+ : BaseSIMDStN<Count, asm, veclist, Offset128, Offset64, opcode> {
+
+ // ST1 instructions have extra "1d" variants.
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+ def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
+ am_simdnoindex:$vaddr), []>;
+
+ def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm, (outs),
+ (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
+ am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+ }
+
+ defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+}
+
+multiclass SIMDLd1Multiple<string asm> {
+ defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>;
+ defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
+ defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
+ defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+}
+
+multiclass SIMDSt1Multiple<string asm> {
+ defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>;
+ defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
+ defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
+ defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+}
+
+multiclass SIMDLd2Multiple<string asm> {
+ defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+}
+
+multiclass SIMDSt2Multiple<string asm> {
+ defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+}
+
+multiclass SIMDLd3Multiple<string asm> {
+ defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+}
+
+multiclass SIMDSt3Multiple<string asm> {
+ defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+}
+
+multiclass SIMDLd4Multiple<string asm> {
+ defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+}
+
+multiclass SIMDSt4Multiple<string asm> {
+ defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+}
+
+//---
+// AdvSIMD Load/store single-element
+//---
+
+class BaseSIMDLdStSingle<bit L, bit R, bits<3> opcode,
+ string asm, string operands, dag oops, dag iops,
+ list<dag> pattern>
+ : I<oops, iops, asm, operands, "", pattern> {
+ bits<5> Vt;
+ bits<5> vaddr;
+ let Inst{31} = 0;
+ let Inst{29-24} = 0b001101;
+ let Inst{22} = L;
+ let Inst{21} = R;
+ let Inst{15-13} = opcode;
+ let Inst{9-5} = vaddr;
+ let Inst{4-0} = Vt;
+ let DecoderMethod = "DecodeSIMDLdStSingle";
+}
+
+class BaseSIMDLdStSingleTied<bit L, bit R, bits<3> opcode,
+ string asm, string operands, dag oops, dag iops,
+ list<dag> pattern>
+ : I<oops, iops, asm, operands, "$Vt = $dst", pattern> {
+ bits<5> Vt;
+ bits<5> vaddr;
+ let Inst{31} = 0;
+ let Inst{29-24} = 0b001101;
+ let Inst{22} = L;
+ let Inst{21} = R;
+ let Inst{15-13} = opcode;
+ let Inst{9-5} = vaddr;
+ let Inst{4-0} = Vt;
+ let DecoderMethod = "DecodeSIMDLdStSingleTied";
+}
+
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
+ Operand listtype>
+ : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, $vaddr",
+ (outs listtype:$Vt), (ins am_simdnoindex:$vaddr), []> {
+ let Inst{30} = Q;
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = S;
+ let Inst{11-10} = size;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
+ string asm, Operand listtype, Operand GPR64pi>
+ : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, $vaddr, $Xm",
+ (outs listtype:$Vt),
+ (ins am_simdnoindex:$vaddr, GPR64pi:$Xm), []> {
+ bits<5> Xm;
+ let Inst{30} = Q;
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = S;
+ let Inst{11-10} = size;
+}
+
+multiclass SIMDLdrAliases<string asm, string layout, string Count,
+ int Offset, int Size> {
+ // E.g. "ld1r { v0.8b }, [x1], #1"
+ // "ld1r.8b\t$Vt, $vaddr, #1"
+ // may get mapped to
+ // (LD1Rv8b_POST VecListOne8b:$Vt, am_simdnoindex:$vaddr, XZR)
+ def : InstAlias<asm # "\t$Vt, $vaddr, #" # Offset,
+ (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+ am_simdnoindex:$vaddr, XZR), 1>;
+
+ // E.g. "ld1r.8b { v0 }, [x1], #1"
+ // "ld1r.8b\t$Vt, $vaddr, #1"
+ // may get mapped to
+ // (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, XZR)
+ def : InstAlias<asm # "." # layout # "\t$Vt, $vaddr, #" # Offset,
+ (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ am_simdnoindex:$vaddr, XZR), 0>;
+
+ // E.g. "ld1r.8b { v0 }, [x1]"
+ // "ld1r.8b\t$Vt, $vaddr"
+ // may get mapped to
+ // (LD1Rv8b VecListOne64:$Vt, am_simdnoindex:$vaddr)
+ def : InstAlias<asm # "." # layout # "\t$Vt, $vaddr",
+ (!cast<Instruction>(NAME # "v" # layout)
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ am_simdnoindex:$vaddr), 0>;
+
+ // E.g. "ld1r.8b { v0 }, [x1], x2"
+ // "ld1r.8b\t$Vt, $vaddr, $Xm"
+ // may get mapped to
+ // (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, GPR64pi1:$Xm)
+ def : InstAlias<asm # "." # layout # "\t$Vt, $vaddr, $Xm",
+ (!cast<Instruction>(NAME # "v" # layout # "_POST")
+ !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+ am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
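+// Offset1/2/4/8 are the post-index immediate offsets used for 1-, 2-, 4- and
+// 8-byte elements respectively (element size times the number of vectors in
+// the replicated list).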
+multiclass SIMDLdR<bit R, bits<3> opcode, bit S, string asm, string Count,
+ int Offset1, int Offset2, int Offset4, int Offset8> {
+ def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm,
+ !cast<Operand>("VecList" # Count # "8b")>;
+ def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm,
+ !cast<Operand>("VecList" # Count #"16b")>;
+ def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm,
+ !cast<Operand>("VecList" # Count #"4h")>;
+ def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm,
+ !cast<Operand>("VecList" # Count #"8h")>;
+ def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm,
+ !cast<Operand>("VecList" # Count #"2s")>;
+ def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm,
+ !cast<Operand>("VecList" # Count #"4s")>;
+ def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm,
+ !cast<Operand>("VecList" # Count #"1d")>;
+ def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm,
+ !cast<Operand>("VecList" # Count #"2d")>;
+
+ def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm,
+ !cast<Operand>("VecList" # Count # "8b"),
+ !cast<Operand>("GPR64pi" # Offset1)>;
+ def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm,
+ !cast<Operand>("VecList" # Count # "16b"),
+ !cast<Operand>("GPR64pi" # Offset1)>;
+ def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm,
+ !cast<Operand>("VecList" # Count # "4h"),
+ !cast<Operand>("GPR64pi" # Offset2)>;
+ def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm,
+ !cast<Operand>("VecList" # Count # "8h"),
+ !cast<Operand>("GPR64pi" # Offset2)>;
+ def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm,
+ !cast<Operand>("VecList" # Count # "2s"),
+ !cast<Operand>("GPR64pi" # Offset4)>;
+ def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm,
+ !cast<Operand>("VecList" # Count # "4s"),
+ !cast<Operand>("GPR64pi" # Offset4)>;
+ def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm,
+ !cast<Operand>("VecList" # Count # "1d"),
+ !cast<Operand>("GPR64pi" # Offset8)>;
+ def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm,
+ !cast<Operand>("VecList" # Count # "2d"),
+ !cast<Operand>("GPR64pi" # Offset8)>;
+
+ defm : SIMDLdrAliases<asm, "8b", Count, Offset1, 64>;
+ defm : SIMDLdrAliases<asm, "16b", Count, Offset1, 128>;
+ defm : SIMDLdrAliases<asm, "4h", Count, Offset2, 64>;
+ defm : SIMDLdrAliases<asm, "8h", Count, Offset2, 128>;
+ defm : SIMDLdrAliases<asm, "2s", Count, Offset4, 64>;
+ defm : SIMDLdrAliases<asm, "4s", Count, Offset4, 128>;
+ defm : SIMDLdrAliases<asm, "1d", Count, Offset8, 64>;
+ defm : SIMDLdrAliases<asm, "2d", Count, Offset8, 128>;
+}
+
+class SIMDLdStSingleB<bit L, bit R, bits<3> opcode, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
+ pattern> {
+ // idx encoded in Q:S:size fields.
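+  // E.g. a byte lane index of 5 (0b0101) gives Q = 0, S = 1, size = 0b01.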
+ bits<4> idx;
+ let Inst{30} = idx{3};
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = idx{2};
+ let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBTied<bit L, bit R, bits<3> opcode, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
+ pattern> {
+ // idx encoded in Q:S:size fields.
+ bits<4> idx;
+ let Inst{30} = idx{3};
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = idx{2};
+ let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBPost<bit L, bit R, bits<3> opcode, string asm,
+ dag oops, dag iops>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
+ oops, iops, []> {
+ // idx encoded in Q:S:size fields.
+ bits<4> idx;
+ bits<5> Xm;
+ let Inst{30} = idx{3};
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{2};
+ let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBTiedPost<bit L, bit R, bits<3> opcode, string asm,
+ dag oops, dag iops>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
+ oops, iops, []> {
+ // idx encoded in Q:S:size fields.
+ bits<4> idx;
+ bits<5> Xm;
+ let Inst{30} = idx{3};
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{2};
+ let Inst{11-10} = idx{1-0};
+}
+
+class SIMDLdStSingleH<bit L, bit R, bits<3> opcode, bit size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
+ pattern> {
+ // idx encoded in Q:S:size<1> fields.
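+  // E.g. a halfword lane index of 5 (0b101) gives Q = 1, S = 0, size<1> = 1.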
+ bits<3> idx;
+ let Inst{30} = idx{2};
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = idx{1};
+ let Inst{11} = idx{0};
+ let Inst{10} = size;
+}
+class SIMDLdStSingleHTied<bit L, bit R, bits<3> opcode, bit size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
+ pattern> {
+ // idx encoded in Q:S:size<1> fields.
+ bits<3> idx;
+ let Inst{30} = idx{2};
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = idx{1};
+ let Inst{11} = idx{0};
+ let Inst{10} = size;
+}
+
+class SIMDLdStSingleHPost<bit L, bit R, bits<3> opcode, bit size, string asm,
+ dag oops, dag iops>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
+ oops, iops, []> {
+ // idx encoded in Q:S:size<1> fields.
+ bits<3> idx;
+ bits<5> Xm;
+ let Inst{30} = idx{2};
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{1};
+ let Inst{11} = idx{0};
+ let Inst{10} = size;
+}
+class SIMDLdStSingleHTiedPost<bit L, bit R, bits<3> opcode, bit size, string asm,
+ dag oops, dag iops>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
+ oops, iops, []> {
+ // idx encoded in Q:S:size<1> fields.
+ bits<3> idx;
+ bits<5> Xm;
+ let Inst{30} = idx{2};
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{1};
+ let Inst{11} = idx{0};
+ let Inst{10} = size;
+}
+class SIMDLdStSingleS<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
+ pattern> {
+ // idx encoded in Q:S fields.
+ bits<2> idx;
+ let Inst{30} = idx{1};
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = idx{0};
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleSTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
+ pattern> {
+ // idx encoded in Q:S fields.
+ bits<2> idx;
+ let Inst{30} = idx{1};
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = idx{0};
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleSPost<bit L, bit R, bits<3> opcode, bits<2> size,
+ string asm, dag oops, dag iops>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
+ oops, iops, []> {
+ // idx encoded in Q:S fields.
+ bits<2> idx;
+ bits<5> Xm;
+ let Inst{30} = idx{1};
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{0};
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleSTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
+ string asm, dag oops, dag iops>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
+ oops, iops, []> {
+ // idx encoded in Q:S fields.
+ bits<2> idx;
+ bits<5> Xm;
+ let Inst{30} = idx{1};
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = idx{0};
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleD<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
+ pattern> {
+ // idx encoded in Q field.
+ bits<1> idx;
+ let Inst{30} = idx;
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = 0;
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleDTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+ dag oops, dag iops, list<dag> pattern>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
+ pattern> {
+ // idx encoded in Q field.
+ bits<1> idx;
+ let Inst{30} = idx;
+ let Inst{23} = 0;
+ let Inst{20-16} = 0b00000;
+ let Inst{12} = 0;
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleDPost<bit L, bit R, bits<3> opcode, bits<2> size,
+ string asm, dag oops, dag iops>
+ : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
+ oops, iops, []> {
+ // idx encoded in Q field.
+ bits<1> idx;
+ bits<5> Xm;
+ let Inst{30} = idx;
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = 0;
+ let Inst{11-10} = size;
+}
+class SIMDLdStSingleDTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
+ string asm, dag oops, dag iops>
+ : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
+ oops, iops, []> {
+ // idx encoded in Q field.
+ bits<1> idx;
+ bits<5> Xm;
+ let Inst{30} = idx;
+ let Inst{23} = 1;
+ let Inst{20-16} = Xm;
+ let Inst{12} = 0;
+ let Inst{11-10} = size;
+}
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleBTied<bit R, bits<3> opcode, string asm,
+ RegisterOperand listtype,
+ RegisterOperand GPR64pi> {
+ def i8 : SIMDLdStSingleBTied<1, R, opcode, asm,
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexB:$idx,
+ am_simdnoindex:$vaddr), []>;
+
+ def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm,
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexB:$idx,
+ am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleHTied<bit R, bits<3> opcode, bit size, string asm,
+ RegisterOperand listtype,
+ RegisterOperand GPR64pi> {
+ def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm,
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexH:$idx,
+ am_simdnoindex:$vaddr), []>;
+
+ def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm,
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexH:$idx,
+ am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleSTied<bit R, bits<3> opcode, bits<2> size,string asm,
+ RegisterOperand listtype,
+ RegisterOperand GPR64pi> {
+ def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm,
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexS:$idx,
+ am_simdnoindex:$vaddr), []>;
+
+ def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm,
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexS:$idx,
+ am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> {
+ def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm,
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexD:$idx,
+ am_simdnoindex:$vaddr), []>;
+
+ def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm,
+ (outs listtype:$dst),
+ (ins listtype:$Vt, VectorIndexD:$idx,
+ am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> {
+ def i8 : SIMDLdStSingleB<0, R, opcode, asm,
+ (outs), (ins listtype:$Vt, VectorIndexB:$idx,
+ am_simdnoindex:$vaddr), []>;
+
+ def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm,
+ (outs), (ins listtype:$Vt, VectorIndexB:$idx,
+ am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> {
+ def i16 : SIMDLdStSingleH<0, R, opcode, size, asm,
+ (outs), (ins listtype:$Vt, VectorIndexH:$idx,
+ am_simdnoindex:$vaddr), []>;
+
+ def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm,
+ (outs), (ins listtype:$Vt, VectorIndexH:$idx,
+ am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size,string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> {
+ def i32 : SIMDLdStSingleS<0, R, opcode, size, asm,
+ (outs), (ins listtype:$Vt, VectorIndexS:$idx,
+ am_simdnoindex:$vaddr), []>;
+
+ def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm,
+ (outs), (ins listtype:$Vt, VectorIndexS:$idx,
+ am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleD<bit R, bits<3> opcode, bits<2> size, string asm,
+ RegisterOperand listtype, RegisterOperand GPR64pi> {
+ def i64 : SIMDLdStSingleD<0, R, opcode, size, asm,
+ (outs), (ins listtype:$Vt, VectorIndexD:$idx,
+ am_simdnoindex:$vaddr), []>;
+
+ def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm,
+ (outs), (ins listtype:$Vt, VectorIndexD:$idx,
+ am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
+}
+
+multiclass SIMDLdStSingleAliases<string asm, string layout, string Type,
+ string Count, int Offset, Operand idxtype> {
+ // E.g. "ld1 { v0.8b }[0], [x1], #1"
+ // "ld1\t$Vt, $vaddr, #1"
+ // may get mapped to
+ // (LD1Rv8b_POST VecListOne8b:$Vt, am_simdnoindex:$vaddr, XZR)
+ def : InstAlias<asm # "\t$Vt$idx, $vaddr, #" # Offset,
+ (!cast<Instruction>(NAME # Type # "_POST")
+ !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+ idxtype:$idx, am_simdnoindex:$vaddr, XZR), 1>;
+
+ // E.g. "ld1.8b { v0 }[0], [x1], #1"
+ // "ld1.8b\t$Vt, $vaddr, #1"
+ // may get mapped to
+ // (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, XZR)
+ def : InstAlias<asm # "." # layout # "\t$Vt$idx, $vaddr, #" # Offset,
+ (!cast<Instruction>(NAME # Type # "_POST")
+ !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+ idxtype:$idx, am_simdnoindex:$vaddr, XZR), 0>;
+
+ // E.g. "ld1.8b { v0 }[0], [x1]"
+ // "ld1.8b\t$Vt, $vaddr"
+ // may get mapped to
+ // (LD1Rv8b VecListOne64:$Vt, am_simdnoindex:$vaddr)
+ def : InstAlias<asm # "." # layout # "\t$Vt$idx, $vaddr",
+ (!cast<Instruction>(NAME # Type)
+ !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+ idxtype:$idx, am_simdnoindex:$vaddr), 0>;
+
+ // E.g. "ld1.8b { v0 }[0], [x1], x2"
+ // "ld1.8b\t$Vt, $vaddr, $Xm"
+ // may get mapped to
+ // (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, GPR64pi1:$Xm)
+ def : InstAlias<asm # "." # layout # "\t$Vt$idx, $vaddr, $Xm",
+ (!cast<Instruction>(NAME # Type # "_POST")
+ !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+ idxtype:$idx, am_simdnoindex:$vaddr,
+ !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass SIMDLdSt1SingleAliases<string asm> {
+ defm : SIMDLdStSingleAliases<asm, "b", "i8", "One", 1, VectorIndexB>;
+ defm : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
+ defm : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
+ defm : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
+}
+
+multiclass SIMDLdSt2SingleAliases<string asm> {
+ defm : SIMDLdStSingleAliases<asm, "b", "i8", "Two", 2, VectorIndexB>;
+ defm : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4, VectorIndexH>;
+ defm : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8, VectorIndexS>;
+ defm : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
+}
+
+multiclass SIMDLdSt3SingleAliases<string asm> {
+ defm : SIMDLdStSingleAliases<asm, "b", "i8", "Three", 3, VectorIndexB>;
+ defm : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6, VectorIndexH>;
+ defm : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
+ defm : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
+}
+
+multiclass SIMDLdSt4SingleAliases<string asm> {
+ defm : SIMDLdStSingleAliases<asm, "b", "i8", "Four", 4, VectorIndexB>;
+ defm : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8, VectorIndexH>;
+ defm : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
+ defm : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
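+// The "{...|...}" asm strings below select between the two assembler syntax
+// variants: arrangement suffixes on each operand (e.g. "$Rd.16b, $Rn.16b")
+// versus a single arrangement suffix on the mnemonic.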
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr,
+ list<dag> pat>
+ : I<outs, ins, asm, "{\t$Rd.16b, $Rn.16b|.16b\t$Rd, $Rn}", cstr, pat>,
+ Sched<[WriteV]>{
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-16} = 0b0100111000101000;
+ let Inst{15-12} = opc;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class AESInst<bits<4> opc, string asm, Intrinsic OpNode>
+ : AESBase<opc, asm, (outs V128:$Rd), (ins V128:$Rn), "",
+ [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+
+class AESTiedInst<bits<4> opc, string asm, Intrinsic OpNode>
+ : AESBase<opc, asm, (outs V128:$dst), (ins V128:$Rd, V128:$Rn),
+ "$Rd = $dst",
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class SHA3OpTiedInst<bits<3> opc, string asm, string dst_lhs_kind,
+ dag oops, dag iops, list<dag> pat>
+ : I<oops, iops, asm,
+ "{\t$Rd" # dst_lhs_kind # ", $Rn" # dst_lhs_kind # ", $Rm.4s" #
+ "|.4s\t$Rd, $Rn, $Rm}", "$Rd = $dst", pat>,
+ Sched<[WriteV]>{
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-21} = 0b01011110000;
+ let Inst{20-16} = Rm;
+ let Inst{15} = 0;
+ let Inst{14-12} = opc;
+ let Inst{11-10} = 0b00;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class SHATiedInstQSV<bits<3> opc, string asm, Intrinsic OpNode>
+ : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
+ (ins FPR128:$Rd, FPR32:$Rn, V128:$Rm),
+ [(set (v4i32 FPR128:$dst),
+ (OpNode (v4i32 FPR128:$Rd), (i32 FPR32:$Rn),
+ (v4i32 V128:$Rm)))]>;
+
+class SHATiedInstVVV<bits<3> opc, string asm, Intrinsic OpNode>
+ : SHA3OpTiedInst<opc, asm, ".4s", (outs V128:$dst),
+ (ins V128:$Rd, V128:$Rn, V128:$Rm),
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (v4i32 V128:$Rm)))]>;
+
+class SHATiedInstQQV<bits<3> opc, string asm, Intrinsic OpNode>
+ : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
+ (ins FPR128:$Rd, FPR128:$Rn, V128:$Rm),
+ [(set (v4i32 FPR128:$dst),
+ (OpNode (v4i32 FPR128:$Rd), (v4i32 FPR128:$Rn),
+ (v4i32 V128:$Rm)))]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class SHA2OpInst<bits<4> opc, string asm, string kind,
+ string cstr, dag oops, dag iops,
+ list<dag> pat>
+ : I<oops, iops, asm, "{\t$Rd" # kind # ", $Rn" # kind #
+ "|" # kind # "\t$Rd, $Rn}", cstr, pat>,
+ Sched<[WriteV]>{
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-16} = 0b0101111000101000;
+ let Inst{15-12} = opc;
+ let Inst{11-10} = 0b10;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class SHATiedInstVV<bits<4> opc, string asm, Intrinsic OpNode>
+ : SHA2OpInst<opc, asm, ".4s", "$Rd = $dst", (outs V128:$dst),
+ (ins V128:$Rd, V128:$Rn),
+ [(set (v4i32 V128:$dst),
+ (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
+
+class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
+ : SHA2OpInst<opc, asm, "", "", (outs FPR32:$Rd), (ins FPR32:$Rn),
+ [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
+
+// Allow the size specifier tokens to be upper case, not just lower.
+def : TokenAlias<".8B", ".8b">;
+def : TokenAlias<".4H", ".4h">;
+def : TokenAlias<".2S", ".2s">;
+def : TokenAlias<".1D", ".1d">;
+def : TokenAlias<".16B", ".16b">;
+def : TokenAlias<".8H", ".8h">;
+def : TokenAlias<".4S", ".4s">;
+def : TokenAlias<".2D", ".2d">;
+def : TokenAlias<".B", ".b">;
+def : TokenAlias<".H", ".h">;
+def : TokenAlias<".S", ".s">;
+def : TokenAlias<".D", ".d">;
diff --git a/lib/Target/ARM64/ARM64InstrInfo.cpp b/lib/Target/ARM64/ARM64InstrInfo.cpp
new file mode 100644
index 0000000..8f11757
--- /dev/null
+++ b/lib/Target/ARM64/ARM64InstrInfo.cpp
@@ -0,0 +1,1864 @@
+//===- ARM64InstrInfo.cpp - ARM64 Instruction Information -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM64InstrInfo.h"
+#include "ARM64Subtarget.h"
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "ARM64GenInstrInfo.inc"
+
+using namespace llvm;
+
+ARM64InstrInfo::ARM64InstrInfo(const ARM64Subtarget &STI)
+ : ARM64GenInstrInfo(ARM64::ADJCALLSTACKDOWN, ARM64::ADJCALLSTACKUP),
+ RI(this, &STI), Subtarget(STI) {}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be. This returns the maximum number of bytes.
+unsigned ARM64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+ const MCInstrDesc &Desc = MI->getDesc();
+
+ switch (Desc.getOpcode()) {
+ default:
+    // Anything not explicitly designated otherwise is a normal 4-byte insn.
+ return 4;
+ case TargetOpcode::DBG_VALUE:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ return 0;
+ }
+
+ llvm_unreachable("GetInstSizeInBytes()- Unable to determin insn size");
+}
+
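+// The branch condition produced by parseCondBranch is laid out as follows:
+//   Bcc:       Cond = [cc]
+//   CBZ/CBNZ:  Cond = [-1, opcode, reg]
+//   TBZ/TBNZ:  Cond = [-1, opcode, reg, bit]
+// ReverseBranchCondition, instantiateCondBranch and insertSelect below all
+// rely on this layout.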
+static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
+ SmallVectorImpl<MachineOperand> &Cond) {
+ // Block ends with fall-through condbranch.
+ switch (LastInst->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown branch instruction?");
+ case ARM64::Bcc:
+ Target = LastInst->getOperand(1).getMBB();
+ Cond.push_back(LastInst->getOperand(0));
+ break;
+ case ARM64::CBZW:
+ case ARM64::CBZX:
+ case ARM64::CBNZW:
+ case ARM64::CBNZX:
+ Target = LastInst->getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(-1));
+ Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+ Cond.push_back(LastInst->getOperand(0));
+ break;
+ case ARM64::TBZ:
+ case ARM64::TBNZ:
+ Target = LastInst->getOperand(2).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(-1));
+ Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+ Cond.push_back(LastInst->getOperand(0));
+ Cond.push_back(LastInst->getOperand(1));
+ }
+}
+
+// Branch analysis.
+bool ARM64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin())
+ return false;
+ --I;
+ while (I->isDebugValue()) {
+ if (I == MBB.begin())
+ return false;
+ --I;
+ }
+ if (!isUnpredicatedTerminator(I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = I;
+
+ // If there is only one terminator instruction, process it.
+ unsigned LastOpc = LastInst->getOpcode();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (isUncondBranchOpcode(LastOpc)) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+ if (isCondBranchOpcode(LastOpc)) {
+ // Block ends with fall-through condbranch.
+ parseCondBranch(LastInst, TBB, Cond);
+ return false;
+ }
+ return true; // Can't handle indirect branch.
+ }
+
+ // Get the instruction before it if it is a terminator.
+ MachineInstr *SecondLastInst = I;
+ unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+ // If AllowModify is true and the block ends with two or more unconditional
+ // branches, delete all but the first unconditional branch.
+ if (AllowModify && isUncondBranchOpcode(LastOpc)) {
+ while (isUncondBranchOpcode(SecondLastOpc)) {
+ LastInst->eraseFromParent();
+ LastInst = SecondLastInst;
+ LastOpc = LastInst->getOpcode();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+        // Return now; the only terminator is an unconditional branch.
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ } else {
+ SecondLastInst = I;
+ SecondLastOpc = SecondLastInst->getOpcode();
+ }
+ }
+ }
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+ return true;
+
+ // If the block ends with a B and a Bcc, handle it.
+ if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ parseCondBranch(SecondLastInst, TBB, Cond);
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two unconditional branches, handle it. The second
+ // one is not executed, so remove it.
+ if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // ...likewise if it ends with an indirect branch followed by an unconditional
+ // branch.
+ if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+bool ARM64InstrInfo::ReverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ if (Cond[0].getImm() != -1) {
+ // Regular Bcc
+ ARM64CC::CondCode CC = (ARM64CC::CondCode)(int)Cond[0].getImm();
+ Cond[0].setImm(ARM64CC::getInvertedCondCode(CC));
+ } else {
+ // Folded compare-and-branch
+ switch (Cond[1].getImm()) {
+ default:
+ llvm_unreachable("Unknown conditional branch!");
+ case ARM64::CBZW:
+ Cond[1].setImm(ARM64::CBNZW);
+ break;
+ case ARM64::CBNZW:
+ Cond[1].setImm(ARM64::CBZW);
+ break;
+ case ARM64::CBZX:
+ Cond[1].setImm(ARM64::CBNZX);
+ break;
+ case ARM64::CBNZX:
+ Cond[1].setImm(ARM64::CBZX);
+ break;
+ case ARM64::TBZ:
+ Cond[1].setImm(ARM64::TBNZ);
+ break;
+ case ARM64::TBNZ:
+ Cond[1].setImm(ARM64::TBZ);
+ break;
+ }
+ }
+
+ return false;
+}
+
+unsigned ARM64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin())
+ return 0;
+ --I;
+ while (I->isDebugValue()) {
+ if (I == MBB.begin())
+ return 0;
+ --I;
+ }
+ if (!isUncondBranchOpcode(I->getOpcode()) &&
+ !isCondBranchOpcode(I->getOpcode()))
+ return 0;
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin())
+ return 1;
+ --I;
+ if (!isCondBranchOpcode(I->getOpcode()))
+ return 1;
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+void ARM64InstrInfo::instantiateCondBranch(
+ MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ if (Cond[0].getImm() != -1) {
+ // Regular Bcc
+ BuildMI(&MBB, DL, get(ARM64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
+ } else {
+ // Folded compare-and-branch
+ const MachineInstrBuilder MIB =
+ BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg());
+ if (Cond.size() > 3)
+ MIB.addImm(Cond[3].getImm());
+ MIB.addMBB(TBB);
+ }
+}
+
+unsigned ARM64InstrInfo::InsertBranch(
+ MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const {
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+
+ if (FBB == 0) {
+ if (Cond.empty()) // Unconditional branch?
+ BuildMI(&MBB, DL, get(ARM64::B)).addMBB(TBB);
+ else
+ instantiateCondBranch(MBB, DL, TBB, Cond);
+ return 1;
+ }
+
+ // Two-way conditional branch.
+ instantiateCondBranch(MBB, DL, TBB, Cond);
+ BuildMI(&MBB, DL, get(ARM64::B)).addMBB(FBB);
+ return 2;
+}
+
+// Find the original register that VReg is copied from.
+static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
+ while (TargetRegisterInfo::isVirtualRegister(VReg)) {
+ const MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ if (!DefMI->isFullCopy())
+ return VReg;
+ VReg = DefMI->getOperand(1).getReg();
+ }
+ return VReg;
+}
+
+// Determine if VReg is defined by an instruction that can be folded into a
+// csel instruction. If so, return the folded opcode, and the replacement
+// register.
+static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
+ unsigned *NewVReg = 0) {
+ VReg = removeCopies(MRI, VReg);
+ if (!TargetRegisterInfo::isVirtualRegister(VReg))
+ return 0;
+
+ bool Is64Bit = ARM64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
+ const MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ unsigned Opc = 0;
+ unsigned SrcOpNum = 0;
+ switch (DefMI->getOpcode()) {
+ case ARM64::ADDSXri:
+ case ARM64::ADDSWri:
+ // if CPSR is used, do not fold.
+ if (DefMI->findRegisterDefOperandIdx(ARM64::CPSR, true) == -1)
+ return 0;
+ // fall-through to ADDXri and ADDWri.
+ case ARM64::ADDXri:
+ case ARM64::ADDWri:
+ // add x, 1 -> csinc.
+ if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
+ DefMI->getOperand(3).getImm() != 0)
+ return 0;
+ SrcOpNum = 1;
+ Opc = Is64Bit ? ARM64::CSINCXr : ARM64::CSINCWr;
+ break;
+
+ case ARM64::ORNXrr:
+ case ARM64::ORNWrr: {
+ // not x -> csinv, represented as orn dst, xzr, src.
+ unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
+ if (ZReg != ARM64::XZR && ZReg != ARM64::WZR)
+ return 0;
+ SrcOpNum = 2;
+ Opc = Is64Bit ? ARM64::CSINVXr : ARM64::CSINVWr;
+ break;
+ }
+
+ case ARM64::SUBSXrr:
+ case ARM64::SUBSWrr:
+ // if CPSR is used, do not fold.
+ if (DefMI->findRegisterDefOperandIdx(ARM64::CPSR, true) == -1)
+ return 0;
+ // fall-through to SUBXrr and SUBWrr.
+ case ARM64::SUBXrr:
+ case ARM64::SUBWrr: {
+ // neg x -> csneg, represented as sub dst, xzr, src.
+ unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
+ if (ZReg != ARM64::XZR && ZReg != ARM64::WZR)
+ return 0;
+ SrcOpNum = 2;
+ Opc = Is64Bit ? ARM64::CSNEGXr : ARM64::CSNEGWr;
+ break;
+ }
+ default:
+ return 0;
+ }
+ assert(Opc && SrcOpNum && "Missing parameters");
+
+ if (NewVReg)
+ *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
+ return Opc;
+}
+
+bool ARM64InstrInfo::canInsertSelect(
+ const MachineBasicBlock &MBB, const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles,
+ int &FalseCycles) const {
+ // Check register classes.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC =
+ RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+ if (!RC)
+ return false;
+
+ // Expanding cbz/tbz requires an extra cycle of latency on the condition.
+ unsigned ExtraCondLat = Cond.size() != 1;
+
+ // GPRs are handled by csel.
+ // FIXME: Fold in x+1, -x, and ~x when applicable.
+ if (ARM64::GPR64allRegClass.hasSubClassEq(RC) ||
+ ARM64::GPR32allRegClass.hasSubClassEq(RC)) {
+ // Single-cycle csel, csinc, csinv, and csneg.
+ CondCycles = 1 + ExtraCondLat;
+ TrueCycles = FalseCycles = 1;
+ if (canFoldIntoCSel(MRI, TrueReg))
+ TrueCycles = 0;
+ else if (canFoldIntoCSel(MRI, FalseReg))
+ FalseCycles = 0;
+ return true;
+ }
+
+ // Scalar floating point is handled by fcsel.
+ // FIXME: Form fabs, fmin, and fmax when applicable.
+ if (ARM64::FPR64RegClass.hasSubClassEq(RC) ||
+ ARM64::FPR32RegClass.hasSubClassEq(RC)) {
+ CondCycles = 5 + ExtraCondLat;
+ TrueCycles = FalseCycles = 2;
+ return true;
+ }
+
+ // Can't do vectors.
+ return false;
+}
+
+void ARM64InstrInfo::insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DstReg,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ // Parse the condition code, see parseCondBranch() above.
+ ARM64CC::CondCode CC;
+ switch (Cond.size()) {
+ default:
+ llvm_unreachable("Unknown condition opcode in Cond");
+ case 1: // b.cc
+ CC = ARM64CC::CondCode(Cond[0].getImm());
+ break;
+ case 3: { // cbz/cbnz
+ // We must insert a compare against 0.
+ bool Is64Bit;
+ switch (Cond[1].getImm()) {
+ default:
+ llvm_unreachable("Unknown branch opcode in Cond");
+ case ARM64::CBZW:
+ Is64Bit = 0;
+ CC = ARM64CC::EQ;
+ break;
+ case ARM64::CBZX:
+ Is64Bit = 1;
+ CC = ARM64CC::EQ;
+ break;
+ case ARM64::CBNZW:
+ Is64Bit = 0;
+ CC = ARM64CC::NE;
+ break;
+ case ARM64::CBNZX:
+ Is64Bit = 1;
+ CC = ARM64CC::NE;
+ break;
+ }
+ unsigned SrcReg = Cond[2].getReg();
+ if (Is64Bit) {
+ // cmp reg, #0 is actually subs xzr, reg, #0.
+ MRI.constrainRegClass(SrcReg, &ARM64::GPR64spRegClass);
+ BuildMI(MBB, I, DL, get(ARM64::SUBSXri), ARM64::XZR)
+ .addReg(SrcReg)
+ .addImm(0)
+ .addImm(0);
+ } else {
+ MRI.constrainRegClass(SrcReg, &ARM64::GPR32spRegClass);
+ BuildMI(MBB, I, DL, get(ARM64::SUBSWri), ARM64::WZR)
+ .addReg(SrcReg)
+ .addImm(0)
+ .addImm(0);
+ }
+ break;
+ }
+ case 4: { // tbz/tbnz
+ // We must insert a tst instruction.
+ switch (Cond[1].getImm()) {
+ default:
+ llvm_unreachable("Unknown branch opcode in Cond");
+ case ARM64::TBZ:
+ CC = ARM64CC::EQ;
+ break;
+ case ARM64::TBNZ:
+ CC = ARM64CC::NE;
+ break;
+ }
+    // tst reg, #(1 << foo) is actually ands xzr, reg, #(1 << foo).
+ BuildMI(MBB, I, DL, get(ARM64::ANDSXri), ARM64::XZR)
+ .addReg(Cond[2].getReg())
+ .addImm(ARM64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
+ break;
+ }
+ }
+
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = 0;
+ bool TryFold = false;
+ if (MRI.constrainRegClass(DstReg, &ARM64::GPR64RegClass)) {
+ RC = &ARM64::GPR64RegClass;
+ Opc = ARM64::CSELXr;
+ TryFold = true;
+ } else if (MRI.constrainRegClass(DstReg, &ARM64::GPR32RegClass)) {
+ RC = &ARM64::GPR32RegClass;
+ Opc = ARM64::CSELWr;
+ TryFold = true;
+ } else if (MRI.constrainRegClass(DstReg, &ARM64::FPR64RegClass)) {
+ RC = &ARM64::FPR64RegClass;
+ Opc = ARM64::FCSELDrrr;
+ } else if (MRI.constrainRegClass(DstReg, &ARM64::FPR32RegClass)) {
+ RC = &ARM64::FPR32RegClass;
+ Opc = ARM64::FCSELSrrr;
+ }
+ assert(RC && "Unsupported regclass");
+
+ // Try folding simple instructions into the csel.
+ if (TryFold) {
+ unsigned NewVReg = 0;
+ unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
+ if (FoldedOpc) {
+      // The folded opcodes csinc, csinv, and csneg apply the operation to
+ // FalseReg, so we need to invert the condition.
+ CC = ARM64CC::getInvertedCondCode(CC);
+ TrueReg = FalseReg;
+ } else
+ FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
+
+ // Fold the operation. Leave any dead instructions for DCE to clean up.
+ if (FoldedOpc) {
+ FalseReg = NewVReg;
+ Opc = FoldedOpc;
+      // This extends the live range of NewVReg.
+ MRI.clearKillFlags(NewVReg);
+ }
+ }
+
+  // Pull all virtual registers into the appropriate class.
+ MRI.constrainRegClass(TrueReg, RC);
+ MRI.constrainRegClass(FalseReg, RC);
+
+ // Insert the csel.
+ BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(TrueReg).addReg(FalseReg).addImm(
+ CC);
+}
+
+bool ARM64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case ARM64::SBFMXri: // aka sxtw
+ case ARM64::UBFMXri: // aka uxtw
+    // Check for the 32 -> 64 bit extension case; these instructions can do
+    // much more.
+ if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
+ return false;
+ // This is a signed or unsigned 32 -> 64 bit extension.
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ SubIdx = ARM64::sub_32;
+ return true;
+ }
+}
+
+/// analyzeCompare - For a comparison instruction, return the source registers
+/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
+/// Return true if the comparison instruction can be analyzed.
+bool ARM64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case ARM64::SUBSWrr:
+ case ARM64::SUBSWrs:
+ case ARM64::SUBSWrx:
+ case ARM64::SUBSXrr:
+ case ARM64::SUBSXrs:
+ case ARM64::SUBSXrx:
+ case ARM64::ADDSWrr:
+ case ARM64::ADDSWrs:
+ case ARM64::ADDSWrx:
+ case ARM64::ADDSXrr:
+ case ARM64::ADDSXrs:
+ case ARM64::ADDSXrx:
+ // Replace SUBSWrr with SUBWrr if CPSR is not used.
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = MI->getOperand(2).getReg();
+ CmpMask = ~0;
+ CmpValue = 0;
+ return true;
+ case ARM64::SUBSWri:
+ case ARM64::ADDSWri:
+ case ARM64::ANDSWri:
+ case ARM64::SUBSXri:
+ case ARM64::ADDSXri:
+ case ARM64::ANDSXri:
+ SrcReg = MI->getOperand(1).getReg();
+ SrcReg2 = 0;
+ CmpMask = ~0;
+ CmpValue = MI->getOperand(2).getImm();
+ return true;
+ }
+
+ return false;
+}
+
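+/// Constrain every register operand of \p Instr to the register class required
+/// by its instruction description, if any. Returns true if all operands could
+/// be constrained and false otherwise.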
+static bool UpdateOperandRegClass(MachineInstr *Instr) {
+ MachineBasicBlock *MBB = Instr->getParent();
+ assert(MBB && "Can't get MachineBasicBlock here");
+ MachineFunction *MF = MBB->getParent();
+ assert(MF && "Can't get MachineFunction here");
+ const TargetMachine *TM = &MF->getTarget();
+ const TargetInstrInfo *TII = TM->getInstrInfo();
+ const TargetRegisterInfo *TRI = TM->getRegisterInfo();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+
+ for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx;
+ ++OpIdx) {
+ MachineOperand &MO = Instr->getOperand(OpIdx);
+ const TargetRegisterClass *OpRegCstraints =
+ Instr->getRegClassConstraint(OpIdx, TII, TRI);
+
+ // If there's no constraint, there's nothing to do.
+ if (!OpRegCstraints)
+ continue;
+ // If the operand is a frame index, there's nothing to do here.
+ // A frame index operand will resolve correctly during PEI.
+ if (MO.isFI())
+ continue;
+
+ assert(MO.isReg() &&
+ "Operand has register constraints without being a register!");
+
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (!OpRegCstraints->contains(Reg))
+ return false;
+ } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
+ !MRI->constrainRegClass(Reg, OpRegCstraints))
+ return false;
+ }
+
+ return true;
+}
+
+/// optimizeCompareInstr - Convert the instruction supplying the argument to the
+/// comparison into one that sets the zero bit in the flags register.
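+/// For example, when the flags are not otherwise used in between:
+///   sub  w1, w2, w3           subs w1, w2, w3
+///   cmp  w1, #0         ==>
+///   b.eq label                b.eq label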
+bool ARM64InstrInfo::optimizeCompareInstr(
+ MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+ int CmpValue, const MachineRegisterInfo *MRI) const {
+
+ // Replace SUBSWrr with SUBWrr if CPSR is not used.
+ int Cmp_CPSR = CmpInstr->findRegisterDefOperandIdx(ARM64::CPSR, true);
+ if (Cmp_CPSR != -1) {
+ unsigned NewOpc;
+ switch (CmpInstr->getOpcode()) {
+ default:
+ return false;
+ case ARM64::ADDSWrr: NewOpc = ARM64::ADDWrr; break;
+ case ARM64::ADDSWri: NewOpc = ARM64::ADDWri; break;
+ case ARM64::ADDSWrs: NewOpc = ARM64::ADDWrs; break;
+ case ARM64::ADDSWrx: NewOpc = ARM64::ADDWrx; break;
+ case ARM64::ADDSXrr: NewOpc = ARM64::ADDXrr; break;
+ case ARM64::ADDSXri: NewOpc = ARM64::ADDXri; break;
+ case ARM64::ADDSXrs: NewOpc = ARM64::ADDXrs; break;
+ case ARM64::ADDSXrx: NewOpc = ARM64::ADDXrx; break;
+ case ARM64::SUBSWrr: NewOpc = ARM64::SUBWrr; break;
+ case ARM64::SUBSWri: NewOpc = ARM64::SUBWri; break;
+ case ARM64::SUBSWrs: NewOpc = ARM64::SUBWrs; break;
+ case ARM64::SUBSWrx: NewOpc = ARM64::SUBWrx; break;
+ case ARM64::SUBSXrr: NewOpc = ARM64::SUBXrr; break;
+ case ARM64::SUBSXri: NewOpc = ARM64::SUBXri; break;
+ case ARM64::SUBSXrs: NewOpc = ARM64::SUBXrs; break;
+ case ARM64::SUBSXrx: NewOpc = ARM64::SUBXrx; break;
+ }
+
+ const MCInstrDesc &MCID = get(NewOpc);
+ CmpInstr->setDesc(MCID);
+ CmpInstr->RemoveOperand(Cmp_CPSR);
+ bool succeeded = UpdateOperandRegClass(CmpInstr);
+ (void)succeeded;
+ assert(succeeded && "Some operands reg class are incompatible!");
+ return true;
+ }
+
+ // Continue only if we have a "ri" where immediate is zero.
+ if (CmpValue != 0 || SrcReg2 != 0)
+ return false;
+
+ // CmpInstr is a compare instruction only if its destination register is not used.
+ if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
+ return false;
+
+ // Get the unique definition of SrcReg.
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+ if (!MI)
+ return false;
+
+ // We iterate backward, starting from the instruction before CmpInstr, and
+ // stop when we reach the definition of the source register or the beginning
+ // of the basic block, to check whether CPSR is used or modified in between.
+ MachineBasicBlock::iterator I = CmpInstr, E = MI,
+ B = CmpInstr->getParent()->begin();
+
+ // Early exit if CmpInstr is at the beginning of the BB.
+ if (I == B)
+ return false;
+
+ // Check whether the definition of SrcReg is in the same basic block as
+ // Compare. If not, we can't optimize away the Compare.
+ if (MI->getParent() != CmpInstr->getParent())
+ return false;
+
+ // Check that CPSR isn't set between the comparison instruction and the one we
+ // want to change.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ for (--I; I != E; --I) {
+ const MachineInstr &Instr = *I;
+
+ if (Instr.modifiesRegister(ARM64::CPSR, TRI) ||
+ Instr.readsRegister(ARM64::CPSR, TRI))
+ // This instruction modifies or uses CPSR after the one we want to
+ // change. We can't do this transformation.
+ return false;
+ if (I == B)
+ // The 'and' is below the comparison instruction.
+ return false;
+ }
+
+ unsigned NewOpc = MI->getOpcode();
+ switch (MI->getOpcode()) {
+ default:
+ return false;
+ case ARM64::ADDSWrr:
+ case ARM64::ADDSWri:
+ case ARM64::ADDSXrr:
+ case ARM64::ADDSXri:
+ case ARM64::SUBSWrr:
+ case ARM64::SUBSWri:
+ case ARM64::SUBSXrr:
+ case ARM64::SUBSXri:
+ break;
+ case ARM64::ADDWrr: NewOpc = ARM64::ADDSWrr; break;
+ case ARM64::ADDWri: NewOpc = ARM64::ADDSWri; break;
+ case ARM64::ADDXrr: NewOpc = ARM64::ADDSXrr; break;
+ case ARM64::ADDXri: NewOpc = ARM64::ADDSXri; break;
+ case ARM64::ADCWr: NewOpc = ARM64::ADCSWr; break;
+ case ARM64::ADCXr: NewOpc = ARM64::ADCSXr; break;
+ case ARM64::SUBWrr: NewOpc = ARM64::SUBSWrr; break;
+ case ARM64::SUBWri: NewOpc = ARM64::SUBSWri; break;
+ case ARM64::SUBXrr: NewOpc = ARM64::SUBSXrr; break;
+ case ARM64::SUBXri: NewOpc = ARM64::SUBSXri; break;
+ case ARM64::SBCWr: NewOpc = ARM64::SBCSWr; break;
+ case ARM64::SBCXr: NewOpc = ARM64::SBCSXr; break;
+ case ARM64::ANDWri: NewOpc = ARM64::ANDSWri; break;
+ case ARM64::ANDXri: NewOpc = ARM64::ANDSXri; break;
+ }
+
+ // Scan forward for the use of CPSR.
+ // When checking against MI: if the condition code used requires checking of
+ // the V bit, then this transformation is not safe to do.
+ // It is safe to remove CmpInstr if CPSR is redefined or killed.
+ // If we are done with the basic block, we need to check whether CPSR is
+ // live-out.
+ bool IsSafe = false;
+ for (MachineBasicBlock::iterator I = CmpInstr,
+ E = CmpInstr->getParent()->end();
+ !IsSafe && ++I != E;) {
+ const MachineInstr &Instr = *I;
+ for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO;
+ ++IO) {
+ const MachineOperand &MO = Instr.getOperand(IO);
+ if (MO.isRegMask() && MO.clobbersPhysReg(ARM64::CPSR)) {
+ IsSafe = true;
+ break;
+ }
+ if (!MO.isReg() || MO.getReg() != ARM64::CPSR)
+ continue;
+ if (MO.isDef()) {
+ IsSafe = true;
+ break;
+ }
+
+ // Decode the condition code.
+ unsigned Opc = Instr.getOpcode();
+ ARM64CC::CondCode CC;
+ switch (Opc) {
+ default:
+ return false;
+ case ARM64::Bcc:
+ CC = (ARM64CC::CondCode)Instr.getOperand(IO - 2).getImm();
+ break;
+ case ARM64::CSINVWr:
+ case ARM64::CSINVXr:
+ case ARM64::CSINCWr:
+ case ARM64::CSINCXr:
+ case ARM64::CSELWr:
+ case ARM64::CSELXr:
+ case ARM64::CSNEGWr:
+ case ARM64::CSNEGXr:
+ CC = (ARM64CC::CondCode)Instr.getOperand(IO - 1).getImm();
+ break;
+ }
+
+ // It is not safe to remove Compare instruction if Overflow(V) is used.
+ switch (CC) {
+ default:
+ // CPSR can be used multiple times; we should continue.
+ break;
+ case ARM64CC::VS:
+ case ARM64CC::VC:
+ case ARM64CC::GE:
+ case ARM64CC::LT:
+ case ARM64CC::GT:
+ case ARM64CC::LE:
+ return false;
+ }
+ }
+ }
+
+ // If CPSR is not killed nor re-defined, we should check whether it is
+ // live-out. If it is live-out, do not optimize.
+ if (!IsSafe) {
+ MachineBasicBlock *MBB = CmpInstr->getParent();
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end();
+ SI != SE; ++SI)
+ if ((*SI)->isLiveIn(ARM64::CPSR))
+ return false;
+ }
+
+ // Update the instruction to set CPSR.
+ MI->setDesc(get(NewOpc));
+ CmpInstr->eraseFromParent();
+ bool succeeded = UpdateOperandRegClass(MI);
+ (void)succeeded;
+ assert(succeeded && "Some operands reg class are incompatible!");
+ MI->addRegisterDefined(ARM64::CPSR, TRI);
+ return true;
+}
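+// For example, when "%w2 = ADDWri %w3, 1, 0" is followed in the same block by
+// a flag-setting compare of %w2 against zero (a SUBSWri with a dead result),
+// the ADD is promoted to ADDSWri and the compare is erased, provided CPSR is
+// neither read nor modified between the two, is not live-out unless redefined,
+// and no later user of CPSR needs a condition that reads the overflow (V) bit.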
+
+// Return true if this instruction simply sets its single destination register
+// to zero. This is equivalent to a register rename of the zero-register.
+bool ARM64InstrInfo::isGPRZero(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case ARM64::MOVZWi:
+ case ARM64::MOVZXi: // movz Rd, #0 (LSL #0)
+ if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) {
+ assert(MI->getDesc().getNumOperands() == 3 &&
+ MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands");
+ return true;
+ }
+ break;
+ case ARM64::ANDWri: // and Rd, Rzr, #imm
+ return MI->getOperand(1).getReg() == ARM64::WZR;
+ case ARM64::ANDXri:
+ return MI->getOperand(1).getReg() == ARM64::XZR;
+ case TargetOpcode::COPY:
+ return MI->getOperand(1).getReg() == ARM64::WZR;
+ }
+ return false;
+}
+
+// Return true if this instruction simply renames a general register without
+// modifying bits.
+bool ARM64InstrInfo::isGPRCopy(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case TargetOpcode::COPY: {
+ // GPR32 copies will be lowered to ORRXrs
+ unsigned DstReg = MI->getOperand(0).getReg();
+ return (ARM64::GPR32RegClass.contains(DstReg) ||
+ ARM64::GPR64RegClass.contains(DstReg));
+ }
+ case ARM64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
+ if (MI->getOperand(1).getReg() == ARM64::XZR) {
+ assert(MI->getDesc().getNumOperands() == 4 &&
+ MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands");
+ return true;
+ }
+ break;
+ case ARM64::ADDXri: // add Xd, Xn, #0 (LSL #0)
+ if (MI->getOperand(2).getImm() == 0) {
+ assert(MI->getDesc().getNumOperands() == 4 &&
+ MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands");
+ return true;
+ }
+ }
+ return false;
+}
+
+// Return true if this instruction simply renames a general register without
+// modifying bits.
+bool ARM64InstrInfo::isFPRCopy(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case TargetOpcode::COPY: {
+ // FPR64 copies will be lowered to ORR.16b
+ unsigned DstReg = MI->getOperand(0).getReg();
+ return (ARM64::FPR64RegClass.contains(DstReg) ||
+ ARM64::FPR128RegClass.contains(DstReg));
+ }
+ case ARM64::ORRv16i8:
+ if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+ assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() &&
+ "invalid ORRv16i8 operands");
+ return true;
+ }
+ }
+ return false;
+}
+
+unsigned ARM64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case ARM64::LDRWui:
+ case ARM64::LDRXui:
+ case ARM64::LDRBui:
+ case ARM64::LDRHui:
+ case ARM64::LDRSui:
+ case ARM64::LDRDui:
+ case ARM64::LDRQui:
+ if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+
+ return 0;
+}
+
+unsigned ARM64InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case ARM64::STRWui:
+ case ARM64::STRXui:
+ case ARM64::STRBui:
+ case ARM64::STRHui:
+ case ARM64::STRSui:
+ case ARM64::STRDui:
+ case ARM64::STRQui:
+ if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
+ MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+/// Return true if this load/store scales or extends its register offset.
+/// This refers to scaling a dynamic index as opposed to scaled immediates.
+/// MI should be a memory op that allows scaled addressing.
+bool ARM64InstrInfo::isScaledAddr(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case ARM64::LDRBBro:
+ case ARM64::LDRBro:
+ case ARM64::LDRDro:
+ case ARM64::LDRHHro:
+ case ARM64::LDRHro:
+ case ARM64::LDRQro:
+ case ARM64::LDRSBWro:
+ case ARM64::LDRSBXro:
+ case ARM64::LDRSHWro:
+ case ARM64::LDRSHXro:
+ case ARM64::LDRSWro:
+ case ARM64::LDRSro:
+ case ARM64::LDRWro:
+ case ARM64::LDRXro:
+ case ARM64::STRBBro:
+ case ARM64::STRBro:
+ case ARM64::STRDro:
+ case ARM64::STRHHro:
+ case ARM64::STRHro:
+ case ARM64::STRQro:
+ case ARM64::STRSro:
+ case ARM64::STRWro:
+ case ARM64::STRXro:
+ unsigned Val = MI->getOperand(3).getImm();
+ ARM64_AM::ExtendType ExtType = ARM64_AM::getMemExtendType(Val);
+ return (ExtType != ARM64_AM::UXTX) || ARM64_AM::getMemDoShift(Val);
+ }
+ return false;
+}
+
+/// Check all MachineMemOperands for a hint to suppress pairing.
+bool ARM64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const {
+ assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
+ "Too many target MO flags");
+ for (MachineInstr::mmo_iterator MM = MI->memoperands_begin(),
+ E = MI->memoperands_end();
+ MM != E; ++MM) {
+
+ if ((*MM)->getFlags() &
+ (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Set a flag on the first MachineMemOperand to suppress pairing.
+void ARM64InstrInfo::suppressLdStPair(MachineInstr *MI) const {
+ if (MI->memoperands_empty())
+ return;
+
+ assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
+ "Too many target MO flags");
+ (*MI->memoperands_begin())
+ ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit);
+}
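+// For example, after suppressLdStPair(MI) the first memory operand of MI
+// carries MOSuppressPair << MOTargetStartBit in its flags, so
+// isLdStPairSuppressed(MI) above returns true, marking pairing of MI as
+// unprofitable.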
+
+bool ARM64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
+ unsigned &Offset,
+ const TargetRegisterInfo *TRI) const {
+ switch (LdSt->getOpcode()) {
+ default:
+ return false;
+ case ARM64::STRSui:
+ case ARM64::STRDui:
+ case ARM64::STRQui:
+ case ARM64::STRXui:
+ case ARM64::STRWui:
+ case ARM64::LDRSui:
+ case ARM64::LDRDui:
+ case ARM64::LDRQui:
+ case ARM64::LDRXui:
+ case ARM64::LDRWui:
+ if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
+ return false;
+ BaseReg = LdSt->getOperand(1).getReg();
+ MachineFunction &MF = *LdSt->getParent()->getParent();
+ unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize();
+ Offset = LdSt->getOperand(2).getImm() * Width;
+ return true;
+ };
+}
+
+/// Detect opportunities for ldp/stp formation.
+///
+/// Only called for LdSt for which getLdStBaseRegImmOfs returns true.
+bool ARM64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
+ MachineInstr *SecondLdSt,
+ unsigned NumLoads) const {
+ // Only cluster up to a single pair.
+ if (NumLoads > 1)
+ return false;
+ if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode())
+ return false;
+ // getLdStBaseRegImmOfs guarantees that operand 2 isImm.
+ unsigned Ofs1 = FirstLdSt->getOperand(2).getImm();
+ // Allow 6 bits of positive range.
+ if (Ofs1 > 64)
+ return false;
+ // The caller should already have ordered First/SecondLdSt by offset.
+ unsigned Ofs2 = SecondLdSt->getOperand(2).getImm();
+ return Ofs1 + 1 == Ofs2;
+}
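+// For example, "LDRXui %x0, %x20, 2" and "LDRXui %x1, %x20, 3" share an opcode
+// and have consecutive scaled offsets (2 and 3) within the positive range
+// checked above, so they are clustered as an ldp formation candidate.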
+
+bool ARM64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
+ MachineInstr *Second) const {
+ // Cyclone can fuse CMN, CMP followed by Bcc.
+
+ // FIXME: B0 can also fuse:
+ // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ.
+ if (Second->getOpcode() != ARM64::Bcc)
+ return false;
+ switch (First->getOpcode()) {
+ default:
+ return false;
+ case ARM64::SUBSWri:
+ case ARM64::ADDSWri:
+ case ARM64::ANDSWri:
+ case ARM64::SUBSXri:
+ case ARM64::ADDSXri:
+ case ARM64::ANDSXri:
+ return true;
+ }
+}
+
+MachineInstr *ARM64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+ int FrameIx,
+ uint64_t Offset,
+ const MDNode *MDPtr,
+ DebugLoc DL) const {
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM64::DBG_VALUE))
+ .addFrameIndex(FrameIx)
+ .addImm(0)
+ .addImm(Offset)
+ .addMetadata(MDPtr);
+ return &*MIB;
+}
+
+static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
+ unsigned Reg, unsigned SubIdx,
+ unsigned State,
+ const TargetRegisterInfo *TRI) {
+ if (!SubIdx)
+ return MIB.addReg(Reg, State);
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
+ return MIB.addReg(Reg, State, SubIdx);
+}
+
+static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
+ unsigned NumRegs) {
+ // We really want the positive remainder mod 32 here, which happens to be
+ // easily obtainable with a mask.
+ return ((DestReg - SrcReg) & 0x1f) < NumRegs;
+}
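+// For example, copying a four-register tuple whose destination starts at D30
+// and whose source starts at D28 gives (30 - 28) & 0x1f == 2 < 4: a forward
+// copy would overwrite D30 and D31 before they are read, so copyPhysRegTuple
+// below walks the sub-registers in reverse. With a destination starting at D2
+// and a source starting at D30, (2 - 30) & 0x1f == 4, and the forward order is
+// safe.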
+
+void ARM64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ DebugLoc DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc,
+ unsigned Opcode,
+ llvm::ArrayRef<unsigned> Indices) const {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
+ uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
+ unsigned NumRegs = Indices.size();
+
+ int SubReg = 0, End = NumRegs, Incr = 1;
+ if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
+ SubReg = NumRegs - 1;
+ End = -1;
+ Incr = -1;
+ }
+
+ for (; SubReg != End; SubReg += Incr) {
+ const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode));
+ AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
+ AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
+ AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
+ }
+}
+
+void ARM64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const {
+ if (ARM64::GPR32spRegClass.contains(DestReg) &&
+ (ARM64::GPR32spRegClass.contains(SrcReg) || SrcReg == ARM64::WZR)) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ if (DestReg == ARM64::WSP || SrcReg == ARM64::WSP) {
+ // If either operand is WSP, expand to ADD #0.
+ if (Subtarget.hasZeroCycleRegMove()) {
+ // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
+ unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, ARM64::sub_32,
+ &ARM64::GPR64spRegClass);
+ unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, ARM64::sub_32,
+ &ARM64::GPR64spRegClass);
+ // This instruction is reading and writing X registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegX, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(ARM64::ADDXri), DestRegX)
+ .addReg(SrcRegX, RegState::Undef)
+ .addImm(0)
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0))
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(ARM64::ADDWri), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0)
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0));
+ }
+ } else if (SrcReg == ARM64::WZR && Subtarget.hasZeroCycleZeroing()) {
+ BuildMI(MBB, I, DL, get(ARM64::MOVZWi), DestReg).addImm(0).addImm(
+ ARM64_AM::getShifterImm(ARM64_AM::LSL, 0));
+ } else {
+ if (Subtarget.hasZeroCycleRegMove()) {
+ // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
+ unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, ARM64::sub_32,
+ &ARM64::GPR64spRegClass);
+ unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, ARM64::sub_32,
+ &ARM64::GPR64spRegClass);
+ // This instruction is reading and writing X registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegX, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(ARM64::ORRXrr), DestRegX)
+ .addReg(ARM64::XZR)
+ .addReg(SrcRegX, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ // Otherwise, expand to ORR WZR.
+ BuildMI(MBB, I, DL, get(ARM64::ORRWrr), DestReg)
+ .addReg(ARM64::WZR)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ }
+ return;
+ }
+
+ if (ARM64::GPR64spRegClass.contains(DestReg) &&
+ (ARM64::GPR64spRegClass.contains(SrcReg) || SrcReg == ARM64::XZR)) {
+ if (DestReg == ARM64::SP || SrcReg == ARM64::SP) {
+ // If either operand is SP, expand to ADD #0.
+ BuildMI(MBB, I, DL, get(ARM64::ADDXri), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0)
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0));
+ } else if (SrcReg == ARM64::XZR && Subtarget.hasZeroCycleZeroing()) {
+ BuildMI(MBB, I, DL, get(ARM64::MOVZXi), DestReg).addImm(0).addImm(
+ ARM64_AM::getShifterImm(ARM64_AM::LSL, 0));
+ } else {
+ // Otherwise, expand to ORR XZR.
+ BuildMI(MBB, I, DL, get(ARM64::ORRXrr), DestReg)
+ .addReg(ARM64::XZR)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ return;
+ }
+
+ // Copy a DDDD register quad by copying the individual sub-registers.
+ if (ARM64::DDDDRegClass.contains(DestReg) &&
+ ARM64::DDDDRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1,
+ ARM64::dsub2, ARM64::dsub3 };
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8,
+ Indices);
+ return;
+ }
+
+ // Copy a DDD register triple by copying the individual sub-registers.
+ if (ARM64::DDDRegClass.contains(DestReg) &&
+ ARM64::DDDRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1,
+ ARM64::dsub2 };
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8,
+ Indices);
+ return;
+ }
+
+ // Copy a DD register pair by copying the individual sub-registers.
+ if (ARM64::DDRegClass.contains(DestReg) &&
+ ARM64::DDRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1 };
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8,
+ Indices);
+ return;
+ }
+
+ // Copy a QQQQ register quad by copying the individual sub-registers.
+ if (ARM64::QQQQRegClass.contains(DestReg) &&
+ ARM64::QQQQRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1,
+ ARM64::qsub2, ARM64::qsub3 };
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8,
+ Indices);
+ return;
+ }
+
+ // Copy a QQQ register triple by copying the individual sub-registers.
+ if (ARM64::QQQRegClass.contains(DestReg) &&
+ ARM64::QQQRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1,
+ ARM64::qsub2 };
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8,
+ Indices);
+ return;
+ }
+
+ // Copy a QQ register pair by copying the individual sub-registers.
+ if (ARM64::QQRegClass.contains(DestReg) &&
+ ARM64::QQRegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1 };
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8,
+ Indices);
+ return;
+ }
+
+ if (ARM64::FPR128RegClass.contains(DestReg) &&
+ ARM64::FPR128RegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg(
+ SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (ARM64::FPR64RegClass.contains(DestReg) &&
+ ARM64::FPR64RegClass.contains(SrcReg)) {
+ DestReg =
+ RI.getMatchingSuperReg(DestReg, ARM64::dsub, &ARM64::FPR128RegClass);
+ SrcReg =
+ RI.getMatchingSuperReg(SrcReg, ARM64::dsub, &ARM64::FPR128RegClass);
+ BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg(
+ SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (ARM64::FPR32RegClass.contains(DestReg) &&
+ ARM64::FPR32RegClass.contains(SrcReg)) {
+ DestReg =
+ RI.getMatchingSuperReg(DestReg, ARM64::ssub, &ARM64::FPR128RegClass);
+ SrcReg =
+ RI.getMatchingSuperReg(SrcReg, ARM64::ssub, &ARM64::FPR128RegClass);
+ BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg(
+ SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (ARM64::FPR16RegClass.contains(DestReg) &&
+ ARM64::FPR16RegClass.contains(SrcReg)) {
+ DestReg =
+ RI.getMatchingSuperReg(DestReg, ARM64::hsub, &ARM64::FPR128RegClass);
+ SrcReg =
+ RI.getMatchingSuperReg(SrcReg, ARM64::hsub, &ARM64::FPR128RegClass);
+ BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg(
+ SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (ARM64::FPR8RegClass.contains(DestReg) &&
+ ARM64::FPR8RegClass.contains(SrcReg)) {
+ DestReg =
+ RI.getMatchingSuperReg(DestReg, ARM64::bsub, &ARM64::FPR128RegClass);
+ SrcReg =
+ RI.getMatchingSuperReg(SrcReg, ARM64::bsub, &ARM64::FPR128RegClass);
+ BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg(
+ SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ // Copies between GPR64 and FPR64.
+ if (ARM64::FPR64RegClass.contains(DestReg) &&
+ ARM64::GPR64RegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(ARM64::FMOVXDr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+ if (ARM64::GPR64RegClass.contains(DestReg) &&
+ ARM64::FPR64RegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(ARM64::FMOVDXr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+ // Copies between GPR32 and FPR32.
+ if (ARM64::FPR32RegClass.contains(DestReg) &&
+ ARM64::GPR32RegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(ARM64::FMOVWSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+ if (ARM64::GPR32RegClass.contains(DestReg) &&
+ ARM64::FPR32RegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(ARM64::FMOVSWr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ assert(0 && "unimplemented reg-to-reg copy");
+}
+
+void ARM64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL;
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+ unsigned Align = MFI.getObjectAlignment(FI);
+
+ MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
+ unsigned Opc = 0;
+ bool Offset = true;
+ switch (RC->getSize()) {
+ case 1:
+ if (ARM64::FPR8RegClass.hasSubClassEq(RC))
+ Opc = ARM64::STRBui;
+ break;
+ case 2:
+ if (ARM64::FPR16RegClass.hasSubClassEq(RC))
+ Opc = ARM64::STRHui;
+ break;
+ case 4:
+ if (ARM64::GPR32allRegClass.hasSubClassEq(RC)) {
+ Opc = ARM64::STRWui;
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+ MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR32RegClass);
+ else
+ assert(SrcReg != ARM64::WSP);
+ } else if (ARM64::FPR32RegClass.hasSubClassEq(RC))
+ Opc = ARM64::STRSui;
+ break;
+ case 8:
+ if (ARM64::GPR64allRegClass.hasSubClassEq(RC)) {
+ Opc = ARM64::STRXui;
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+ MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR64RegClass);
+ else
+ assert(SrcReg != ARM64::SP);
+ } else if (ARM64::FPR64RegClass.hasSubClassEq(RC))
+ Opc = ARM64::STRDui;
+ break;
+ case 16:
+ if (ARM64::FPR128RegClass.hasSubClassEq(RC))
+ Opc = ARM64::STRQui;
+ else if (ARM64::DDRegClass.hasSubClassEq(RC))
+ Opc = ARM64::ST1Twov1d, Offset = false;
+ break;
+ case 24:
+ if (ARM64::DDDRegClass.hasSubClassEq(RC))
+ Opc = ARM64::ST1Threev1d, Offset = false;
+ break;
+ case 32:
+ if (ARM64::DDDDRegClass.hasSubClassEq(RC))
+ Opc = ARM64::ST1Fourv1d, Offset = false;
+ else if (ARM64::QQRegClass.hasSubClassEq(RC))
+ Opc = ARM64::ST1Twov2d, Offset = false;
+ break;
+ case 48:
+ if (ARM64::QQQRegClass.hasSubClassEq(RC))
+ Opc = ARM64::ST1Threev2d, Offset = false;
+ break;
+ case 64:
+ if (ARM64::QQQQRegClass.hasSubClassEq(RC))
+ Opc = ARM64::ST1Fourv2d, Offset = false;
+ break;
+ }
+ assert(Opc && "Unknown register class");
+
+ const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI);
+
+ if (Offset)
+ MI.addImm(0);
+ MI.addMemOperand(MMO);
+}
+
+void ARM64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL;
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+ unsigned Align = MFI.getObjectAlignment(FI);
+ MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
+
+ unsigned Opc = 0;
+ bool Offset = true;
+ switch (RC->getSize()) {
+ case 1:
+ if (ARM64::FPR8RegClass.hasSubClassEq(RC))
+ Opc = ARM64::LDRBui;
+ break;
+ case 2:
+ if (ARM64::FPR16RegClass.hasSubClassEq(RC))
+ Opc = ARM64::LDRHui;
+ break;
+ case 4:
+ if (ARM64::GPR32allRegClass.hasSubClassEq(RC)) {
+ Opc = ARM64::LDRWui;
+ if (TargetRegisterInfo::isVirtualRegister(DestReg))
+ MF.getRegInfo().constrainRegClass(DestReg, &ARM64::GPR32RegClass);
+ else
+ assert(DestReg != ARM64::WSP);
+ } else if (ARM64::FPR32RegClass.hasSubClassEq(RC))
+ Opc = ARM64::LDRSui;
+ break;
+ case 8:
+ if (ARM64::GPR64allRegClass.hasSubClassEq(RC)) {
+ Opc = ARM64::LDRXui;
+ if (TargetRegisterInfo::isVirtualRegister(DestReg))
+ MF.getRegInfo().constrainRegClass(DestReg, &ARM64::GPR64RegClass);
+ else
+ assert(DestReg != ARM64::SP);
+ } else if (ARM64::FPR64RegClass.hasSubClassEq(RC))
+ Opc = ARM64::LDRDui;
+ break;
+ case 16:
+ if (ARM64::FPR128RegClass.hasSubClassEq(RC))
+ Opc = ARM64::LDRQui;
+ else if (ARM64::DDRegClass.hasSubClassEq(RC))
+ Opc = ARM64::LD1Twov1d, Offset = false;
+ break;
+ case 24:
+ if (ARM64::DDDRegClass.hasSubClassEq(RC))
+ Opc = ARM64::LD1Threev1d, Offset = false;
+ break;
+ case 32:
+ if (ARM64::DDDDRegClass.hasSubClassEq(RC))
+ Opc = ARM64::LD1Fourv1d, Offset = false;
+ else if (ARM64::QQRegClass.hasSubClassEq(RC))
+ Opc = ARM64::LD1Twov2d, Offset = false;
+ break;
+ case 48:
+ if (ARM64::QQQRegClass.hasSubClassEq(RC))
+ Opc = ARM64::LD1Threev2d, Offset = false;
+ break;
+ case 64:
+ if (ARM64::QQQQRegClass.hasSubClassEq(RC))
+ Opc = ARM64::LD1Fourv2d, Offset = false;
+ break;
+ }
+ assert(Opc && "Unknown register class");
+
+ const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc))
+ .addReg(DestReg, getDefRegState(true))
+ .addFrameIndex(FI);
+ if (Offset)
+ MI.addImm(0);
+ MI.addMemOperand(MMO);
+}
+
+void llvm::emitFrameOffset(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg, int Offset,
+ const ARM64InstrInfo *TII, MachineInstr::MIFlag Flag,
+ bool SetCPSR) {
+ if (DestReg == SrcReg && Offset == 0)
+ return;
+
+ bool isSub = Offset < 0;
+ if (isSub)
+ Offset = -Offset;
+
+ // FIXME: If the offset won't fit in 24-bits, compute the offset into a
+ // scratch register. If DestReg is a virtual register, use it as the
+ // scratch register; otherwise, create a new virtual register (to be
+ // replaced by the scavenger at the end of PEI). That case can be optimized
+ // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
+ // register can be loaded with offset%8 and the add/sub can use an extending
+ // instruction with LSL#3.
+ // Currently the function handles any offsets but generates a poor sequence
+ // of code.
+ // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
+
+ unsigned Opc;
+ if (SetCPSR)
+ Opc = isSub ? ARM64::SUBSXri : ARM64::ADDSXri;
+ else
+ Opc = isSub ? ARM64::SUBXri : ARM64::ADDXri;
+ const unsigned MaxEncoding = 0xfff;
+ const unsigned ShiftSize = 12;
+ const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
+ while (((unsigned)Offset) >= (1 << ShiftSize)) {
+ unsigned ThisVal;
+ if (((unsigned)Offset) > MaxEncodableValue) {
+ ThisVal = MaxEncodableValue;
+ } else {
+ ThisVal = Offset & MaxEncodableValue;
+ }
+ assert((ThisVal >> ShiftSize) <= MaxEncoding &&
+ "Encoding cannot handle value that big");
+ BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+ .addReg(SrcReg)
+ .addImm(ThisVal >> ShiftSize)
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftSize))
+ .setMIFlag(Flag);
+
+ SrcReg = DestReg;
+ Offset -= ThisVal;
+ if (Offset == 0)
+ return;
+ }
+ BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+ .addReg(SrcReg)
+ .addImm(Offset)
+ .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0))
+ .setMIFlag(Flag);
+}
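+// For example, emitting a frame offset of 0x12345 with the loop above produces
+// "ADD Xd, Xn, #0x12, lsl #12" for the high chunk (0x12000) followed by
+// "ADD Xd, Xd, #0x345" for the remainder, since each ADD/SUB immediate encodes
+// at most 12 bits with an optional 12-bit left shift.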
+
+MachineInstr *
+ARM64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const {
+ // This is a bit of a hack. Consider this instruction:
+ //
+ // %vreg0<def> = COPY %SP; GPR64all:%vreg0
+ //
+ // We explicitly chose GPR64all for the virtual register so such a copy might
+ // be eliminated by RegisterCoalescer. However, that may not be possible, and
+ // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all
+ // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
+ //
+ // To prevent that, we are going to constrain the %vreg0 register class here.
+ //
+ // <rdar://problem/11522048>
+ //
+ if (MI->isCopy()) {
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ if (SrcReg == ARM64::SP && TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ MF.getRegInfo().constrainRegClass(DstReg, &ARM64::GPR64RegClass);
+ return 0;
+ }
+ if (DstReg == ARM64::SP && TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR64RegClass);
+ return 0;
+ }
+ }
+
+ // Cannot fold.
+ return 0;
+}
+
+int llvm::isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+ bool *OutUseUnscaledOp,
+ unsigned *OutUnscaledOp,
+ int *EmittableOffset) {
+ int Scale = 1;
+ bool IsSigned = false;
+ // The ImmIdx should be changed case by case if it is not 2.
+ unsigned ImmIdx = 2;
+ unsigned UnscaledOp = 0;
+ // Set output values in case of early exit.
+ if (EmittableOffset)
+ *EmittableOffset = 0;
+ if (OutUseUnscaledOp)
+ *OutUseUnscaledOp = false;
+ if (OutUnscaledOp)
+ *OutUnscaledOp = 0;
+ switch (MI.getOpcode()) {
+ default:
+ assert(0 && "unhandled opcode in isARM64FrameOffsetLegal");
+ // Vector spills/fills can't take an immediate offset.
+ case ARM64::LD1Twov2d:
+ case ARM64::LD1Threev2d:
+ case ARM64::LD1Fourv2d:
+ case ARM64::LD1Twov1d:
+ case ARM64::LD1Threev1d:
+ case ARM64::LD1Fourv1d:
+ case ARM64::ST1Twov2d:
+ case ARM64::ST1Threev2d:
+ case ARM64::ST1Fourv2d:
+ case ARM64::ST1Twov1d:
+ case ARM64::ST1Threev1d:
+ case ARM64::ST1Fourv1d:
+ return ARM64FrameOffsetCannotUpdate;
+ case ARM64::PRFMui:
+ Scale = 8;
+ UnscaledOp = ARM64::PRFUMi;
+ break;
+ case ARM64::LDRXui:
+ Scale = 8;
+ UnscaledOp = ARM64::LDURXi;
+ break;
+ case ARM64::LDRWui:
+ Scale = 4;
+ UnscaledOp = ARM64::LDURWi;
+ break;
+ case ARM64::LDRBui:
+ Scale = 1;
+ UnscaledOp = ARM64::LDURBi;
+ break;
+ case ARM64::LDRHui:
+ Scale = 2;
+ UnscaledOp = ARM64::LDURHi;
+ break;
+ case ARM64::LDRSui:
+ Scale = 4;
+ UnscaledOp = ARM64::LDURSi;
+ break;
+ case ARM64::LDRDui:
+ Scale = 8;
+ UnscaledOp = ARM64::LDURDi;
+ break;
+ case ARM64::LDRQui:
+ Scale = 16;
+ UnscaledOp = ARM64::LDURQi;
+ break;
+ case ARM64::LDRBBui:
+ Scale = 1;
+ UnscaledOp = ARM64::LDURBBi;
+ break;
+ case ARM64::LDRHHui:
+ Scale = 2;
+ UnscaledOp = ARM64::LDURHHi;
+ break;
+ case ARM64::LDRSBXui:
+ Scale = 1;
+ UnscaledOp = ARM64::LDURSBXi;
+ break;
+ case ARM64::LDRSBWui:
+ Scale = 1;
+ UnscaledOp = ARM64::LDURSBWi;
+ break;
+ case ARM64::LDRSHXui:
+ Scale = 2;
+ UnscaledOp = ARM64::LDURSHXi;
+ break;
+ case ARM64::LDRSHWui:
+ Scale = 2;
+ UnscaledOp = ARM64::LDURSHWi;
+ break;
+ case ARM64::LDRSWui:
+ Scale = 4;
+ UnscaledOp = ARM64::LDURSWi;
+ break;
+
+ case ARM64::STRXui:
+ Scale = 8;
+ UnscaledOp = ARM64::STURXi;
+ break;
+ case ARM64::STRWui:
+ Scale = 4;
+ UnscaledOp = ARM64::STURWi;
+ break;
+ case ARM64::STRBui:
+ Scale = 1;
+ UnscaledOp = ARM64::STURBi;
+ break;
+ case ARM64::STRHui:
+ Scale = 2;
+ UnscaledOp = ARM64::STURHi;
+ break;
+ case ARM64::STRSui:
+ Scale = 4;
+ UnscaledOp = ARM64::STURSi;
+ break;
+ case ARM64::STRDui:
+ Scale = 8;
+ UnscaledOp = ARM64::STURDi;
+ break;
+ case ARM64::STRQui:
+ Scale = 16;
+ UnscaledOp = ARM64::STURQi;
+ break;
+ case ARM64::STRBBui:
+ Scale = 1;
+ UnscaledOp = ARM64::STURBBi;
+ break;
+ case ARM64::STRHHui:
+ Scale = 2;
+ UnscaledOp = ARM64::STURHHi;
+ break;
+
+ case ARM64::LDPXi:
+ case ARM64::LDPDi:
+ case ARM64::STPXi:
+ case ARM64::STPDi:
+ IsSigned = true;
+ Scale = 8;
+ break;
+ case ARM64::LDPQi:
+ case ARM64::STPQi:
+ IsSigned = true;
+ Scale = 16;
+ break;
+ case ARM64::LDPWi:
+ case ARM64::LDPSi:
+ case ARM64::STPWi:
+ case ARM64::STPSi:
+ IsSigned = true;
+ Scale = 4;
+ break;
+
+ case ARM64::LDURXi:
+ case ARM64::LDURWi:
+ case ARM64::LDURBi:
+ case ARM64::LDURHi:
+ case ARM64::LDURSi:
+ case ARM64::LDURDi:
+ case ARM64::LDURQi:
+ case ARM64::LDURHHi:
+ case ARM64::LDURBBi:
+ case ARM64::LDURSBXi:
+ case ARM64::LDURSBWi:
+ case ARM64::LDURSHXi:
+ case ARM64::LDURSHWi:
+ case ARM64::LDURSWi:
+ case ARM64::STURXi:
+ case ARM64::STURWi:
+ case ARM64::STURBi:
+ case ARM64::STURHi:
+ case ARM64::STURSi:
+ case ARM64::STURDi:
+ case ARM64::STURQi:
+ case ARM64::STURBBi:
+ case ARM64::STURHHi:
+ Scale = 1;
+ break;
+ }
+
+ Offset += MI.getOperand(ImmIdx).getImm() * Scale;
+
+ bool useUnscaledOp = false;
+ // If the offset doesn't match the scale, we rewrite the instruction to
+ // use the unscaled instruction instead. Likewise, if we have a negative
+ // offset (and have an unscaled op to use).
+ if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0))
+ useUnscaledOp = true;
+
+ // Use an unscaled addressing mode if the instruction has a negative offset
+ // (or if the instruction is already using an unscaled addressing mode).
+ unsigned MaskBits;
+ if (IsSigned) {
+ // ldp/stp instructions.
+ MaskBits = 7;
+ Offset /= Scale;
+ } else if (UnscaledOp == 0 || useUnscaledOp) {
+ MaskBits = 9;
+ IsSigned = true;
+ Scale = 1;
+ } else {
+ MaskBits = 12;
+ IsSigned = false;
+ Offset /= Scale;
+ }
+
+ // Attempt to fold address computation.
+ int MaxOff = (1 << (MaskBits - IsSigned)) - 1;
+ int MinOff = (IsSigned ? (-MaxOff - 1) : 0);
+ if (Offset >= MinOff && Offset <= MaxOff) {
+ if (EmittableOffset)
+ *EmittableOffset = Offset;
+ Offset = 0;
+ } else {
+ int NewOff = Offset < 0 ? MinOff : MaxOff;
+ if (EmittableOffset)
+ *EmittableOffset = NewOff;
+ Offset = (Offset - NewOff) * Scale;
+ }
+ if (OutUseUnscaledOp)
+ *OutUseUnscaledOp = useUnscaledOp;
+ if (OutUnscaledOp)
+ *OutUnscaledOp = UnscaledOp;
+ return ARM64FrameOffsetCanUpdate |
+ (Offset == 0 ? ARM64FrameOffsetIsLegal : 0);
+}
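+// For example, for "LDRXui %x0, <fi>, 2" with an incoming Offset of 4 the
+// scaled immediate is folded in first (Offset = 4 + 2 * 8 = 20); 20 is not a
+// multiple of the scale, so the routine above selects the unscaled LDURXi
+// form, whose signed 9-bit range covers 20, and it returns
+// ARM64FrameOffsetCanUpdate | ARM64FrameOffsetIsLegal with an emittable offset
+// of 20 and nothing left over.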
+
+bool llvm::rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARM64InstrInfo *TII) {
+ unsigned Opcode = MI.getOpcode();
+ unsigned ImmIdx = FrameRegIdx + 1;
+
+ if (Opcode == ARM64::ADDSXri || Opcode == ARM64::ADDXri) {
+ Offset += MI.getOperand(ImmIdx).getImm();
+ emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
+ MI.getOperand(0).getReg(), FrameReg, Offset, TII,
+ MachineInstr::NoFlags, (Opcode == ARM64::ADDSXri));
+ MI.eraseFromParent();
+ Offset = 0;
+ return true;
+ }
+
+ int NewOffset;
+ unsigned UnscaledOp;
+ bool UseUnscaledOp;
+ int Status = isARM64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, &UnscaledOp,
+ &NewOffset);
+ if (Status & ARM64FrameOffsetCanUpdate) {
+ if (Status & ARM64FrameOffsetIsLegal)
+ // Replace the FrameIndex with FrameReg.
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ if (UseUnscaledOp)
+ MI.setDesc(TII->get(UnscaledOp));
+
+ MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
+ return Offset == 0;
+ }
+
+ return false;
+}
+
+void ARM64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+ NopInst.setOpcode(ARM64::HINT);
+ NopInst.addOperand(MCOperand::CreateImm(0));
+}
diff --git a/lib/Target/ARM64/ARM64InstrInfo.h b/lib/Target/ARM64/ARM64InstrInfo.h
new file mode 100644
index 0000000..2591ca0
--- /dev/null
+++ b/lib/Target/ARM64/ARM64InstrInfo.h
@@ -0,0 +1,219 @@
+//===- ARM64InstrInfo.h - ARM64 Instruction Information ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM64INSTRINFO_H
+#define LLVM_TARGET_ARM64INSTRINFO_H
+
+#include "ARM64.h"
+#include "ARM64RegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "ARM64GenInstrInfo.inc"
+
+namespace llvm {
+
+class ARM64Subtarget;
+class ARM64TargetMachine;
+
+class ARM64InstrInfo : public ARM64GenInstrInfo {
+ // Reserve bits in the MachineMemOperand target hint flags, starting at 1.
+ // They will be shifted into MOTargetStartBit when accessed.
+ enum TargetMemOperandFlags {
+ MOSuppressPair = 1
+ };
+
+ const ARM64RegisterInfo RI;
+ const ARM64Subtarget &Subtarget;
+
+public:
+ explicit ARM64InstrInfo(const ARM64Subtarget &STI);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ const ARM64RegisterInfo &getRegisterInfo() const { return RI; }
+
+ unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+ bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &DstReg, unsigned &SubIdx) const override;
+
+ unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
+
+ /// \brief Does this instruction set its full destination register to zero?
+ bool isGPRZero(const MachineInstr *MI) const;
+
+ /// \brief Does this instruction rename a GPR without modifying bits?
+ bool isGPRCopy(const MachineInstr *MI) const;
+
+ /// \brief Does this instruction rename an FPR without modifying bits?
+ bool isFPRCopy(const MachineInstr *MI) const;
+
+ /// Return true if this load/store scales or extends its register offset.
+ /// This refers to scaling a dynamic index as opposed to scaled immediates.
+ /// MI should be a memory op that allows scaled addressing.
+ bool isScaledAddr(const MachineInstr *MI) const;
+
+ /// Return true if pairing the given load or store is hinted to be
+ /// unprofitable.
+ bool isLdStPairSuppressed(const MachineInstr *MI) const;
+
+ /// Hint that pairing the given load or store is unprofitable.
+ void suppressLdStPair(MachineInstr *MI) const;
+
+ bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
+ unsigned &Offset,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool enableClusterLoads() const override { return true; }
+
+ bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
+ unsigned NumLoads) const override;
+
+ bool shouldScheduleAdjacent(MachineInstr *First,
+ MachineInstr *Second) const override;
+
+ MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+ uint64_t Offset, const MDNode *MDPtr,
+ DebugLoc DL) const;
+ void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc, unsigned Opcode,
+ llvm::ArrayRef<unsigned> Indices) const;
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ MachineInstr *
+ foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const override;
+
+ bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify = false) const override;
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ DebugLoc DL) const override;
+ bool
+ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+ bool canInsertSelect(const MachineBasicBlock &,
+ const SmallVectorImpl<MachineOperand> &Cond, unsigned,
+ unsigned, int &, int &, int &) const override;
+ void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ DebugLoc DL, unsigned DstReg,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg) const override;
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+ /// analyzeCompare - For a comparison instruction, return the source registers
+ /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
+ /// Return true if the comparison instruction can be analyzed.
+ bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const override;
+ /// optimizeCompareInstr - Convert the instruction supplying the argument to
+ /// the comparison into one that sets the zero bit in the flags register.
+ bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const override;
+
+private:
+ void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
+ MachineBasicBlock *TBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+};
+
+/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
+/// plus Offset. This is intended to be used from within the prolog/epilog
+/// insertion (PEI) pass, where a virtual scratch register may be allocated
+/// if necessary, to be replaced by the scavenger at the end of PEI.
+void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset,
+ const ARM64InstrInfo *TII,
+ MachineInstr::MIFlag = MachineInstr::NoFlags,
+ bool SetCPSR = false);
+
+/// rewriteARM64FrameIndex - Rewrite MI to access 'Offset' bytes from the
+/// FP. Return false if the offset could not be handled directly in MI, and
+/// return the left-over portion by reference.
+bool rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARM64InstrInfo *TII);
+
+/// \brief Use to report the frame offset status in isARM64FrameOffsetLegal.
+enum ARM64FrameOffsetStatus {
+ ARM64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply.
+ ARM64FrameOffsetIsLegal = 0x1, ///< Offset is legal.
+ ARM64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly.
+};
+
+/// \brief Check if the @p Offset is a valid frame offset for @p MI.
+/// The returned value reports the validity of the frame offset for @p MI.
+/// It uses the values defined by ARM64FrameOffsetStatus for that.
+/// If result == ARM64FrameOffsetCannotUpdate, @p MI cannot be updated to
+/// use an offset.
+/// If result & ARM64FrameOffsetIsLegal, @p Offset can completely be
+/// rewritten in @p MI.
+/// If result & ARM64FrameOffsetCanUpdate, @p Offset contains the
+/// amount that lies outside the range of a legal offset.
+/// If set, @p OutUseUnscaledOp will contain whether @p MI should be
+/// turned into an unscaled operator, whose opcode is in @p OutUnscaledOp.
+/// If set, @p EmittableOffset contains the amount that can be set in @p MI
+/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that
+/// is a legal offset.
+int isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+ bool *OutUseUnscaledOp = NULL,
+ unsigned *OutUnscaledOp = NULL,
+ int *EmittableOffset = NULL);
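+// rewriteARM64FrameIndex follows the intended usage pattern: it calls
+// isARM64FrameOffsetLegal, folds the emittable offset into the instruction
+// when ARM64FrameOffsetCanUpdate is set, switches to OutUnscaledOp when
+// OutUseUnscaledOp is true, and leaves any remaining Offset for the caller to
+// materialize when ARM64FrameOffsetIsLegal is not set.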
+
+static inline bool isUncondBranchOpcode(int Opc) { return Opc == ARM64::B; }
+
+static inline bool isCondBranchOpcode(int Opc) {
+ switch (Opc) {
+ case ARM64::Bcc:
+ case ARM64::CBZW:
+ case ARM64::CBZX:
+ case ARM64::CBNZW:
+ case ARM64::CBNZX:
+ case ARM64::TBZ:
+ case ARM64::TBNZ:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool isIndirectBranchOpcode(int Opc) { return Opc == ARM64::BR; }
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM64/ARM64InstrInfo.td b/lib/Target/ARM64/ARM64InstrInfo.td
new file mode 100644
index 0000000..2fe1720
--- /dev/null
+++ b/lib/Target/ARM64/ARM64InstrInfo.td
@@ -0,0 +1,4458 @@
+//===- ARM64InstrInfo.td - Describe the ARM64 Instructions -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// ARM64 Instruction definitions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM64-specific DAG Nodes.
+//
+
+// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS
+def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsIn - RES = op LHS, RHS, FLAGS
+def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisInt<0>,
+ SDTCisVT<3, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<4, i32>]>;
+
+def SDT_ARM64Brcond : SDTypeProfile<0, 3,
+ [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>]>;
+def SDT_ARM64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
+def SDT_ARM64tbz : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisVT<1, i64>,
+ SDTCisVT<2, OtherVT>]>;
+
+
+def SDT_ARM64CSel : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisInt<3>,
+ SDTCisVT<4, i32>]>;
+def SDT_ARM64FCmp : SDTypeProfile<0, 2,
+ [SDTCisFP<0>,
+ SDTCisSameAs<0, 1>]>;
+def SDT_ARM64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDT_ARM64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
+def SDT_ARM64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>]>;
+def SDT_ARM64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
+def SDT_ARM64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDT_ARM64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisInt<2>, SDTCisInt<3>]>;
+def SDT_ARM64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDT_ARM64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisInt<3>]>;
+def SDT_ARM64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+
+def SDT_ARM64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDT_ARM64fcmpz : SDTypeProfile<1, 1, []>;
+def SDT_ARM64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>;
+def SDT_ARM64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>;
+def SDT_ARM64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>]>;
+def SDT_ARM64TCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDT_ARM64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;
+
+def SDT_ARM64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
+
+def SDT_ARM64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>]>;
+def SDT_ARM64WrapperLarge : SDTypeProfile<1, 4,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i32>,
+ SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>,
+ SDTCisSameAs<1, 4>]>;
+
+
+// Node definitions.
+def ARM64adrp : SDNode<"ARM64ISD::ADRP", SDTIntUnaryOp, []>;
+def ARM64addlow : SDNode<"ARM64ISD::ADDlow", SDTIntBinOp, []>;
+def ARM64LOADgot : SDNode<"ARM64ISD::LOADgot", SDTIntUnaryOp>;
+def ARM64callseq_start : SDNode<"ISD::CALLSEQ_START",
+ SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
+ [SDNPHasChain, SDNPOutGlue]>;
+def ARM64callseq_end : SDNode<"ISD::CALLSEQ_END",
+ SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def ARM64call : SDNode<"ARM64ISD::CALL",
+ SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def ARM64brcond : SDNode<"ARM64ISD::BRCOND", SDT_ARM64Brcond,
+ [SDNPHasChain]>;
+def ARM64cbz : SDNode<"ARM64ISD::CBZ", SDT_ARM64cbz,
+ [SDNPHasChain]>;
+def ARM64cbnz : SDNode<"ARM64ISD::CBNZ", SDT_ARM64cbz,
+ [SDNPHasChain]>;
+def ARM64tbz : SDNode<"ARM64ISD::TBZ", SDT_ARM64tbz,
+ [SDNPHasChain]>;
+def ARM64tbnz : SDNode<"ARM64ISD::TBNZ", SDT_ARM64tbz,
+ [SDNPHasChain]>;
+
+
+def ARM64csel : SDNode<"ARM64ISD::CSEL", SDT_ARM64CSel>;
+def ARM64csinv : SDNode<"ARM64ISD::CSINV", SDT_ARM64CSel>;
+def ARM64csneg : SDNode<"ARM64ISD::CSNEG", SDT_ARM64CSel>;
+def ARM64csinc : SDNode<"ARM64ISD::CSINC", SDT_ARM64CSel>;
+def ARM64retflag : SDNode<"ARM64ISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def ARM64adc : SDNode<"ARM64ISD::ADC", SDTBinaryArithWithFlagsIn >;
+def ARM64sbc : SDNode<"ARM64ISD::SBC", SDTBinaryArithWithFlagsIn>;
+def ARM64add_flag : SDNode<"ARM64ISD::ADDS", SDTBinaryArithWithFlagsOut,
+ [SDNPCommutative]>;
+def ARM64sub_flag : SDNode<"ARM64ISD::SUBS", SDTBinaryArithWithFlagsOut>;
+def ARM64and_flag : SDNode<"ARM64ISD::ANDS", SDTBinaryArithWithFlagsOut>;
+def ARM64adc_flag : SDNode<"ARM64ISD::ADCS", SDTBinaryArithWithFlagsInOut>;
+def ARM64sbc_flag : SDNode<"ARM64ISD::SBCS", SDTBinaryArithWithFlagsInOut>;
+
+def ARM64threadpointer : SDNode<"ARM64ISD::THREAD_POINTER", SDTPtrLeaf>;
+
+def ARM64fcmp : SDNode<"ARM64ISD::FCMP", SDT_ARM64FCmp>;
+
+def ARM64fmax : SDNode<"ARM64ISD::FMAX", SDTFPBinOp>;
+def ARM64fmin : SDNode<"ARM64ISD::FMIN", SDTFPBinOp>;
+
+def ARM64dup : SDNode<"ARM64ISD::DUP", SDT_ARM64Dup>;
+def ARM64duplane8 : SDNode<"ARM64ISD::DUPLANE8", SDT_ARM64DupLane>;
+def ARM64duplane16 : SDNode<"ARM64ISD::DUPLANE16", SDT_ARM64DupLane>;
+def ARM64duplane32 : SDNode<"ARM64ISD::DUPLANE32", SDT_ARM64DupLane>;
+def ARM64duplane64 : SDNode<"ARM64ISD::DUPLANE64", SDT_ARM64DupLane>;
+
+def ARM64zip1 : SDNode<"ARM64ISD::ZIP1", SDT_ARM64Zip>;
+def ARM64zip2 : SDNode<"ARM64ISD::ZIP2", SDT_ARM64Zip>;
+def ARM64uzp1 : SDNode<"ARM64ISD::UZP1", SDT_ARM64Zip>;
+def ARM64uzp2 : SDNode<"ARM64ISD::UZP2", SDT_ARM64Zip>;
+def ARM64trn1 : SDNode<"ARM64ISD::TRN1", SDT_ARM64Zip>;
+def ARM64trn2 : SDNode<"ARM64ISD::TRN2", SDT_ARM64Zip>;
+
+def ARM64movi_edit : SDNode<"ARM64ISD::MOVIedit", SDT_ARM64MOVIedit>;
+def ARM64movi_shift : SDNode<"ARM64ISD::MOVIshift", SDT_ARM64MOVIshift>;
+def ARM64movi_msl : SDNode<"ARM64ISD::MOVImsl", SDT_ARM64MOVIshift>;
+def ARM64mvni_shift : SDNode<"ARM64ISD::MVNIshift", SDT_ARM64MOVIshift>;
+def ARM64mvni_msl : SDNode<"ARM64ISD::MVNImsl", SDT_ARM64MOVIshift>;
+def ARM64movi : SDNode<"ARM64ISD::MOVI", SDT_ARM64MOVIedit>;
+def ARM64fmov : SDNode<"ARM64ISD::FMOV", SDT_ARM64MOVIedit>;
+
+def ARM64rev16 : SDNode<"ARM64ISD::REV16", SDT_ARM64UnaryVec>;
+def ARM64rev32 : SDNode<"ARM64ISD::REV32", SDT_ARM64UnaryVec>;
+def ARM64rev64 : SDNode<"ARM64ISD::REV64", SDT_ARM64UnaryVec>;
+def ARM64ext : SDNode<"ARM64ISD::EXT", SDT_ARM64ExtVec>;
+
+def ARM64vashr : SDNode<"ARM64ISD::VASHR", SDT_ARM64vshift>;
+def ARM64vlshr : SDNode<"ARM64ISD::VLSHR", SDT_ARM64vshift>;
+def ARM64vshl : SDNode<"ARM64ISD::VSHL", SDT_ARM64vshift>;
+def ARM64sqshli : SDNode<"ARM64ISD::SQSHL_I", SDT_ARM64vshift>;
+def ARM64uqshli : SDNode<"ARM64ISD::UQSHL_I", SDT_ARM64vshift>;
+def ARM64sqshlui : SDNode<"ARM64ISD::SQSHLU_I", SDT_ARM64vshift>;
+def ARM64srshri : SDNode<"ARM64ISD::SRSHR_I", SDT_ARM64vshift>;
+def ARM64urshri : SDNode<"ARM64ISD::URSHR_I", SDT_ARM64vshift>;
+
+def ARM64not: SDNode<"ARM64ISD::NOT", SDT_ARM64unvec>;
+def ARM64bit: SDNode<"ARM64ISD::BIT", SDT_ARM64trivec>;
+
+def ARM64cmeq: SDNode<"ARM64ISD::CMEQ", SDT_ARM64binvec>;
+def ARM64cmge: SDNode<"ARM64ISD::CMGE", SDT_ARM64binvec>;
+def ARM64cmgt: SDNode<"ARM64ISD::CMGT", SDT_ARM64binvec>;
+def ARM64cmhi: SDNode<"ARM64ISD::CMHI", SDT_ARM64binvec>;
+def ARM64cmhs: SDNode<"ARM64ISD::CMHS", SDT_ARM64binvec>;
+
+def ARM64fcmeq: SDNode<"ARM64ISD::FCMEQ", SDT_ARM64fcmp>;
+def ARM64fcmge: SDNode<"ARM64ISD::FCMGE", SDT_ARM64fcmp>;
+def ARM64fcmgt: SDNode<"ARM64ISD::FCMGT", SDT_ARM64fcmp>;
+
+def ARM64cmeqz: SDNode<"ARM64ISD::CMEQz", SDT_ARM64unvec>;
+def ARM64cmgez: SDNode<"ARM64ISD::CMGEz", SDT_ARM64unvec>;
+def ARM64cmgtz: SDNode<"ARM64ISD::CMGTz", SDT_ARM64unvec>;
+def ARM64cmlez: SDNode<"ARM64ISD::CMLEz", SDT_ARM64unvec>;
+def ARM64cmltz: SDNode<"ARM64ISD::CMLTz", SDT_ARM64unvec>;
+def ARM64cmtst : PatFrag<(ops node:$LHS, node:$RHS),
+ (ARM64not (ARM64cmeqz (and node:$LHS, node:$RHS)))>;
+
+def ARM64fcmeqz: SDNode<"ARM64ISD::FCMEQz", SDT_ARM64fcmpz>;
+def ARM64fcmgez: SDNode<"ARM64ISD::FCMGEz", SDT_ARM64fcmpz>;
+def ARM64fcmgtz: SDNode<"ARM64ISD::FCMGTz", SDT_ARM64fcmpz>;
+def ARM64fcmlez: SDNode<"ARM64ISD::FCMLEz", SDT_ARM64fcmpz>;
+def ARM64fcmltz: SDNode<"ARM64ISD::FCMLTz", SDT_ARM64fcmpz>;
+
+def ARM64bici: SDNode<"ARM64ISD::BICi", SDT_ARM64vecimm>;
+def ARM64orri: SDNode<"ARM64ISD::ORRi", SDT_ARM64vecimm>;
+
+def ARM64neg : SDNode<"ARM64ISD::NEG", SDT_ARM64unvec>;
+
+def ARM64tcret: SDNode<"ARM64ISD::TC_RETURN", SDT_ARM64TCRET,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def ARM64Prefetch : SDNode<"ARM64ISD::PREFETCH", SDT_ARM64PREFETCH,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def ARM64sitof: SDNode<"ARM64ISD::SITOF", SDT_ARM64ITOF>;
+def ARM64uitof: SDNode<"ARM64ISD::UITOF", SDT_ARM64ITOF>;
+
+def ARM64tlsdesc_call : SDNode<"ARM64ISD::TLSDESC_CALL", SDT_ARM64TLSDescCall,
+ [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
+ SDNPVariadic]>;
+
+def ARM64WrapperLarge : SDNode<"ARM64ISD::WrapperLarge", SDT_ARM64WrapperLarge>;
+
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+// ARM64 Instruction Predicate Definitions.
+//
+def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
+def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">;
+def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
+def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
+def ForCodeSize : Predicate<"ForCodeSize">;
+def NotForCodeSize : Predicate<"!ForCodeSize">;
+
+include "ARM64InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous instructions.
+//===----------------------------------------------------------------------===//
+
+let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+ [(ARM64callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(ARM64callseq_end timm:$amt1, timm:$amt2)]>;
+} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
+
+let isReMaterializable = 1, isCodeGenOnly = 1 in {
+// FIXME: The following pseudo instructions are only needed because remat
+// cannot handle multiple instructions. When that changes, they can be
+// removed, along with the ARM64Wrapper node.
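+// As an illustration of what these pseudos stand for (symbol names here are
+// purely illustrative): LOADgot is expected to end up as an
+// "adrp x0, :got:sym" / "ldr x0, [x0, :got_lo12:sym]" pair, and the MOVaddr*
+// pseudos below as an "adrp x0, sym" / "add x0, x0, :lo12:sym" pair.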
+
+let AddedComplexity = 10 in
+def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr),
+ [(set GPR64:$dst, (ARM64LOADgot tglobaladdr:$addr))]>,
+ Sched<[WriteLDAdr]>;
+
+// The MOVaddr instruction should match only when the add is not folded
+// into a load or store address.
+def MOVaddr
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (ARM64addlow (ARM64adrp tglobaladdr:$hi),
+ tglobaladdr:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrJT
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (ARM64addlow (ARM64adrp tjumptable:$hi),
+ tjumptable:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrCP
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (ARM64addlow (ARM64adrp tconstpool:$hi),
+ tconstpool:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrBA
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (ARM64addlow (ARM64adrp tblockaddress:$hi),
+ tblockaddress:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrTLS
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (ARM64addlow (ARM64adrp tglobaltlsaddr:$hi),
+ tglobaltlsaddr:$low))]>,
+ Sched<[WriteAdrAdr]>;
+def MOVaddrEXT
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64:$dst, (ARM64addlow (ARM64adrp texternalsym:$hi),
+ texternalsym:$low))]>,
+ Sched<[WriteAdrAdr]>;
+
+} // isReMaterializable, isCodeGenOnly
+
+def : Pat<(ARM64LOADgot tglobaltlsaddr:$addr),
+ (LOADgot tglobaltlsaddr:$addr)>;
+
+def : Pat<(ARM64LOADgot texternalsym:$addr),
+ (LOADgot texternalsym:$addr)>;
+
+def : Pat<(ARM64LOADgot tconstpool:$addr),
+ (LOADgot tconstpool:$addr)>;
+
+//===----------------------------------------------------------------------===//
+// System instructions.
+//===----------------------------------------------------------------------===//
+
+def HINT : HintI<"hint">;
+def : InstAlias<"nop", (HINT 0b000)>;
+def : InstAlias<"yield",(HINT 0b001)>;
+def : InstAlias<"wfe", (HINT 0b010)>;
+def : InstAlias<"wfi", (HINT 0b011)>;
+def : InstAlias<"sev", (HINT 0b100)>;
+def : InstAlias<"sevl", (HINT 0b101)>;
+
+// As far as LLVM is concerned, this writes to the system's exclusive monitors.
+let mayLoad = 1, mayStore = 1 in
+def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;
+
+def DMB : CRmSystemI<barrier_op, 0b101, "dmb">;
+def DSB : CRmSystemI<barrier_op, 0b100, "dsb">;
+def ISB : CRmSystemI<barrier_op, 0b110, "isb">;
+def : InstAlias<"clrex", (CLREX 0xf)>;
+def : InstAlias<"isb", (ISB 0xf)>;
+
+def MRS : MRSI;
+def MSR : MSRI;
+def MSRcpsr: MSRcpsrI;
+
+// The thread pointer (on Linux, at least, where this has been implemented) is
+// TPIDR_EL0.
+def : Pat<(ARM64threadpointer), (MRS 0xde82)>;
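+// (0xde82 is the system register operand encoding of TPIDR_EL0:
+// op0=3, op1=3, CRn=13, CRm=0, op2=2.)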
+
+// Generic system instructions
+def SYS : SystemI<0, "sys">;
+def SYSxt : SystemXtI<0, "sys">;
+def SYSLxt : SystemLXtI<1, "sysl">;
+
+//===----------------------------------------------------------------------===//
+// Move immediate instructions.
+//===----------------------------------------------------------------------===//
+
+defm MOVK : InsertImmediate<0b11, "movk">;
+defm MOVN : MoveImmediate<0b00, "movn">;
+
+let PostEncoderMethod = "fixMOVZ" in
+defm MOVZ : MoveImmediate<0b10, "movz">;
+
+def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;
+
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g2:$sym, 32)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>;
+
+let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1,
+ isAsCheapAsAMove = 1 in {
+// FIXME: The following pseudo instructions are only needed because remat
+// cannot handle multiple instructions. When that changes, we can select
+// directly to the real instructions and get rid of these pseudos.
+
+def MOVi32imm
+ : Pseudo<(outs GPR32:$dst), (ins i32imm:$src),
+ [(set GPR32:$dst, imm:$src)]>,
+ Sched<[WriteImm]>;
+def MOVi64imm
+ : Pseudo<(outs GPR64:$dst), (ins i64imm:$src),
+ [(set GPR64:$dst, imm:$src)]>,
+ Sched<[WriteImm]>;
+} // isReMaterializable, isCodeGenOnly
+
+def : Pat<(ARM64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
+ tglobaladdr:$g1, tglobaladdr:$g0),
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48),
+ tglobaladdr:$g2, 32),
+ tglobaladdr:$g1, 16),
+ tglobaladdr:$g0, 0)>;
+
+def : Pat<(ARM64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
+ tblockaddress:$g1, tblockaddress:$g0),
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48),
+ tblockaddress:$g2, 32),
+ tblockaddress:$g1, 16),
+ tblockaddress:$g0, 0)>;
+
+def : Pat<(ARM64WrapperLarge tconstpool:$g3, tconstpool:$g2,
+ tconstpool:$g1, tconstpool:$g0),
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48),
+ tconstpool:$g2, 32),
+ tconstpool:$g1, 16),
+ tconstpool:$g0, 0)>;
+
+
+//===----------------------------------------------------------------------===//
+// Arithmetic instructions.
+//===----------------------------------------------------------------------===//
+
+// Add/subtract with carry.
+defm ADC : AddSubCarry<0, "adc", "adcs", ARM64adc, ARM64adc_flag>;
+defm SBC : AddSubCarry<1, "sbc", "sbcs", ARM64sbc, ARM64sbc_flag>;
+
+def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>;
+def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>;
+def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>;
+def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>;
+
+// Add/subtract
+defm ADD : AddSub<0, "add", add>;
+defm SUB : AddSub<1, "sub">;
+
+defm ADDS : AddSubS<0, "adds", ARM64add_flag>;
+defm SUBS : AddSubS<1, "subs", ARM64sub_flag>;
+
+// Use SUBS instead of SUB to enable CSE between SUBS and SUB.
+def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm),
+ (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>;
+def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm),
+ (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>;
+def : Pat<(sub GPR32:$Rn, GPR32:$Rm),
+ (SUBSWrr GPR32:$Rn, GPR32:$Rm)>;
+def : Pat<(sub GPR64:$Rn, GPR64:$Rm),
+ (SUBSXrr GPR64:$Rn, GPR64:$Rm)>;
+def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
+ (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>;
+def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
+ (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
+def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
+ (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
+def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
+ (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
+
+// Because of the immediate format for add/sub-imm instructions, the
+// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
+// These patterns capture that transformation.
+let AddedComplexity = 1 in {
+def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+ (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+ (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+ (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+ (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+}
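+// For example, (add w1, #-1) has no valid ADD-immediate encoding, so the
+// patterns above select it as SUBSWri, i.e. "subs w0, w1, #1".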
+
+def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0)>;
+def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0)>;
+def : InstAlias<"neg $dst, $src, $shift",
+ (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift:$shift)>;
+def : InstAlias<"neg $dst, $src, $shift",
+ (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift:$shift)>;
+
+// Because of the immediate format for add/sub-imm instructions, the
+// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
+// These patterns capture that transformation.
+let AddedComplexity = 1 in {
+def : Pat<(ARM64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+ (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(ARM64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+ (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+def : Pat<(ARM64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+ (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(ARM64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+ (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+}
+
+def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0)>;
+def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0)>;
+def : InstAlias<"negs $dst, $src, $shift",
+ (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift:$shift)>;
+def : InstAlias<"negs $dst, $src, $shift",
+ (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift:$shift)>;
+
+// Unsigned/Signed divide
+defm UDIV : Div<0, "udiv", udiv>;
+defm SDIV : Div<1, "sdiv", sdiv>;
+let isCodeGenOnly = 1 in {
+defm UDIV_Int : Div<0, "udiv", int_arm64_udiv>;
+defm SDIV_Int : Div<1, "sdiv", int_arm64_sdiv>;
+}
+
+// Variable shift
+defm ASRV : Shift<0b10, "asrv", sra>;
+defm LSLV : Shift<0b00, "lslv", shl>;
+defm LSRV : Shift<0b01, "lsrv", srl>;
+defm RORV : Shift<0b11, "rorv", rotr>;
+
+def : ShiftAlias<"asr", ASRVWr, GPR32>;
+def : ShiftAlias<"asr", ASRVXr, GPR64>;
+def : ShiftAlias<"lsl", LSLVWr, GPR32>;
+def : ShiftAlias<"lsl", LSLVXr, GPR64>;
+def : ShiftAlias<"lsr", LSRVWr, GPR32>;
+def : ShiftAlias<"lsr", LSRVXr, GPR64>;
+def : ShiftAlias<"ror", RORVWr, GPR32>;
+def : ShiftAlias<"ror", RORVXr, GPR64>;
+
+// Multiply-add
+let AddedComplexity = 7 in {
+defm MADD : MulAccum<0, "madd", add>;
+defm MSUB : MulAccum<1, "msub", sub>;
+
+def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)),
+ (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)),
+ (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+
+def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))),
+ (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
+ (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+} // AddedComplexity = 7
+
+let AddedComplexity = 5 in {
+def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
+def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>;
+def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>;
+def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>;
+
+def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))),
+ (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))),
+ (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+
+def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
+ (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
+ (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+} // AddedComplexity = 5
+
+def : MulAccumWAlias<"mul", MADDWrrr>;
+def : MulAccumXAlias<"mul", MADDXrrr>;
+def : MulAccumWAlias<"mneg", MSUBWrrr>;
+def : MulAccumXAlias<"mneg", MSUBXrrr>;
+def : WideMulAccumAlias<"smull", SMADDLrrr>;
+def : WideMulAccumAlias<"smnegl", SMSUBLrrr>;
+def : WideMulAccumAlias<"umull", UMADDLrrr>;
+def : WideMulAccumAlias<"umnegl", UMSUBLrrr>;
+
+// Multiply-high
+def SMULHrr : MulHi<0b010, "smulh", mulhs>;
+def UMULHrr : MulHi<0b110, "umulh", mulhu>;
+
+// CRC32
+def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_arm64_crc32b, "crc32b">;
+def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_arm64_crc32h, "crc32h">;
+def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_arm64_crc32w, "crc32w">;
+def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_arm64_crc32x, "crc32x">;
+
+def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_arm64_crc32cb, "crc32cb">;
+def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_arm64_crc32ch, "crc32ch">;
+def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_arm64_crc32cw, "crc32cw">;
+def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_arm64_crc32cx, "crc32cx">;
+
+
+//===----------------------------------------------------------------------===//
+// Logical instructions.
+//===----------------------------------------------------------------------===//
+
+// (immediate)
+defm ANDS : LogicalImmS<0b11, "ands", ARM64and_flag>;
+defm AND : LogicalImm<0b00, "and", and>;
+defm EOR : LogicalImm<0b10, "eor", xor>;
+defm ORR : LogicalImm<0b01, "orr", or>;
+
+def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR,
+ logical_imm32:$imm)>;
+def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR,
+ logical_imm64:$imm)>;
+
+
+// (register)
+defm ANDS : LogicalRegS<0b11, 0, "ands">;
+defm BICS : LogicalRegS<0b11, 1, "bics">;
+defm AND : LogicalReg<0b00, 0, "and", and>;
+defm BIC : LogicalReg<0b00, 1, "bic",
+ BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+defm EON : LogicalReg<0b10, 1, "eon",
+ BinOpFrag<(xor node:$LHS, (not node:$RHS))>>;
+defm EOR : LogicalReg<0b10, 0, "eor", xor>;
+defm ORN : LogicalReg<0b01, 1, "orn",
+ BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
+defm ORR : LogicalReg<0b01, 0, "orr", or>;
+
+def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0)>;
+def : InstAlias<"mov $dst, $src",
+ (ADDWri GPR32sp:$dst, GPR32sp:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0)>;
+def : InstAlias<"mov $dst, $src",
+ (ADDXri GPR64sp:$dst, GPR64sp:$src, 0, 0)>;
+
+def : InstAlias<"tst $src1, $src2",
+ (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2)>;
+def : InstAlias<"tst $src1, $src2",
+ (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2)>;
+
+def : InstAlias<"tst $src1, $src2",
+ (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0)>;
+def : InstAlias<"tst $src1, $src2",
+ (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0)>;
+
+def : InstAlias<"tst $src1, $src2, $sh",
+ (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift:$sh)>;
+def : InstAlias<"tst $src1, $src2, $sh",
+ (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift:$sh)>;
+
+def : InstAlias<"mvn $Wd, $Wm",
+ (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0)>;
+def : InstAlias<"mvn $Xd, $Xm",
+ (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0)>;
+
+def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
+def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;
+
+
+//===----------------------------------------------------------------------===//
+// One operand data processing instructions.
+//===----------------------------------------------------------------------===//
+
+defm CLS : OneOperandData<0b101, "cls">;
+defm CLZ : OneOperandData<0b100, "clz", ctlz>;
+defm RBIT : OneOperandData<0b000, "rbit">;
+def REV16Wr : OneWRegData<0b001, "rev16",
+ UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
+def REV16Xr : OneXRegData<0b001, "rev16",
+ UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
+
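+// Count-trailing-zeros is a bit-reverse followed by a count-leading-zeros,
+// e.g. "rbit w8, w0; clz w0, w8" (register choice illustrative).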
+def : Pat<(cttz GPR32:$Rn),
+ (CLZWr (RBITWr GPR32:$Rn))>;
+def : Pat<(cttz GPR64:$Rn),
+ (CLZXr (RBITXr GPR64:$Rn))>;
+
+// Unlike the other one-operand instructions, the instructions with the "rev"
+// mnemonic do *not* just differ in the size bit; they actually use different
+// opcode bits for the different sizes.
+def REVWr : OneWRegData<0b010, "rev", bswap>;
+def REVXr : OneXRegData<0b011, "rev", bswap>;
+def REV32Xr : OneXRegData<0b010, "rev32",
+ UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;
+
+//===----------------------------------------------------------------------===//
+// Bitfield immediate extraction instruction.
+//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1 in
+defm EXTR : ExtractImm<"extr">;
+def : InstAlias<"ror $dst, $src, $shift",
+ (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
+def : InstAlias<"ror $dst, $src, $shift",
+ (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>;
+
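+// A rotate-right by immediate is EXTR with both source registers the same,
+// e.g. "ror w0, w1, #4" and "extr w0, w1, w1, #4" are the same encoding.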
+def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)),
+ (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>;
+def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
+ (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Other bitfield immediate instructions.
+//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1 in {
+defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">;
+defm SBFM : BitfieldImm<0b00, "sbfm">;
+defm UBFM : BitfieldImm<0b10, "ubfm">;
+}
+
+def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = (32 - N->getZExtValue()) & 0x1f;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 31 - N->getZExtValue();
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+// min(7, 31 - shift_amt)
+def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 31 - N->getZExtValue();
+ enc = enc > 7 ? 7 : enc;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+// min(15, 31 - shift_amt)
+def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 31 - N->getZExtValue();
+ enc = enc > 15 ? 15 : enc;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = (64 - N->getZExtValue()) & 0x3f;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 63 - N->getZExtValue();
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+// min(7, 63 - shift_amt)
+def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 63 - N->getZExtValue();
+ enc = enc > 7 ? 7 : enc;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+// min(15, 63 - shift_amt)
+def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 63 - N->getZExtValue();
+ enc = enc > 15 ? 15 : enc;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
+// min(31, 63 - shift_amt)
+def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{
+ uint64_t enc = 63 - N->getZExtValue();
+ enc = enc > 31 ? 31 : enc;
+ return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
+
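+// Worked example for the transforms above: (shl w1, 3) is selected by the
+// patterns below as UBFMWri w0, w1, 29, 28 (immr = (32-3) & 0x1f,
+// imms = 31-3), which is the encoding of "lsl w0, w1, #3".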
+def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)),
+ (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+ (i64 (i32shift_b imm0_31:$imm)))>;
+def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)),
+ (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+ (i64 (i64shift_b imm0_63:$imm)))>;
+
+let AddedComplexity = 10 in {
+def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)),
+ (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)),
+ (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+}
+
+def : InstAlias<"asr $dst, $src, $shift",
+ (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"asr $dst, $src, $shift",
+ (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
+
+def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)),
+ (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)),
+ (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+
+def : InstAlias<"lsr $dst, $src, $shift",
+ (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"lsr $dst, $src, $shift",
+ (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
+
+//===----------------------------------------------------------------------===//
+// Conditionally set flags instructions.
+//===----------------------------------------------------------------------===//
+defm CCMN : CondSetFlagsImm<0, "ccmn">;
+defm CCMP : CondSetFlagsImm<1, "ccmp">;
+
+defm CCMN : CondSetFlagsReg<0, "ccmn">;
+defm CCMP : CondSetFlagsReg<1, "ccmp">;
+
+//===----------------------------------------------------------------------===//
+// Conditional select instructions.
+//===----------------------------------------------------------------------===//
+defm CSEL : CondSelect<0, 0b00, "csel">;
+
+def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
+defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
+defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
+defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;
+
+def : Pat<(ARM64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), CPSR),
+ (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(ARM64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), CPSR),
+ (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(ARM64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), CPSR),
+ (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(ARM64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), CPSR),
+ (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(ARM64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), CPSR),
+ (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(ARM64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), CPSR),
+ (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+
+def : Pat<(ARM64csel (i32 0), (i32 1), (i32 imm:$cc), CPSR),
+ (CSINCWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(ARM64csel (i64 0), (i64 1), (i32 imm:$cc), CPSR),
+ (CSINCXr XZR, XZR, (i32 imm:$cc))>;
+def : Pat<(ARM64csel (i32 0), (i32 -1), (i32 imm:$cc), CPSR),
+ (CSINVWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(ARM64csel (i64 0), (i64 -1), (i32 imm:$cc), CPSR),
+ (CSINVXr XZR, XZR, (i32 imm:$cc))>;
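+// The four patterns above materialize a 0/1 or 0/-1 result directly from the
+// flags: CSINC Rd, wzr, wzr, cc yields 0 or 1 and CSINV Rd, wzr, wzr, cc
+// yields 0 or -1, so no constant needs to be loaded first.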
+
+// The inverse of the condition code from the alias instruction is what is used
+// in the aliased instruction. The parser already inverts the condition code
+// for these aliases.
+// FIXME: Is this the correct way to handle these aliases?
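+// For example, "cset w0, eq" is accepted here and encoded as
+// "csinc w0, wzr, wzr, ne".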
+def : InstAlias<"cset $dst, $cc", (CSINCWr GPR32:$dst, WZR, WZR, ccode:$cc)>;
+def : InstAlias<"cset $dst, $cc", (CSINCXr GPR64:$dst, XZR, XZR, ccode:$cc)>;
+
+def : InstAlias<"csetm $dst, $cc", (CSINVWr GPR32:$dst, WZR, WZR, ccode:$cc)>;
+def : InstAlias<"csetm $dst, $cc", (CSINVXr GPR64:$dst, XZR, XZR, ccode:$cc)>;
+
+def : InstAlias<"cinc $dst, $src, $cc",
+ (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, ccode:$cc)>;
+def : InstAlias<"cinc $dst, $src, $cc",
+ (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, ccode:$cc)>;
+
+def : InstAlias<"cinv $dst, $src, $cc",
+ (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, ccode:$cc)>;
+def : InstAlias<"cinv $dst, $src, $cc",
+ (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, ccode:$cc)>;
+
+def : InstAlias<"cneg $dst, $src, $cc",
+ (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, ccode:$cc)>;
+def : InstAlias<"cneg $dst, $src, $cc",
+ (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, ccode:$cc)>;
+
+//===----------------------------------------------------------------------===//
+// PC-relative instructions.
+//===----------------------------------------------------------------------===//
+let isReMaterializable = 1 in {
+let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in {
+def ADR : ADRI<0, "adr", adrlabel, []>;
+} // neverHasSideEffects = 1
+
+def ADRP : ADRI<1, "adrp", adrplabel,
+ [(set GPR64:$Xd, (ARM64adrp tglobaladdr:$label))]>;
+} // isReMaterializable = 1
+
+// page address of a constant pool entry, block address
+def : Pat<(ARM64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
+def : Pat<(ARM64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (register) instructions.
+//===----------------------------------------------------------------------===//
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+def RET : BranchReg<0b0010, "ret", []>;
+def DRPS : SpecialReturn<0b0101, "drps">;
+def ERET : SpecialReturn<0b0100, "eret">;
+} // isReturn = 1, isTerminator = 1, isBarrier = 1
+
+// Default to the LR register.
+def : InstAlias<"ret", (RET LR)>;
+
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BLR : BranchReg<0b0001, "blr", [(ARM64call GPR64:$Rn)]>;
+} // isCall
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
+} // isBranch, isTerminator, isBarrier, isIndirectBranch
+
+// Create a separate pseudo-instruction for codegen to use so that we don't
+// flag lr as used in every function. It'll be restored before the RET by the
+// epilogue if it's legitimately used.
+def RET_ReallyLR : Pseudo<(outs), (ins), [(ARM64retflag)]> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+}
+
+// This is a directive-like pseudo-instruction. The purpose is to insert an
+// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
+// (which in the usual case is a BLR).
+let hasSideEffects = 1 in
+def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> {
+ let AsmString = ".tlsdesccall $sym";
+}
+
+// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It
+// gets expanded to two MCInsts during lowering.
+let isCall = 1, Defs = [LR] in
+def TLSDESC_BLR
+ : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym),
+ [(ARM64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>;
+
+def : Pat<(ARM64tlsdesc_call GPR64:$dest, texternalsym:$sym),
+ (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>;
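+// For reference, the TLS descriptor access this is part of typically looks
+// something like the following sequence (symbol name illustrative):
+//   adrp  x0, :tlsdesc:var
+//   ldr   x1, [x0, :tlsdesc_lo12:var]
+//   add   x0, x0, :tlsdesc_lo12:var
+//   .tlsdesccall var
+//   blr   x1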
+//===----------------------------------------------------------------------===//
+// Conditional branch (immediate) instruction.
+//===----------------------------------------------------------------------===//
+def Bcc : BranchCond;
+
+//===----------------------------------------------------------------------===//
+// Compare-and-branch instructions.
+//===----------------------------------------------------------------------===//
+defm CBZ : CmpBranch<0, "cbz", ARM64cbz>;
+defm CBNZ : CmpBranch<1, "cbnz", ARM64cbnz>;
+
+//===----------------------------------------------------------------------===//
+// Test-bit-and-branch instructions.
+//===----------------------------------------------------------------------===//
+def TBZ : TestBranch<0, "tbz", ARM64tbz>;
+def TBNZ : TestBranch<1, "tbnz", ARM64tbnz>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (immediate) instructions.
+//===----------------------------------------------------------------------===//
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+def B : BranchImm<0, "b", [(br bb:$addr)]>;
+} // isBranch, isTerminator, isBarrier
+
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BL : CallImm<1, "bl", [(ARM64call tglobaladdr:$addr)]>;
+} // isCall
+def : Pat<(ARM64call texternalsym:$func), (BL texternalsym:$func)>;
+
+//===----------------------------------------------------------------------===//
+// Exception generation instructions.
+//===----------------------------------------------------------------------===//
+def BRK : ExceptionGeneration<0b001, 0b00, "brk">;
+def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
+def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
+def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
+def HLT : ExceptionGeneration<0b010, 0b00, "hlt">;
+def HVC : ExceptionGeneration<0b000, 0b10, "hvc">;
+def SMC : ExceptionGeneration<0b000, 0b11, "smc">;
+def SVC : ExceptionGeneration<0b000, 0b01, "svc">;
+
+// DCPSn defaults to an immediate operand of zero if unspecified.
+def : InstAlias<"dcps1", (DCPS1 0)>;
+def : InstAlias<"dcps2", (DCPS2 0)>;
+def : InstAlias<"dcps3", (DCPS3 0)>;
+
+//===----------------------------------------------------------------------===//
+// Load instructions.
+//===----------------------------------------------------------------------===//
+
+// Pair (indexed, offset)
+def LDPWi : LoadPairOffset<0b00, 0, GPR32, am_indexed32simm7, "ldp">;
+def LDPXi : LoadPairOffset<0b10, 0, GPR64, am_indexed64simm7, "ldp">;
+def LDPSi : LoadPairOffset<0b00, 1, FPR32, am_indexed32simm7, "ldp">;
+def LDPDi : LoadPairOffset<0b01, 1, FPR64, am_indexed64simm7, "ldp">;
+def LDPQi : LoadPairOffset<0b10, 1, FPR128, am_indexed128simm7, "ldp">;
+
+def LDPSWi : LoadPairOffset<0b01, 0, GPR64, am_indexed32simm7, "ldpsw">;
+
+// Pair (pre-indexed)
+def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, am_indexed32simm7, "ldp">;
+def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, am_indexed64simm7, "ldp">;
+def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, am_indexed32simm7, "ldp">;
+def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, am_indexed64simm7, "ldp">;
+def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, am_indexed128simm7, "ldp">;
+
+def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, am_indexed32simm7, "ldpsw">;
+
+// Pair (post-indexed)
+def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+
+// Pair (no allocate)
+def LDNPWi : LoadPairNoAlloc<0b00, 0, GPR32, am_indexed32simm7, "ldnp">;
+def LDNPXi : LoadPairNoAlloc<0b10, 0, GPR64, am_indexed64simm7, "ldnp">;
+def LDNPSi : LoadPairNoAlloc<0b00, 1, FPR32, am_indexed32simm7, "ldnp">;
+def LDNPDi : LoadPairNoAlloc<0b01, 1, FPR64, am_indexed64simm7, "ldnp">;
+def LDNPQi : LoadPairNoAlloc<0b10, 1, FPR128, am_indexed128simm7, "ldnp">;
+
+//---
+// (register offset)
+//---
+
+let AddedComplexity = 10 in {
+// Integer
+def LDRBBro : Load8RO<0b00, 0, 0b01, GPR32, "ldrb",
+ [(set GPR32:$Rt, (zextloadi8 ro_indexed8:$addr))]>;
+def LDRHHro : Load16RO<0b01, 0, 0b01, GPR32, "ldrh",
+ [(set GPR32:$Rt, (zextloadi16 ro_indexed16:$addr))]>;
+def LDRWro : Load32RO<0b10, 0, 0b01, GPR32, "ldr",
+ [(set GPR32:$Rt, (load ro_indexed32:$addr))]>;
+def LDRXro : Load64RO<0b11, 0, 0b01, GPR64, "ldr",
+ [(set GPR64:$Rt, (load ro_indexed64:$addr))]>;
+
+// Floating-point
+def LDRBro : Load8RO<0b00, 1, 0b01, FPR8, "ldr",
+ [(set FPR8:$Rt, (load ro_indexed8:$addr))]>;
+def LDRHro : Load16RO<0b01, 1, 0b01, FPR16, "ldr",
+ [(set FPR16:$Rt, (load ro_indexed16:$addr))]>;
+def LDRSro : Load32RO<0b10, 1, 0b01, FPR32, "ldr",
+ [(set (f32 FPR32:$Rt), (load ro_indexed32:$addr))]>;
+def LDRDro : Load64RO<0b11, 1, 0b01, FPR64, "ldr",
+ [(set (f64 FPR64:$Rt), (load ro_indexed64:$addr))]>;
+def LDRQro : Load128RO<0b00, 1, 0b11, FPR128, "ldr", []> {
+ let mayLoad = 1;
+}
+
+// For regular loads, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+def : Pat <(v8i8 (scalar_to_vector (i32 (extloadi8 ro_indexed8:$addr)))),
+ (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
+ (LDRBro ro_indexed8:$addr), bsub)>;
+def : Pat <(v16i8 (scalar_to_vector (i32 (extloadi8 ro_indexed8:$addr)))),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (LDRBro ro_indexed8:$addr), bsub)>;
+def : Pat <(v4i16 (scalar_to_vector (i32 (extloadi16 ro_indexed16:$addr)))),
+ (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+ (LDRHro ro_indexed16:$addr), hsub)>;
+def : Pat <(v8i16 (scalar_to_vector (i32 (extloadi16 ro_indexed16:$addr)))),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+ (LDRHro ro_indexed16:$addr), hsub)>;
+def : Pat <(v2i32 (scalar_to_vector (i32 (load ro_indexed32:$addr)))),
+ (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+ (LDRSro ro_indexed32:$addr), ssub)>;
+def : Pat <(v4i32 (scalar_to_vector (i32 (load ro_indexed32:$addr)))),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ (LDRSro ro_indexed32:$addr), ssub)>;
+def : Pat <(v1i64 (scalar_to_vector (i64 (load ro_indexed64:$addr)))),
+ (LDRDro ro_indexed64:$addr)>;
+def : Pat <(v2i64 (scalar_to_vector (i64 (load ro_indexed64:$addr)))),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+ (LDRDro ro_indexed64:$addr), dsub)>;
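+// For instance, the (v2i32 (scalar_to_vector (load ...))) pattern above turns
+// into a single S-register "ldr": the loaded word lands in lane 0 and the
+// other lane is left undefined (IMPLICIT_DEF).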
+
+// Match all 64-bit-wide loads whose type is compatible with FPR64
+def : Pat<(v2f32 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>;
+def : Pat<(v1f64 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>;
+def : Pat<(v8i8 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>;
+def : Pat<(v4i16 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>;
+def : Pat<(v2i32 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>;
+def : Pat<(v1i64 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>;
+
+// Match all 128-bit-wide loads whose type is compatible with FPR128
+def : Pat<(v4f32 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
+def : Pat<(v2f64 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
+def : Pat<(v16i8 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
+def : Pat<(v8i16 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
+def : Pat<(v4i32 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
+def : Pat<(v2i64 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
+def : Pat<(f128 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
+
+// Load sign-extended half-word
+def LDRSHWro : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh",
+ [(set GPR32:$Rt, (sextloadi16 ro_indexed16:$addr))]>;
+def LDRSHXro : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh",
+ [(set GPR64:$Rt, (sextloadi16 ro_indexed16:$addr))]>;
+
+// Load sign-extended byte
+def LDRSBWro : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb",
+ [(set GPR32:$Rt, (sextloadi8 ro_indexed8:$addr))]>;
+def LDRSBXro : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb",
+ [(set GPR64:$Rt, (sextloadi8 ro_indexed8:$addr))]>;
+
+// Load sign-extended word
+def LDRSWro : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw",
+ [(set GPR64:$Rt, (sextloadi32 ro_indexed32:$addr))]>;
+
+// Pre-fetch.
+def PRFMro : PrefetchRO<0b11, 0, 0b10, "prfm",
+ [(ARM64Prefetch imm:$Rt, ro_indexed64:$addr)]>;
+
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 ro_indexed8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>;
+def : Pat<(i64 (zextloadi16 ro_indexed16:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRHHro ro_indexed16:$addr), sub_32)>;
+
+// zextloadi1 -> zextloadi8
+def : Pat<(i32 (zextloadi1 ro_indexed8:$addr)), (LDRBBro ro_indexed8:$addr)>;
+def : Pat<(i64 (zextloadi1 ro_indexed8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>;
+
+// extload -> zextload
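+// An any-extending load leaves the high bits unspecified, so the
+// zero-extending forms below are a valid way to implement it.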
+def : Pat<(i32 (extloadi16 ro_indexed16:$addr)), (LDRHHro ro_indexed16:$addr)>;
+def : Pat<(i32 (extloadi8 ro_indexed8:$addr)), (LDRBBro ro_indexed8:$addr)>;
+def : Pat<(i32 (extloadi1 ro_indexed8:$addr)), (LDRBBro ro_indexed8:$addr)>;
+def : Pat<(i64 (extloadi32 ro_indexed32:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRWro ro_indexed32:$addr), sub_32)>;
+def : Pat<(i64 (extloadi16 ro_indexed16:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRHHro ro_indexed16:$addr), sub_32)>;
+def : Pat<(i64 (extloadi8 ro_indexed8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>;
+def : Pat<(i64 (extloadi1 ro_indexed8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>;
+
+} // AddedComplexity = 10
+
+//---
+// (unsigned immediate)
+//---
+def LDRXui : LoadUI<0b11, 0, 0b01, GPR64, am_indexed64, "ldr",
+ [(set GPR64:$Rt, (load am_indexed64:$addr))]>;
+def LDRWui : LoadUI<0b10, 0, 0b01, GPR32, am_indexed32, "ldr",
+ [(set GPR32:$Rt, (load am_indexed32:$addr))]>;
+def LDRBui : LoadUI<0b00, 1, 0b01, FPR8, am_indexed8, "ldr",
+ [(set FPR8:$Rt, (load am_indexed8:$addr))]>;
+def LDRHui : LoadUI<0b01, 1, 0b01, FPR16, am_indexed16, "ldr",
+ [(set FPR16:$Rt, (load am_indexed16:$addr))]>;
+def LDRSui : LoadUI<0b10, 1, 0b01, FPR32, am_indexed32, "ldr",
+ [(set (f32 FPR32:$Rt), (load am_indexed32:$addr))]>;
+def LDRDui : LoadUI<0b11, 1, 0b01, FPR64, am_indexed64, "ldr",
+ [(set (f64 FPR64:$Rt), (load am_indexed64:$addr))]>;
+def LDRQui : LoadUI<0b00, 1, 0b11, FPR128, am_indexed128, "ldr",
+ [(set (f128 FPR128:$Rt), (load am_indexed128:$addr))]>;
+
+// For regular loads, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+def : Pat <(v8i8 (scalar_to_vector (i32 (extloadi8 am_indexed8:$addr)))),
+ (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
+ (LDRBui am_indexed8:$addr), bsub)>;
+def : Pat <(v16i8 (scalar_to_vector (i32 (extloadi8 am_indexed8:$addr)))),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (LDRBui am_indexed8:$addr), bsub)>;
+def : Pat <(v4i16 (scalar_to_vector (i32 (extloadi16 am_indexed16:$addr)))),
+ (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+ (LDRHui am_indexed16:$addr), hsub)>;
+def : Pat <(v8i16 (scalar_to_vector (i32 (extloadi16 am_indexed16:$addr)))),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+ (LDRHui am_indexed16:$addr), hsub)>;
+def : Pat <(v2i32 (scalar_to_vector (i32 (load am_indexed32:$addr)))),
+ (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+ (LDRSui am_indexed32:$addr), ssub)>;
+def : Pat <(v4i32 (scalar_to_vector (i32 (load am_indexed32:$addr)))),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ (LDRSui am_indexed32:$addr), ssub)>;
+def : Pat <(v1i64 (scalar_to_vector (i64 (load am_indexed64:$addr)))),
+ (LDRDui am_indexed64:$addr)>;
+def : Pat <(v2i64 (scalar_to_vector (i64 (load am_indexed64:$addr)))),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+ (LDRDui am_indexed64:$addr), dsub)>;
+
+// Match all 64-bit-wide loads whose type is compatible with FPR64
+def : Pat<(v2f32 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>;
+def : Pat<(v1f64 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>;
+def : Pat<(v8i8 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>;
+def : Pat<(v4i16 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>;
+def : Pat<(v2i32 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>;
+def : Pat<(v1i64 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>;
+
+// Match all 128-bit-wide loads whose type is compatible with FPR128
+def : Pat<(v4f32 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
+def : Pat<(v2f64 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
+def : Pat<(v16i8 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
+def : Pat<(v8i16 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
+def : Pat<(v4i32 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
+def : Pat<(v2i64 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
+def : Pat<(f128 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
+
+def LDRHHui : LoadUI<0b01, 0, 0b01, GPR32, am_indexed16, "ldrh",
+ [(set GPR32:$Rt, (zextloadi16 am_indexed16:$addr))]>;
+def LDRBBui : LoadUI<0b00, 0, 0b01, GPR32, am_indexed8, "ldrb",
+ [(set GPR32:$Rt, (zextloadi8 am_indexed8:$addr))]>;
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 am_indexed8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>;
+def : Pat<(i64 (zextloadi16 am_indexed16:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRHHui am_indexed16:$addr), sub_32)>;
+
+// zextloadi1 -> zextloadi8
+def : Pat<(i32 (zextloadi1 am_indexed8:$addr)), (LDRBBui am_indexed8:$addr)>;
+def : Pat<(i64 (zextloadi1 am_indexed8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>;
+
+// extload -> zextload
+def : Pat<(i32 (extloadi16 am_indexed16:$addr)), (LDRHHui am_indexed16:$addr)>;
+def : Pat<(i32 (extloadi8 am_indexed8:$addr)), (LDRBBui am_indexed8:$addr)>;
+def : Pat<(i32 (extloadi1 am_indexed8:$addr)), (LDRBBui am_indexed8:$addr)>;
+def : Pat<(i64 (extloadi32 am_indexed32:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRWui am_indexed32:$addr), sub_32)>;
+def : Pat<(i64 (extloadi16 am_indexed16:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRHHui am_indexed16:$addr), sub_32)>;
+def : Pat<(i64 (extloadi8 am_indexed8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>;
+def : Pat<(i64 (extloadi1 am_indexed8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>;
+
+// load sign-extended half-word
+def LDRSHWui : LoadUI<0b01, 0, 0b11, GPR32, am_indexed16, "ldrsh",
+ [(set GPR32:$Rt, (sextloadi16 am_indexed16:$addr))]>;
+def LDRSHXui : LoadUI<0b01, 0, 0b10, GPR64, am_indexed16, "ldrsh",
+ [(set GPR64:$Rt, (sextloadi16 am_indexed16:$addr))]>;
+
+// load sign-extended byte
+def LDRSBWui : LoadUI<0b00, 0, 0b11, GPR32, am_indexed8, "ldrsb",
+ [(set GPR32:$Rt, (sextloadi8 am_indexed8:$addr))]>;
+def LDRSBXui : LoadUI<0b00, 0, 0b10, GPR64, am_indexed8, "ldrsb",
+ [(set GPR64:$Rt, (sextloadi8 am_indexed8:$addr))]>;
+
+// load sign-extended word
+def LDRSWui : LoadUI<0b10, 0, 0b10, GPR64, am_indexed32, "ldrsw",
+ [(set GPR64:$Rt, (sextloadi32 am_indexed32:$addr))]>;
+
+// load zero-extended word
+def : Pat<(i64 (zextloadi32 am_indexed32:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDRWui am_indexed32:$addr), sub_32)>;
+
+// Pre-fetch.
+def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
+ [(ARM64Prefetch imm:$Rt, am_indexed64:$addr)]>;
+
+//---
+// (literal)
+def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">;
+def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">;
+def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">;
+def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">;
+def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">;
+
+// load sign-extended word
+def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">;
+
+// prefetch
+def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
+// [(ARM64Prefetch imm:$Rt, tglobaladdr:$label)]>;
+
+//---
+// (unscaled immediate)
+def LDURXi : LoadUnscaled<0b11, 0, 0b01, GPR64, am_unscaled64, "ldur",
+ [(set GPR64:$Rt, (load am_unscaled64:$addr))]>;
+def LDURWi : LoadUnscaled<0b10, 0, 0b01, GPR32, am_unscaled32, "ldur",
+ [(set GPR32:$Rt, (load am_unscaled32:$addr))]>;
+def LDURBi : LoadUnscaled<0b00, 1, 0b01, FPR8, am_unscaled8, "ldur",
+ [(set FPR8:$Rt, (load am_unscaled8:$addr))]>;
+def LDURHi : LoadUnscaled<0b01, 1, 0b01, FPR16, am_unscaled16, "ldur",
+ [(set FPR16:$Rt, (load am_unscaled16:$addr))]>;
+def LDURSi : LoadUnscaled<0b10, 1, 0b01, FPR32, am_unscaled32, "ldur",
+ [(set (f32 FPR32:$Rt), (load am_unscaled32:$addr))]>;
+def LDURDi : LoadUnscaled<0b11, 1, 0b01, FPR64, am_unscaled64, "ldur",
+ [(set (f64 FPR64:$Rt), (load am_unscaled64:$addr))]>;
+def LDURQi : LoadUnscaled<0b00, 1, 0b11, FPR128, am_unscaled128, "ldur",
+ [(set (v2f64 FPR128:$Rt), (load am_unscaled128:$addr))]>;
+
+def LDURHHi
+ : LoadUnscaled<0b01, 0, 0b01, GPR32, am_unscaled16, "ldurh",
+ [(set GPR32:$Rt, (zextloadi16 am_unscaled16:$addr))]>;
+def LDURBBi
+ : LoadUnscaled<0b00, 0, 0b01, GPR32, am_unscaled8, "ldurb",
+ [(set GPR32:$Rt, (zextloadi8 am_unscaled8:$addr))]>;
+
+// Match all 64-bit-wide loads whose type is compatible with FPR64
+def : Pat<(v2f32 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>;
+def : Pat<(v1f64 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>;
+def : Pat<(v8i8 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>;
+def : Pat<(v4i16 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>;
+def : Pat<(v2i32 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>;
+def : Pat<(v1i64 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>;
+
+// Match all 128-bit-wide loads whose type is compatible with FPR128
+def : Pat<(v4f32 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
+def : Pat<(v2f64 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
+def : Pat<(v16i8 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
+def : Pat<(v8i16 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
+def : Pat<(v4i32 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
+def : Pat<(v2i64 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
+def : Pat<(f128 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
+
+// anyext -> zext
+def : Pat<(i32 (extloadi16 am_unscaled16:$addr)), (LDURHHi am_unscaled16:$addr)>;
+def : Pat<(i32 (extloadi8 am_unscaled8:$addr)), (LDURBBi am_unscaled8:$addr)>;
+def : Pat<(i32 (extloadi1 am_unscaled8:$addr)), (LDURBBi am_unscaled8:$addr)>;
+def : Pat<(i64 (extloadi32 am_unscaled32:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDURWi am_unscaled32:$addr), sub_32)>;
+def : Pat<(i64 (extloadi16 am_unscaled16:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDURHHi am_unscaled16:$addr), sub_32)>;
+def : Pat<(i64 (extloadi8 am_unscaled8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
+def : Pat<(i64 (extloadi1 am_unscaled8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
+// unscaled zext
+def : Pat<(i32 (zextloadi16 am_unscaled16:$addr)),
+ (LDURHHi am_unscaled16:$addr)>;
+def : Pat<(i32 (zextloadi8 am_unscaled8:$addr)),
+ (LDURBBi am_unscaled8:$addr)>;
+def : Pat<(i32 (zextloadi1 am_unscaled8:$addr)),
+ (LDURBBi am_unscaled8:$addr)>;
+def : Pat<(i64 (zextloadi32 am_unscaled32:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDURWi am_unscaled32:$addr), sub_32)>;
+def : Pat<(i64 (zextloadi16 am_unscaled16:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDURHHi am_unscaled16:$addr), sub_32)>;
+def : Pat<(i64 (zextloadi8 am_unscaled8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
+def : Pat<(i64 (zextloadi1 am_unscaled8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
+
+
+//---
+// LDR mnemonics fall back to LDUR for negative or unaligned offsets.
+
+// Define new assembler match classes as we want to only match these when
+// they don't otherwise match the scaled addressing mode for LDR/STR. Don't
+// associate a DiagnosticType either, as we want the diagnostic for the
+// canonical form (the scaled operand) to take precedence.
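+// For example, "ldr x0, [x1, #-8]" has no scaled unsigned-offset (LDRXui)
+// encoding, so the alias accepts it and emits the unscaled LDURXi form
+// instead.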
+def MemoryUnscaledFB8Operand : AsmOperandClass {
+ let Name = "MemoryUnscaledFB8";
+ let RenderMethod = "addMemoryUnscaledOperands";
+}
+def MemoryUnscaledFB16Operand : AsmOperandClass {
+ let Name = "MemoryUnscaledFB16";
+ let RenderMethod = "addMemoryUnscaledOperands";
+}
+def MemoryUnscaledFB32Operand : AsmOperandClass {
+ let Name = "MemoryUnscaledFB32";
+ let RenderMethod = "addMemoryUnscaledOperands";
+}
+def MemoryUnscaledFB64Operand : AsmOperandClass {
+ let Name = "MemoryUnscaledFB64";
+ let RenderMethod = "addMemoryUnscaledOperands";
+}
+def MemoryUnscaledFB128Operand : AsmOperandClass {
+ let Name = "MemoryUnscaledFB128";
+ let RenderMethod = "addMemoryUnscaledOperands";
+}
+def am_unscaled_fb8 : Operand<i64> {
+ let ParserMatchClass = MemoryUnscaledFB8Operand;
+ let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
+}
+def am_unscaled_fb16 : Operand<i64> {
+ let ParserMatchClass = MemoryUnscaledFB16Operand;
+ let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
+}
+def am_unscaled_fb32 : Operand<i64> {
+ let ParserMatchClass = MemoryUnscaledFB32Operand;
+ let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
+}
+def am_unscaled_fb64 : Operand<i64> {
+ let ParserMatchClass = MemoryUnscaledFB64Operand;
+ let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
+}
+def am_unscaled_fb128 : Operand<i64> {
+ let ParserMatchClass = MemoryUnscaledFB128Operand;
+ let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
+}
+def : InstAlias<"ldr $Rt, $addr", (LDURXi GPR64:$Rt, am_unscaled_fb64:$addr)>;
+def : InstAlias<"ldr $Rt, $addr", (LDURWi GPR32:$Rt, am_unscaled_fb32:$addr)>;
+def : InstAlias<"ldr $Rt, $addr", (LDURBi FPR8:$Rt, am_unscaled_fb8:$addr)>;
+def : InstAlias<"ldr $Rt, $addr", (LDURHi FPR16:$Rt, am_unscaled_fb16:$addr)>;
+def : InstAlias<"ldr $Rt, $addr", (LDURSi FPR32:$Rt, am_unscaled_fb32:$addr)>;
+def : InstAlias<"ldr $Rt, $addr", (LDURDi FPR64:$Rt, am_unscaled_fb64:$addr)>;
+def : InstAlias<"ldr $Rt, $addr", (LDURQi FPR128:$Rt, am_unscaled_fb128:$addr)>;
+
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 am_unscaled8:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
+def : Pat<(i64 (zextloadi16 am_unscaled16:$addr)),
+ (SUBREG_TO_REG (i64 0), (LDURHHi am_unscaled16:$addr), sub_32)>;
+
+// load sign-extended half-word
+def LDURSHWi
+ : LoadUnscaled<0b01, 0, 0b11, GPR32, am_unscaled16, "ldursh",
+ [(set GPR32:$Rt, (sextloadi16 am_unscaled16:$addr))]>;
+def LDURSHXi
+ : LoadUnscaled<0b01, 0, 0b10, GPR64, am_unscaled16, "ldursh",
+ [(set GPR64:$Rt, (sextloadi16 am_unscaled16:$addr))]>;
+
+// load sign-extended byte
+def LDURSBWi
+ : LoadUnscaled<0b00, 0, 0b11, GPR32, am_unscaled8, "ldursb",
+ [(set GPR32:$Rt, (sextloadi8 am_unscaled8:$addr))]>;
+def LDURSBXi
+ : LoadUnscaled<0b00, 0, 0b10, GPR64, am_unscaled8, "ldursb",
+ [(set GPR64:$Rt, (sextloadi8 am_unscaled8:$addr))]>;
+
+// load sign-extended word
+def LDURSWi
+ : LoadUnscaled<0b10, 0, 0b10, GPR64, am_unscaled32, "ldursw",
+ [(set GPR64:$Rt, (sextloadi32 am_unscaled32:$addr))]>;
+
+// Zero- and sign-extending aliases from generic LDR* mnemonics to LDUR*.
+def : InstAlias<"ldrb $Rt, $addr", (LDURBBi GPR32:$Rt, am_unscaled_fb8:$addr)>;
+def : InstAlias<"ldrh $Rt, $addr", (LDURHHi GPR32:$Rt, am_unscaled_fb16:$addr)>;
+def : InstAlias<"ldrsb $Rt, $addr", (LDURSBWi GPR32:$Rt, am_unscaled_fb8:$addr)>;
+def : InstAlias<"ldrsb $Rt, $addr", (LDURSBXi GPR64:$Rt, am_unscaled_fb8:$addr)>;
+def : InstAlias<"ldrsh $Rt, $addr", (LDURSHWi GPR32:$Rt, am_unscaled_fb16:$addr)>;
+def : InstAlias<"ldrsh $Rt, $addr", (LDURSHXi GPR64:$Rt, am_unscaled_fb16:$addr)>;
+def : InstAlias<"ldrsw $Rt, $addr", (LDURSWi GPR64:$Rt, am_unscaled_fb32:$addr)>;
+
+// Pre-fetch.
+def PRFUMi : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
+ [(ARM64Prefetch imm:$Rt, am_unscaled64:$addr)]>;
+
+//---
+// (unscaled immediate, unprivileged)
+def LDTRXi : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">;
+def LDTRWi : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">;
+
+def LDTRHi : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">;
+def LDTRBi : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">;
+
+// load sign-extended half-word
+def LDTRSHWi : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">;
+def LDTRSHXi : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">;
+
+// load sign-extended byte
+def LDTRSBWi : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">;
+def LDTRSBXi : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">;
+
+// load sign-extended word
+def LDTRSWi : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;
+
+//---
+// (immediate pre-indexed)
+def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">;
+def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">;
+def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">;
+def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">;
+def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">;
+def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">;
+def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">;
+
+// load sign-extended half-word
+def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
+def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+
+// load sign-extended byte
+def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
+def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+
+// load zero-extended byte / half-word
+def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">;
+def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+
+// load sign-extended word
+def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+
+// ISel pseudos and patterns. See expanded comment on LoadPreIdxPseudo.
+def LDRDpre_isel : LoadPreIdxPseudo<FPR64>;
+def LDRSpre_isel : LoadPreIdxPseudo<FPR32>;
+def LDRXpre_isel : LoadPreIdxPseudo<GPR64>;
+def LDRWpre_isel : LoadPreIdxPseudo<GPR32>;
+def LDRHHpre_isel : LoadPreIdxPseudo<GPR32>;
+def LDRBBpre_isel : LoadPreIdxPseudo<GPR32>;
+
+def LDRSWpre_isel : LoadPreIdxPseudo<GPR64>;
+def LDRSHWpre_isel : LoadPreIdxPseudo<GPR32>;
+def LDRSHXpre_isel : LoadPreIdxPseudo<GPR64>;
+def LDRSBWpre_isel : LoadPreIdxPseudo<GPR32>;
+def LDRSBXpre_isel : LoadPreIdxPseudo<GPR64>;
+
+//---
+// (immediate post-indexed)
+def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">;
+def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">;
+def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">;
+def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">;
+def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">;
+def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">;
+def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">;
+
+// load sign-extended half-word
+def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
+def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+
+// load sign-extended byte
+def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
+def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+
+// load zero-extended byte / half-word
+def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">;
+def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+
+// load sign-extended word
+def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+
+// ISel pseudos and patterns. See expanded comment on LoadPostIdxPseudo.
+def LDRDpost_isel : LoadPostIdxPseudo<FPR64>;
+def LDRSpost_isel : LoadPostIdxPseudo<FPR32>;
+def LDRXpost_isel : LoadPostIdxPseudo<GPR64>;
+def LDRWpost_isel : LoadPostIdxPseudo<GPR32>;
+def LDRHHpost_isel : LoadPostIdxPseudo<GPR32>;
+def LDRBBpost_isel : LoadPostIdxPseudo<GPR32>;
+
+def LDRSWpost_isel : LoadPostIdxPseudo<GPR64>;
+def LDRSHWpost_isel : LoadPostIdxPseudo<GPR32>;
+def LDRSHXpost_isel : LoadPostIdxPseudo<GPR64>;
+def LDRSBWpost_isel : LoadPostIdxPseudo<GPR32>;
+def LDRSBXpost_isel : LoadPostIdxPseudo<GPR64>;
+
+//===----------------------------------------------------------------------===//
+// Store instructions.
+//===----------------------------------------------------------------------===//
+
+// Pair (indexed, offset)
+// FIXME: Use dedicated range-checked addressing mode operand here.
+def STPWi : StorePairOffset<0b00, 0, GPR32, am_indexed32simm7, "stp">;
+def STPXi : StorePairOffset<0b10, 0, GPR64, am_indexed64simm7, "stp">;
+def STPSi : StorePairOffset<0b00, 1, FPR32, am_indexed32simm7, "stp">;
+def STPDi : StorePairOffset<0b01, 1, FPR64, am_indexed64simm7, "stp">;
+def STPQi : StorePairOffset<0b10, 1, FPR128, am_indexed128simm7, "stp">;
+
+// Pair (pre-indexed)
+def STPWpre : StorePairPreIdx<0b00, 0, GPR32, am_indexed32simm7, "stp">;
+def STPXpre : StorePairPreIdx<0b10, 0, GPR64, am_indexed64simm7, "stp">;
+def STPSpre : StorePairPreIdx<0b00, 1, FPR32, am_indexed32simm7, "stp">;
+def STPDpre : StorePairPreIdx<0b01, 1, FPR64, am_indexed64simm7, "stp">;
+def STPQpre : StorePairPreIdx<0b10, 1, FPR128, am_indexed128simm7, "stp">;
+
+// Pair (post-indexed)
+def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (no allocate)
+def STNPWi : StorePairNoAlloc<0b00, 0, GPR32, am_indexed32simm7, "stnp">;
+def STNPXi : StorePairNoAlloc<0b10, 0, GPR64, am_indexed64simm7, "stnp">;
+def STNPSi : StorePairNoAlloc<0b00, 1, FPR32, am_indexed32simm7, "stnp">;
+def STNPDi : StorePairNoAlloc<0b01, 1, FPR64, am_indexed64simm7, "stnp">;
+def STNPQi : StorePairNoAlloc<0b10, 1, FPR128, am_indexed128simm7, "stnp">;
+
+//---
+// (Register offset)
+
+let AddedComplexity = 10 in {
+
+// Integer
+def STRHHro : Store16RO<0b01, 0, 0b00, GPR32, "strh",
+ [(truncstorei16 GPR32:$Rt, ro_indexed16:$addr)]>;
+def STRBBro : Store8RO<0b00, 0, 0b00, GPR32, "strb",
+ [(truncstorei8 GPR32:$Rt, ro_indexed8:$addr)]>;
+def STRWro : Store32RO<0b10, 0, 0b00, GPR32, "str",
+ [(store GPR32:$Rt, ro_indexed32:$addr)]>;
+def STRXro : Store64RO<0b11, 0, 0b00, GPR64, "str",
+ [(store GPR64:$Rt, ro_indexed64:$addr)]>;
+
+// truncstore i64
+def : Pat<(truncstorei8 GPR64:$Rt, ro_indexed8:$addr),
+ (STRBBro (EXTRACT_SUBREG GPR64:$Rt, sub_32), ro_indexed8:$addr)>;
+def : Pat<(truncstorei16 GPR64:$Rt, ro_indexed16:$addr),
+ (STRHHro (EXTRACT_SUBREG GPR64:$Rt, sub_32), ro_indexed16:$addr)>;
+def : Pat<(truncstorei32 GPR64:$Rt, ro_indexed32:$addr),
+ (STRWro (EXTRACT_SUBREG GPR64:$Rt, sub_32), ro_indexed32:$addr)>;
+
+
+// Floating-point
+def STRBro : Store8RO<0b00, 1, 0b00, FPR8, "str",
+ [(store FPR8:$Rt, ro_indexed8:$addr)]>;
+def STRHro : Store16RO<0b01, 1, 0b00, FPR16, "str",
+ [(store FPR16:$Rt, ro_indexed16:$addr)]>;
+def STRSro : Store32RO<0b10, 1, 0b00, FPR32, "str",
+ [(store (f32 FPR32:$Rt), ro_indexed32:$addr)]>;
+def STRDro : Store64RO<0b11, 1, 0b00, FPR64, "str",
+ [(store (f64 FPR64:$Rt), ro_indexed64:$addr)]>;
+def STRQro : Store128RO<0b00, 1, 0b10, FPR128, "str", []> {
+ let mayStore = 1;
+}
+
+// Match all stores of 64-bit width whose type is compatible with FPR64
+def : Pat<(store (v2f32 FPR64:$Rn), ro_indexed64:$addr),
+ (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
+def : Pat<(store (v1f64 FPR64:$Rn), ro_indexed64:$addr),
+ (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
+def : Pat<(store (v8i8 FPR64:$Rn), ro_indexed64:$addr),
+ (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
+def : Pat<(store (v4i16 FPR64:$Rn), ro_indexed64:$addr),
+ (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
+def : Pat<(store (v2i32 FPR64:$Rn), ro_indexed64:$addr),
+ (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
+def : Pat<(store (v1i64 FPR64:$Rn), ro_indexed64:$addr),
+ (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
+
+// Match all stores of 128-bit width whose type is compatible with FPR128
+def : Pat<(store (v4f32 FPR128:$Rn), ro_indexed128:$addr),
+ (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
+def : Pat<(store (v2f64 FPR128:$Rn), ro_indexed128:$addr),
+ (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
+def : Pat<(store (v16i8 FPR128:$Rn), ro_indexed128:$addr),
+ (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
+def : Pat<(store (v8i16 FPR128:$Rn), ro_indexed128:$addr),
+ (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
+def : Pat<(store (v4i32 FPR128:$Rn), ro_indexed128:$addr),
+ (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
+def : Pat<(store (v2i64 FPR128:$Rn), ro_indexed128:$addr),
+ (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
+def : Pat<(store (f128 FPR128:$Rn), ro_indexed128:$addr),
+ (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
+
+//---
+// (unsigned immediate)
+def STRXui : StoreUI<0b11, 0, 0b00, GPR64, am_indexed64, "str",
+ [(store GPR64:$Rt, am_indexed64:$addr)]>;
+def STRWui : StoreUI<0b10, 0, 0b00, GPR32, am_indexed32, "str",
+ [(store GPR32:$Rt, am_indexed32:$addr)]>;
+def STRBui : StoreUI<0b00, 1, 0b00, FPR8, am_indexed8, "str",
+ [(store FPR8:$Rt, am_indexed8:$addr)]>;
+def STRHui : StoreUI<0b01, 1, 0b00, FPR16, am_indexed16, "str",
+ [(store FPR16:$Rt, am_indexed16:$addr)]>;
+def STRSui : StoreUI<0b10, 1, 0b00, FPR32, am_indexed32, "str",
+ [(store (f32 FPR32:$Rt), am_indexed32:$addr)]>;
+def STRDui : StoreUI<0b11, 1, 0b00, FPR64, am_indexed64, "str",
+ [(store (f64 FPR64:$Rt), am_indexed64:$addr)]>;
+def STRQui : StoreUI<0b00, 1, 0b10, FPR128, am_indexed128, "str", []> {
+ let mayStore = 1;
+}
+
+// Match all stores of 64-bit width whose type is compatible with FPR64
+def : Pat<(store (v2f32 FPR64:$Rn), am_indexed64:$addr),
+ (STRDui FPR64:$Rn, am_indexed64:$addr)>;
+def : Pat<(store (v1f64 FPR64:$Rn), am_indexed64:$addr),
+ (STRDui FPR64:$Rn, am_indexed64:$addr)>;
+def : Pat<(store (v8i8 FPR64:$Rn), am_indexed64:$addr),
+ (STRDui FPR64:$Rn, am_indexed64:$addr)>;
+def : Pat<(store (v4i16 FPR64:$Rn), am_indexed64:$addr),
+ (STRDui FPR64:$Rn, am_indexed64:$addr)>;
+def : Pat<(store (v2i32 FPR64:$Rn), am_indexed64:$addr),
+ (STRDui FPR64:$Rn, am_indexed64:$addr)>;
+def : Pat<(store (v1i64 FPR64:$Rn), am_indexed64:$addr),
+ (STRDui FPR64:$Rn, am_indexed64:$addr)>;
+
+// Match all stores of 128-bit width whose type is compatible with FPR128
+def : Pat<(store (v4f32 FPR128:$Rn), am_indexed128:$addr),
+ (STRQui FPR128:$Rn, am_indexed128:$addr)>;
+def : Pat<(store (v2f64 FPR128:$Rn), am_indexed128:$addr),
+ (STRQui FPR128:$Rn, am_indexed128:$addr)>;
+def : Pat<(store (v16i8 FPR128:$Rn), am_indexed128:$addr),
+ (STRQui FPR128:$Rn, am_indexed128:$addr)>;
+def : Pat<(store (v8i16 FPR128:$Rn), am_indexed128:$addr),
+ (STRQui FPR128:$Rn, am_indexed128:$addr)>;
+def : Pat<(store (v4i32 FPR128:$Rn), am_indexed128:$addr),
+ (STRQui FPR128:$Rn, am_indexed128:$addr)>;
+def : Pat<(store (v2i64 FPR128:$Rn), am_indexed128:$addr),
+ (STRQui FPR128:$Rn, am_indexed128:$addr)>;
+def : Pat<(store (f128 FPR128:$Rn), am_indexed128:$addr),
+ (STRQui FPR128:$Rn, am_indexed128:$addr)>;
+
+def STRHHui : StoreUI<0b01, 0, 0b00, GPR32, am_indexed16, "strh",
+ [(truncstorei16 GPR32:$Rt, am_indexed16:$addr)]>;
+def STRBBui : StoreUI<0b00, 0, 0b00, GPR32, am_indexed8, "strb",
+ [(truncstorei8 GPR32:$Rt, am_indexed8:$addr)]>;
+
+// truncstore i64
+def : Pat<(truncstorei32 GPR64:$Rt, am_indexed32:$addr),
+ (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_indexed32:$addr)>;
+def : Pat<(truncstorei16 GPR64:$Rt, am_indexed16:$addr),
+ (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_indexed16:$addr)>;
+def : Pat<(truncstorei8 GPR64:$Rt, am_indexed8:$addr),
+ (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_indexed8:$addr)>;
+
+} // AddedComplexity = 10
+
+//---
+// (unscaled immediate)
+def STURXi : StoreUnscaled<0b11, 0, 0b00, GPR64, am_unscaled64, "stur",
+ [(store GPR64:$Rt, am_unscaled64:$addr)]>;
+def STURWi : StoreUnscaled<0b10, 0, 0b00, GPR32, am_unscaled32, "stur",
+ [(store GPR32:$Rt, am_unscaled32:$addr)]>;
+def STURBi : StoreUnscaled<0b00, 1, 0b00, FPR8, am_unscaled8, "stur",
+ [(store FPR8:$Rt, am_unscaled8:$addr)]>;
+def STURHi : StoreUnscaled<0b01, 1, 0b00, FPR16, am_unscaled16, "stur",
+ [(store FPR16:$Rt, am_unscaled16:$addr)]>;
+def STURSi : StoreUnscaled<0b10, 1, 0b00, FPR32, am_unscaled32, "stur",
+ [(store (f32 FPR32:$Rt), am_unscaled32:$addr)]>;
+def STURDi : StoreUnscaled<0b11, 1, 0b00, FPR64, am_unscaled64, "stur",
+ [(store (f64 FPR64:$Rt), am_unscaled64:$addr)]>;
+def STURQi : StoreUnscaled<0b00, 1, 0b10, FPR128, am_unscaled128, "stur",
+ [(store (v2f64 FPR128:$Rt), am_unscaled128:$addr)]>;
+def STURHHi : StoreUnscaled<0b01, 0, 0b00, GPR32, am_unscaled16, "sturh",
+ [(truncstorei16 GPR32:$Rt, am_unscaled16:$addr)]>;
+def STURBBi : StoreUnscaled<0b00, 0, 0b00, GPR32, am_unscaled8, "sturb",
+ [(truncstorei8 GPR32:$Rt, am_unscaled8:$addr)]>;
+
+// Match all stores of 64-bit width whose type is compatible with FPR64
+def : Pat<(store (v2f32 FPR64:$Rn), am_unscaled64:$addr),
+ (STURDi FPR64:$Rn, am_unscaled64:$addr)>;
+def : Pat<(store (v1f64 FPR64:$Rn), am_unscaled64:$addr),
+ (STURDi FPR64:$Rn, am_unscaled64:$addr)>;
+def : Pat<(store (v8i8 FPR64:$Rn), am_unscaled64:$addr),
+ (STURDi FPR64:$Rn, am_unscaled64:$addr)>;
+def : Pat<(store (v4i16 FPR64:$Rn), am_unscaled64:$addr),
+ (STURDi FPR64:$Rn, am_unscaled64:$addr)>;
+def : Pat<(store (v2i32 FPR64:$Rn), am_unscaled64:$addr),
+ (STURDi FPR64:$Rn, am_unscaled64:$addr)>;
+def : Pat<(store (v1i64 FPR64:$Rn), am_unscaled64:$addr),
+ (STURDi FPR64:$Rn, am_unscaled64:$addr)>;
+
+// Match all stores of 128-bit width whose type is compatible with FPR128
+def : Pat<(store (v4f32 FPR128:$Rn), am_unscaled128:$addr),
+ (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
+def : Pat<(store (v2f64 FPR128:$Rn), am_unscaled128:$addr),
+ (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
+def : Pat<(store (v16i8 FPR128:$Rn), am_unscaled128:$addr),
+ (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
+def : Pat<(store (v8i16 FPR128:$Rn), am_unscaled128:$addr),
+ (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
+def : Pat<(store (v4i32 FPR128:$Rn), am_unscaled128:$addr),
+ (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
+def : Pat<(store (v2i64 FPR128:$Rn), am_unscaled128:$addr),
+ (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
+def : Pat<(store (f128 FPR128:$Rn), am_unscaled128:$addr),
+ (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
+
+// unscaled i64 truncating stores
+def : Pat<(truncstorei32 GPR64:$Rt, am_unscaled32:$addr),
+ (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_unscaled32:$addr)>;
+def : Pat<(truncstorei16 GPR64:$Rt, am_unscaled16:$addr),
+ (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_unscaled16:$addr)>;
+def : Pat<(truncstorei8 GPR64:$Rt, am_unscaled8:$addr),
+ (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_unscaled8:$addr)>;
+
+//---
+// STR mnemonics fall back to STUR for negative or unaligned offsets.
+def : InstAlias<"str $Rt, $addr", (STURXi GPR64:$Rt, am_unscaled_fb64:$addr)>;
+def : InstAlias<"str $Rt, $addr", (STURWi GPR32:$Rt, am_unscaled_fb32:$addr)>;
+def : InstAlias<"str $Rt, $addr", (STURBi FPR8:$Rt, am_unscaled_fb8:$addr)>;
+def : InstAlias<"str $Rt, $addr", (STURHi FPR16:$Rt, am_unscaled_fb16:$addr)>;
+def : InstAlias<"str $Rt, $addr", (STURSi FPR32:$Rt, am_unscaled_fb32:$addr)>;
+def : InstAlias<"str $Rt, $addr", (STURDi FPR64:$Rt, am_unscaled_fb64:$addr)>;
+def : InstAlias<"str $Rt, $addr", (STURQi FPR128:$Rt, am_unscaled_fb128:$addr)>;
+
+def : InstAlias<"strb $Rt, $addr", (STURBBi GPR32:$Rt, am_unscaled_fb8:$addr)>;
+def : InstAlias<"strh $Rt, $addr", (STURHHi GPR32:$Rt, am_unscaled_fb16:$addr)>;
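+// For example, "str w0, [x1, #-1]" cannot use the scaled unsigned-immediate
+// STRWui encoding (the offset is negative and not a multiple of the access
+// size), so the alias above lets the STR mnemonic fall back to the unscaled
+// STURWi encoding.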
+
+//---
+// (unscaled immediate, unprivileged)
+def STTRWi : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">;
+def STTRXi : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">;
+
+def STTRHi : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">;
+def STTRBi : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;
+
+//---
+// (immediate pre-indexed)
+def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str">;
+def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str">;
+def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str">;
+def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str">;
+def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str">;
+def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str">;
+def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str">;
+
+def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb">;
+def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh">;
+
+// ISel pseudos and patterns. See expanded comment on StorePreIdxPseudo.
+defm STRDpre : StorePreIdxPseudo<FPR64, f64, pre_store>;
+defm STRSpre : StorePreIdxPseudo<FPR32, f32, pre_store>;
+defm STRXpre : StorePreIdxPseudo<GPR64, i64, pre_store>;
+defm STRWpre : StorePreIdxPseudo<GPR32, i32, pre_store>;
+defm STRHHpre : StorePreIdxPseudo<GPR32, i32, pre_truncsti16>;
+defm STRBBpre : StorePreIdxPseudo<GPR32, i32, pre_truncsti8>;
+// truncstore i64
+def : Pat<(pre_truncsti32 GPR64:$Rt, am_noindex:$addr, simm9:$off),
+ (STRWpre_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr,
+ simm9:$off)>;
+def : Pat<(pre_truncsti16 GPR64:$Rt, am_noindex:$addr, simm9:$off),
+ (STRHHpre_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr,
+ simm9:$off)>;
+def : Pat<(pre_truncsti8 GPR64:$Rt, am_noindex:$addr, simm9:$off),
+ (STRBBpre_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr,
+ simm9:$off)>;
+
+//---
+// (immediate post-indexed)
+def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str">;
+def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str">;
+def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str">;
+def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str">;
+def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str">;
+def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str">;
+def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str">;
+
+def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb">;
+def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh">;
+
+// ISel pseudos and patterns. See expanded comment on StorePostIdxPseudo.
+defm STRDpost : StorePostIdxPseudo<FPR64, f64, post_store, STRDpost>;
+defm STRSpost : StorePostIdxPseudo<FPR32, f32, post_store, STRSpost>;
+defm STRXpost : StorePostIdxPseudo<GPR64, i64, post_store, STRXpost>;
+defm STRWpost : StorePostIdxPseudo<GPR32, i32, post_store, STRWpost>;
+defm STRHHpost : StorePostIdxPseudo<GPR32, i32, post_truncsti16, STRHHpost>;
+defm STRBBpost : StorePostIdxPseudo<GPR32, i32, post_truncsti8, STRBBpost>;
+// truncstore i64
+def : Pat<(post_truncsti32 GPR64:$Rt, am_noindex:$addr, simm9:$off),
+ (STRWpost_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr,
+ simm9:$off)>;
+def : Pat<(post_truncsti16 GPR64:$Rt, am_noindex:$addr, simm9:$off),
+ (STRHHpost_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr,
+ simm9:$off)>;
+def : Pat<(post_truncsti8 GPR64:$Rt, am_noindex:$addr, simm9:$off),
+ (STRBBpost_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr,
+ simm9:$off)>;
+
+
+//===----------------------------------------------------------------------===//
+// Load/store exclusive instructions.
+//===----------------------------------------------------------------------===//
+
+def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">;
+def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">;
+def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">;
+def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">;
+
+def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">;
+def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">;
+def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">;
+def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">;
+
+def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">;
+def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">;
+def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">;
+def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">;
+
+def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">;
+def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">;
+def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">;
+def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">;
+
+def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">;
+def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">;
+def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">;
+def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">;
+
+def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">;
+def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">;
+def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">;
+def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">;
+
+def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">;
+def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">;
+
+def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">;
+def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">;
+
+def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">;
+def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">;
+
+def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">;
+def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">;
+
+//===----------------------------------------------------------------------===//
+// Scaled floating point to integer conversion instructions.
+//===----------------------------------------------------------------------===//
+
+defm FCVTAS : FPToInteger<0b00, 0b100, "fcvtas", int_arm64_neon_fcvtas>;
+defm FCVTAU : FPToInteger<0b00, 0b101, "fcvtau", int_arm64_neon_fcvtau>;
+defm FCVTMS : FPToInteger<0b10, 0b000, "fcvtms", int_arm64_neon_fcvtms>;
+defm FCVTMU : FPToInteger<0b10, 0b001, "fcvtmu", int_arm64_neon_fcvtmu>;
+defm FCVTNS : FPToInteger<0b00, 0b000, "fcvtns", int_arm64_neon_fcvtns>;
+defm FCVTNU : FPToInteger<0b00, 0b001, "fcvtnu", int_arm64_neon_fcvtnu>;
+defm FCVTPS : FPToInteger<0b01, 0b000, "fcvtps", int_arm64_neon_fcvtps>;
+defm FCVTPU : FPToInteger<0b01, 0b001, "fcvtpu", int_arm64_neon_fcvtpu>;
+defm FCVTZS : FPToInteger<0b11, 0b000, "fcvtzs", fp_to_sint>;
+defm FCVTZU : FPToInteger<0b11, 0b001, "fcvtzu", fp_to_uint>;
+let isCodeGenOnly = 1 in {
+defm FCVTZS_Int : FPToInteger<0b11, 0b000, "fcvtzs", int_arm64_neon_fcvtzs>;
+defm FCVTZU_Int : FPToInteger<0b11, 0b001, "fcvtzu", int_arm64_neon_fcvtzu>;
+}
+
+//===----------------------------------------------------------------------===//
+// Scaled integer to floating point conversion instructions.
+//===----------------------------------------------------------------------===//
+
+defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
+defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
+
+//===----------------------------------------------------------------------===//
+// Unscaled integer to floating point conversion instruction.
+//===----------------------------------------------------------------------===//
+
+defm FMOV : UnscaledConversion<"fmov">;
+
+def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>;
+def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>;
+
+def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
+def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
+def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
+def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
+def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
+
+def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), (FMOVDXr V64:$Vn)>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), (FMOVDXr V64:$Vn)>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (FMOVDXr V64:$Vn)>;
+def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), (FMOVDXr V64:$Vn)>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (FMOVDXr V64:$Vn)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), (FMOVDXr V64:$Vn)>;
+
+def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), (COPY_TO_REGCLASS GPR32:$Xn,
+ FPR32)>;
+def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), (COPY_TO_REGCLASS FPR32:$Xn,
+ GPR32)>;
+def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), (COPY_TO_REGCLASS GPR64:$Xn,
+ FPR64)>;
+def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), (COPY_TO_REGCLASS FPR64:$Xn,
+ GPR64)>;
+
+//===----------------------------------------------------------------------===//
+// Floating point conversion instruction.
+//===----------------------------------------------------------------------===//
+
+defm FCVT : FPConversion<"fcvt">;
+
+def : Pat<(f32_to_f16 FPR32:$Rn),
+ (i32 (COPY_TO_REGCLASS
+ (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)),
+ GPR32))>;
+
+def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn),
+ [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>;
+
+//===----------------------------------------------------------------------===//
+// Floating point single operand instructions.
+//===----------------------------------------------------------------------===//
+
+defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>;
+defm FMOV : SingleOperandFPData<0b0000, "fmov">;
+defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>;
+defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>;
+defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
+defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
+defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_arm64_neon_frintn>;
+defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
+
+def : Pat<(v1f64 (int_arm64_neon_frintn (v1f64 FPR64:$Rn))),
+ (FRINTNDr FPR64:$Rn)>;
+
+// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior
+// in the C spec. Setting hasSideEffects ensures it is not DCE'd.
+// <rdar://problem/13715968>
+// TODO: We should really model the FPSR flags correctly. This is really ugly.
+let hasSideEffects = 1 in {
+defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
+}
+
+defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
+
+let SchedRW = [WriteFDiv] in {
+defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating point two operand instructions.
+//===----------------------------------------------------------------------===//
+
+defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>;
+let SchedRW = [WriteFDiv] in {
+defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>;
+}
+defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_arm64_neon_fmaxnm>;
+defm FMAX : TwoOperandFPData<0b0100, "fmax", ARM64fmax>;
+defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_arm64_neon_fminnm>;
+defm FMIN : TwoOperandFPData<0b0101, "fmin", ARM64fmin>;
+let SchedRW = [WriteFMul] in {
+defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>;
+defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
+}
+defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>;
+
+def : Pat<(v1f64 (ARM64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (ARM64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMINDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (int_arm64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (int_arm64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
+
+//===----------------------------------------------------------------------===//
+// Floating point three operand instructions.
+//===----------------------------------------------------------------------===//
+
+defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>;
+defm FMSUB : ThreeOperandFPData<0, 1, "fmsub",
+ TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
+defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
+ TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
+defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
+ TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
+def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Rd)),
+ (FMSUBSrrr FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+
+def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Rd)),
+ (FMSUBDrrr FPR64:$Rd, FPR64:$Rn, FPR64:$Rm)>;
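+// Algebraically, fma(-Rn, Rm, Rd) == Rd - Rn*Rm, which is exactly the FMSUB
+// operation, so the explicit fneg folds away during instruction selection.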
+
+//===----------------------------------------------------------------------===//
+// Floating point comparison instructions.
+//===----------------------------------------------------------------------===//
+
+defm FCMPE : FPComparison<1, "fcmpe">;
+defm FCMP : FPComparison<0, "fcmp", ARM64fcmp>;
+
+//===----------------------------------------------------------------------===//
+// Floating point conditional comparison instructions.
+//===----------------------------------------------------------------------===//
+
+defm FCCMPE : FPCondComparison<1, "fccmpe">;
+defm FCCMP : FPCondComparison<0, "fccmp">;
+
+//===----------------------------------------------------------------------===//
+// Floating point conditional select instruction.
+//===----------------------------------------------------------------------===//
+
+defm FCSEL : FPCondSelect<"fcsel">;
+
+// CSEL instructions providing f128 types need to be handled by a
+// pseudo-instruction since the eventual code will need to introduce basic
+// blocks and control flow.
+def F128CSEL : Pseudo<(outs FPR128:$Rd),
+ (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond),
+ [(set (f128 FPR128:$Rd),
+ (ARM64csel FPR128:$Rn, FPR128:$Rm,
+ (i32 imm:$cond), CPSR))]> {
+ let Uses = [CPSR];
+ let usesCustomInserter = 1;
+}
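+// (Roughly speaking, the custom inserter expands F128CSEL into a small
+// if/then diamond: a conditional branch that selects one of the two f128
+// inputs, since there is no register form of CSEL for f128 values.)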
+
+
+//===----------------------------------------------------------------------===//
+// Floating point immediate move.
+//===----------------------------------------------------------------------===//
+
+let isReMaterializable = 1 in {
+defm FMOV : FPMoveImmediate<"fmov">;
+}
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD two vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_arm64_neon_abs>;
+defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_arm64_neon_cls>;
+defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
+defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", ARM64cmeqz>;
+defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", ARM64cmgez>;
+defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", ARM64cmgtz>;
+defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", ARM64cmlez>;
+defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", ARM64cmltz>;
+defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;
+defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
+
+defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", ARM64fcmeqz>;
+defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", ARM64fcmgez>;
+defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", ARM64fcmgtz>;
+defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", ARM64fcmlez>;
+defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", ARM64fcmltz>;
+defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_arm64_neon_fcvtas>;
+defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_arm64_neon_fcvtau>;
+defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">;
+def : Pat<(v4f32 (int_arm64_neon_vcvthf2fp (v4i16 V64:$Rn))),
+ (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (int_arm64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
+ (i64 4)))),
+ (FCVTLv8i16 V128:$Rn)>;
+def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
+def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
+ (i64 2))))),
+ (FCVTLv4i32 V128:$Rn)>;
+
+defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_arm64_neon_fcvtms>;
+defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_arm64_neon_fcvtmu>;
+defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_arm64_neon_fcvtns>;
+defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_arm64_neon_fcvtnu>;
+defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">;
+def : Pat<(v4i16 (int_arm64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
+ (FCVTNv4i16 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd,
+ (v4i16 (int_arm64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
+ (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))),
+ (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_arm64_neon_fcvtps>;
+defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_arm64_neon_fcvtpu>;
+defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
+ int_arm64_neon_fcvtxn>;
+defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
+defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
+let isCodeGenOnly = 1 in {
+defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs",
+ int_arm64_neon_fcvtzs>;
+defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu",
+ int_arm64_neon_fcvtzu>;
+}
+defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
+defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_arm64_neon_frecpe>;
+defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>;
+defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
+defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
+defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_arm64_neon_frintn>;
+defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
+defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
+defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;
+defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_arm64_neon_frsqrte>;
+defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>;
+defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg",
+ UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
+defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
+// Aliases for MVN -> NOT.
+def : InstAlias<"mvn.8b $Vd, $Vn", (NOTv8i8 V64:$Vd, V64:$Vn)>;
+def : InstAlias<"mvn.16b $Vd, $Vn", (NOTv16i8 V128:$Vd, V128:$Vn)>;
+def : InstAlias<"mvn $Vd.8b, $Vn.8b", (NOTv8i8 V64:$Vd, V64:$Vn)>;
+def : InstAlias<"mvn $Vd.16b, $Vn.16b", (NOTv16i8 V128:$Vd, V128:$Vn)>;
+
+def : Pat<(ARM64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>;
+def : Pat<(ARM64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>;
+def : Pat<(ARM64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>;
+def : Pat<(ARM64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>;
+def : Pat<(ARM64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>;
+def : Pat<(ARM64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
+def : Pat<(ARM64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;
+
+def : Pat<(ARM64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(ARM64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(ARM64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(ARM64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(ARM64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(ARM64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(ARM64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+
+def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
+def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+
+defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_arm64_neon_rbit>;
+defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", ARM64rev16>;
+defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", ARM64rev32>;
+defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", ARM64rev64>;
+defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
+ BinOpFrag<(add node:$LHS, (int_arm64_neon_saddlp node:$RHS))> >;
+defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_arm64_neon_saddlp>;
+defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
+defm SHLL : SIMDVectorLShiftLongBySizeBHS;
+defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_arm64_neon_sqabs>;
+defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_arm64_neon_sqneg>;
+defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_arm64_neon_sqxtn>;
+defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_arm64_neon_sqxtun>;
+defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_arm64_neon_suqadd>;
+defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
+ BinOpFrag<(add node:$LHS, (int_arm64_neon_uaddlp node:$RHS))> >;
+defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",
+ int_arm64_neon_uaddlp>;
+defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
+defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_arm64_neon_uqxtn>;
+defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_arm64_neon_urecpe>;
+defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_arm64_neon_ursqrte>;
+defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_arm64_neon_usqadd>;
+defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
+
+def : Pat<(v2f32 (ARM64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
+def : Pat<(v4f32 (ARM64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
+
+// Patterns for vector long shift (by element width). These need to match all
+// three of zext, sext and anyext so it's easier to pull the patterns out of the
+// definition.
+multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
+ def : Pat<(ARM64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
+ (SHLLv8i8 V64:$Rn)>;
+ def : Pat<(ARM64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
+ (SHLLv16i8 V128:$Rn)>;
+ def : Pat<(ARM64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
+ (SHLLv4i16 V64:$Rn)>;
+ def : Pat<(ARM64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
+ (SHLLv8i16 V128:$Rn)>;
+ def : Pat<(ARM64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
+ (SHLLv2i32 V64:$Rn)>;
+ def : Pat<(ARM64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
+ (SHLLv4i32 V128:$Rn)>;
+}
+
+defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
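+// For example, (v8i16 (zext (v8i8 V64:$Rn))) shifted left by 8 becomes a
+// single "shll v0.8h, v1.8b, #8", which widens and shifts in one instruction.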
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
+defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_arm64_neon_addp>;
+defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", ARM64cmeq>;
+defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", ARM64cmge>;
+defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", ARM64cmgt>;
+defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", ARM64cmhi>;
+defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", ARM64cmhs>;
+defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", ARM64cmtst>;
+defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_arm64_neon_fabd>;
+defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_arm64_neon_facge>;
+defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_arm64_neon_facgt>;
+defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_arm64_neon_addp>;
+defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>;
+defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", ARM64fcmeq>;
+defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", ARM64fcmge>;
+defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", ARM64fcmgt>;
+defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>;
+defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_arm64_neon_fmaxnmp>;
+defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_arm64_neon_fmaxnm>;
+defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_arm64_neon_fmaxp>;
+defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", ARM64fmax>;
+defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_arm64_neon_fminnmp>;
+defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_arm64_neon_fminnm>;
+defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_arm64_neon_fminp>;
+defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", ARM64fmin>;
+
+// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
+// instruction expects the addend first, while the fma intrinsic puts it last.
+defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla",
+ TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls",
+ TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
+def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)),
+ (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
+ (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
+ (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_arm64_neon_fmulx>;
+defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>;
+defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_arm64_neon_frecps>;
+defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_arm64_neon_frsqrts>;
+defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>;
+defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
+ TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
+ TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
+defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_arm64_neon_pmul>;
+defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
+ TriOpFrag<(add node:$LHS, (int_arm64_neon_sabd node:$MHS, node:$RHS))> >;
+defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_arm64_neon_sabd>;
+defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_arm64_neon_shadd>;
+defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_arm64_neon_shsub>;
+defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_arm64_neon_smaxp>;
+defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_arm64_neon_smax>;
+defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_arm64_neon_sminp>;
+defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_arm64_neon_smin>;
+defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_arm64_neon_sqadd>;
+defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_arm64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_arm64_neon_sqrdmulh>;
+defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_arm64_neon_sqrshl>;
+defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_arm64_neon_sqshl>;
+defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_arm64_neon_sqsub>;
+defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_arm64_neon_srhadd>;
+defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_arm64_neon_srshl>;
+defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_arm64_neon_sshl>;
+defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
+defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
+ TriOpFrag<(add node:$LHS, (int_arm64_neon_uabd node:$MHS, node:$RHS))> >;
+defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_arm64_neon_uabd>;
+defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_arm64_neon_uhadd>;
+defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_arm64_neon_uhsub>;
+defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_arm64_neon_umaxp>;
+defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_arm64_neon_umax>;
+defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_arm64_neon_uminp>;
+defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_arm64_neon_umin>;
+defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_arm64_neon_uqadd>;
+defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_arm64_neon_uqrshl>;
+defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_arm64_neon_uqshl>;
+defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_arm64_neon_uqsub>;
+defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_arm64_neon_urhadd>;
+defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_arm64_neon_urshl>;
+defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_arm64_neon_ushl>;
+
+defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
+defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
+ BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
+defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
+defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", ARM64bit>;
+defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
+ TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
+defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
+defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
+ BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
+defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
+
+// FIXME: the .16b and .8b variants should be emitted by the
+// AsmWriter. TableGen's AsmWriter generator doesn't deal with variant syntaxes
+// in aliases yet though.
+def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
+ (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"{mov\t$dst.8h, $src.8h|mov.8h\t$dst, $src}",
+ (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"{mov\t$dst.4s, $src.4s|mov.4s\t$dst, $src}",
+ (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"{mov\t$dst.2d, $src.2d|mov.2d\t$dst, $src}",
+ (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+
+def : InstAlias<"{mov\t$dst.8b, $src.8b|mov.8b\t$dst, $src}",
+ (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"{mov\t$dst.4h, $src.4h|mov.4h\t$dst, $src}",
+ (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"{mov\t$dst.2s, $src.2s|mov.2s\t$dst, $src}",
+ (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"{mov\t$dst.1d, $src.1d|mov.1d\t$dst, $src}",
+ (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+
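+// cmls, cmlo, cmle and cmlt with two register sources have no encodings of
+// their own here; each alias below is parsed into the opposite comparison
+// (cmhs, cmhi, cmge, cmgt) with its source operands swapped. The trailing 0
+// keeps the AsmWriter from picking these aliases when printing.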
+def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" #
+ "|cmls.8b\t$dst, $src1, $src2}",
+ (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" #
+ "|cmls.16b\t$dst, $src1, $src2}",
+ (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" #
+ "|cmls.4h\t$dst, $src1, $src2}",
+ (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" #
+ "|cmls.8h\t$dst, $src1, $src2}",
+ (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" #
+ "|cmls.2s\t$dst, $src1, $src2}",
+ (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" #
+ "|cmls.4s\t$dst, $src1, $src2}",
+ (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" #
+ "|cmls.2d\t$dst, $src1, $src2}",
+ (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" #
+ "|cmlo.8b\t$dst, $src1, $src2}",
+ (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" #
+ "|cmlo.16b\t$dst, $src1, $src2}",
+ (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" #
+ "|cmlo.4h\t$dst, $src1, $src2}",
+ (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" #
+ "|cmlo.8h\t$dst, $src1, $src2}",
+ (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" #
+ "|cmlo.2s\t$dst, $src1, $src2}",
+ (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" #
+ "|cmlo.4s\t$dst, $src1, $src2}",
+ (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" #
+ "|cmlo.2d\t$dst, $src1, $src2}",
+ (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" #
+ "|cmle.8b\t$dst, $src1, $src2}",
+ (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" #
+ "|cmle.16b\t$dst, $src1, $src2}",
+ (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" #
+ "|cmle.4h\t$dst, $src1, $src2}",
+ (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" #
+ "|cmle.8h\t$dst, $src1, $src2}",
+ (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" #
+ "|cmle.2s\t$dst, $src1, $src2}",
+ (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" #
+ "|cmle.4s\t$dst, $src1, $src2}",
+ (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" #
+ "|cmle.2d\t$dst, $src1, $src2}",
+ (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" #
+ "|cmlt.8b\t$dst, $src1, $src2}",
+ (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" #
+ "|cmlt.16b\t$dst, $src1, $src2}",
+ (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" #
+ "|cmlt.4h\t$dst, $src1, $src2}",
+ (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" #
+ "|cmlt.8h\t$dst, $src1, $src2}",
+ (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" #
+ "|cmlt.2s\t$dst, $src1, $src2}",
+ (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" #
+ "|cmlt.4s\t$dst, $src1, $src2}",
+ (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
+ "|cmlt.2d\t$dst, $src1, $src2}",
+ (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
+ "|fcmle.2s\t$dst, $src1, $src2}",
+ (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" #
+ "|fcmle.4s\t$dst, $src1, $src2}",
+ (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
+ "|fcmle.2d\t$dst, $src1, $src2}",
+ (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
+ "|fcmlt.2s\t$dst, $src1, $src2}",
+ (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" #
+ "|fcmlt.4s\t$dst, $src1, $src2}",
+ (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
+ "|fcmlt.2d\t$dst, $src1, $src2}",
+ (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
+ "|facle.2s\t$dst, $src1, $src2}",
+ (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" #
+ "|facle.4s\t$dst, $src1, $src2}",
+ (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
+ "|facle.2d\t$dst, $src1, $src2}",
+ (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
+ "|faclt.2s\t$dst, $src1, $src2}",
+ (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" #
+ "|faclt.4s\t$dst, $src1, $src2}",
+ (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" #
+ "|faclt.2d\t$dst, $src1, $src2}",
+ (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three scalar instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>;
+defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", ARM64cmeq>;
+defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", ARM64cmge>;
+defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", ARM64cmgt>;
+defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", ARM64cmhi>;
+defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", ARM64cmhs>;
+defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", ARM64cmtst>;
+defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_arm64_sisd_fabd>;
+def : Pat<(v1f64 (int_arm64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (FABD64 FPR64:$Rn, FPR64:$Rm)>;
+defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge",
+ int_arm64_neon_facge>;
+defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt",
+ int_arm64_neon_facgt>;
+defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", ARM64fcmeq>;
+defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", ARM64fcmge>;
+defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", ARM64fcmgt>;
+defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_arm64_neon_fmulx>;
+defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_arm64_neon_frecps>;
+defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_arm64_neon_frsqrts>;
+defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_arm64_neon_sqadd>;
+defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_arm64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_arm64_neon_sqrdmulh>;
+defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_arm64_neon_sqrshl>;
+defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_arm64_neon_sqshl>;
+defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_arm64_neon_sqsub>;
+defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_arm64_neon_srshl>;
+defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_arm64_neon_sshl>;
+defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>;
+defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_arm64_neon_uqadd>;
+defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_arm64_neon_uqrshl>;
+defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_arm64_neon_uqshl>;
+defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_arm64_neon_uqsub>;
+defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_arm64_neon_urshl>;
+defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_arm64_neon_ushl>;
+
+def : InstAlias<"cmls $dst, $src1, $src2",
+ (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
+def : InstAlias<"cmle $dst, $src1, $src2",
+ (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
+def : InstAlias<"cmlo $dst, $src1, $src2",
+ (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
+def : InstAlias<"cmlt $dst, $src1, $src2",
+ (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
+def : InstAlias<"fcmle $dst, $src1, $src2",
+ (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1)>;
+def : InstAlias<"fcmle $dst, $src1, $src2",
+ (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
+def : InstAlias<"fcmlt $dst, $src1, $src2",
+ (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1)>;
+def : InstAlias<"fcmlt $dst, $src1, $src2",
+ (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
+def : InstAlias<"facle $dst, $src1, $src2",
+ (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1)>;
+def : InstAlias<"facle $dst, $src1, $src2",
+ (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
+def : InstAlias<"faclt $dst, $src1, $src2",
+ (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1)>;
+def : InstAlias<"faclt $dst, $src1, $src2",
+ (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three scalar instructions (mixed operands).
+//===----------------------------------------------------------------------===//
+defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
+ int_arm64_neon_sqdmulls_scalar>;
+defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
+defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
+
+def : Pat<(i64 (int_arm64_neon_sqadd (i64 FPR64:$Rd),
+ (i64 (int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+ (i32 FPR32:$Rm))))),
+ (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(i64 (int_arm64_neon_sqsub (i64 FPR64:$Rd),
+ (i64 (int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+ (i32 FPR32:$Rm))))),
+ (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD two scalar instructions.
+//===----------------------------------------------------------------------===//
+
+defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_arm64_neon_abs>;
+defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", ARM64cmeqz>;
+defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", ARM64cmgez>;
+defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", ARM64cmgtz>;
+defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", ARM64cmlez>;
+defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", ARM64cmltz>;
+defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", ARM64fcmeqz>;
+defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", ARM64fcmgez>;
+defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", ARM64fcmgtz>;
+defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", ARM64fcmlez>;
+defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", ARM64fcmltz>;
+defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">;
+defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">;
+defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">;
+defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">;
+defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">;
+defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">;
+defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">;
+defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">;
+def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
+defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">;
+defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">;
+defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">;
+defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">;
+defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">;
+defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
+ UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
+defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", ARM64sitof>;
+defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_arm64_neon_sqabs>;
+defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_arm64_neon_sqneg>;
+defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_arm64_neon_scalar_sqxtn>;
+defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_arm64_neon_scalar_sqxtun>;
+defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
+ int_arm64_neon_suqadd>;
+defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", ARM64uitof>;
+defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_arm64_neon_scalar_uqxtn>;
+defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
+ int_arm64_neon_usqadd>;
+
+def : Pat<(v1i64 (int_arm64_neon_fcvtas (v1f64 FPR64:$Rn))),
+ (FCVTASv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_arm64_neon_fcvtau (v1f64 FPR64:$Rn))),
+ (FCVTAUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_arm64_neon_fcvtms (v1f64 FPR64:$Rn))),
+ (FCVTMSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_arm64_neon_fcvtmu (v1f64 FPR64:$Rn))),
+ (FCVTMUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_arm64_neon_fcvtns (v1f64 FPR64:$Rn))),
+ (FCVTNSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_arm64_neon_fcvtnu (v1f64 FPR64:$Rn))),
+ (FCVTNUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_arm64_neon_fcvtps (v1f64 FPR64:$Rn))),
+ (FCVTPSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_arm64_neon_fcvtpu (v1f64 FPR64:$Rn))),
+ (FCVTPUv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_arm64_neon_frecpe (f32 FPR32:$Rn))),
+ (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_arm64_neon_frecpe (f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (int_arm64_neon_frecpe (v1f64 FPR64:$Rn))),
+ (FRECPEv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_arm64_neon_frecpx (f32 FPR32:$Rn))),
+ (FRECPXv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_arm64_neon_frecpx (f64 FPR64:$Rn))),
+ (FRECPXv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_arm64_neon_frsqrte (f32 FPR32:$Rn))),
+ (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_arm64_neon_frsqrte (f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (int_arm64_neon_frsqrte (v1f64 FPR64:$Rn))),
+ (FRSQRTEv1i64 FPR64:$Rn)>;
+
+// If an integer is about to be converted to a floating-point value,
+// just load it on the floating-point unit directly.
+// Here are the patterns for 8-bit and 16-bit integers to float.
+// 8-bits -> float.
+def : Pat <(f32 (uint_to_fp (i32 (zextloadi8 ro_indexed8:$addr)))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDRBro ro_indexed8:$addr), bsub))>;
+def : Pat <(f32 (uint_to_fp (i32 (zextloadi8 am_indexed8:$addr)))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDRBui am_indexed8:$addr), bsub))>;
+def : Pat <(f32 (uint_to_fp (i32 (zextloadi8 am_unscaled8:$addr)))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDURBi am_unscaled8:$addr), bsub))>;
+// 16-bits -> float.
+def : Pat <(f32 (uint_to_fp (i32 (zextloadi16 ro_indexed16:$addr)))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDRHro ro_indexed16:$addr), hsub))>;
+def : Pat <(f32 (uint_to_fp (i32 (zextloadi16 am_indexed16:$addr)))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDRHui am_indexed16:$addr), hsub))>;
+def : Pat <(f32 (uint_to_fp (i32 (zextloadi16 am_unscaled16:$addr)))),
+ (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+ (LDURHi am_unscaled16:$addr), hsub))>;
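+// As a rough illustration (hypothetical C source, typical codegen): for
+//   float f(unsigned char *p) { return *p; }
+// the byte is loaded straight into a SIMD/FP register and converted there,
+// e.g. "ldr b0, [x0]" followed by "ucvtf s0, s0", avoiding a GPR-to-FPR copy.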
+// 32-bit integers are handled in the target-specific DAG combine
+// performIntToFpCombine.
+// Converting a 64-bit integer to a 32-bit float is not possible with
+// UCVTF on floating-point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8-, 16-, 32-, and 64-bit integers to double.
+// 8-bits -> double.
+def : Pat <(f64 (uint_to_fp (i32 (zextloadi8 ro_indexed8:$addr)))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRBro ro_indexed8:$addr), bsub))>;
+def : Pat <(f64 (uint_to_fp (i32 (zextloadi8 am_indexed8:$addr)))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRBui am_indexed8:$addr), bsub))>;
+def : Pat <(f64 (uint_to_fp (i32 (zextloadi8 am_unscaled8:$addr)))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURBi am_unscaled8:$addr), bsub))>;
+// 16-bits -> double.
+def : Pat <(f64 (uint_to_fp (i32 (zextloadi16 ro_indexed16:$addr)))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRHro ro_indexed16:$addr), hsub))>;
+def : Pat <(f64 (uint_to_fp (i32 (zextloadi16 am_indexed16:$addr)))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRHui am_indexed16:$addr), hsub))>;
+def : Pat <(f64 (uint_to_fp (i32 (zextloadi16 am_unscaled16:$addr)))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURHi am_unscaled16:$addr), hsub))>;
+// 32-bits -> double.
+def : Pat <(f64 (uint_to_fp (i32 (load ro_indexed32:$addr)))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRSro ro_indexed32:$addr), ssub))>;
+def : Pat <(f64 (uint_to_fp (i32 (load am_indexed32:$addr)))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRSui am_indexed32:$addr), ssub))>;
+def : Pat <(f64 (uint_to_fp (i32 (load am_unscaled32:$addr)))),
+ (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURSi am_unscaled32:$addr), ssub))>;
+// 64-bit -> double conversions are handled in the target-specific dag combine:
+// performIntToFpCombine.
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three different-sized vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_arm64_neon_addhn>;
+defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_arm64_neon_subhn>;
+defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_arm64_neon_raddhn>;
+defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_arm64_neon_rsubhn>;
+defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_arm64_neon_pmull>;
+defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
+ int_arm64_neon_sabd>;
+defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
+ int_arm64_neon_sabd>;
+defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
+ BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
+defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
+ BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
+defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
+ TriOpFrag<(add node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
+ TriOpFrag<(sub node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_arm64_neon_smull>;
+defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
+ int_arm64_neon_sqadd>;
+defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
+ int_arm64_neon_sqsub>;
+defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
+ int_arm64_neon_sqdmull>;
+defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
+ BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
+defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
+ BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
+defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
+ int_arm64_neon_uabd>;
+defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
+ int_arm64_neon_uabd>;
+defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
+ BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
+defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
+ BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
+defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
+ TriOpFrag<(add node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
+ TriOpFrag<(sub node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_arm64_neon_umull>;
+defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
+ BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
+defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
+ BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
+
+// Patterns for 64-bit pmull
+def : Pat<(int_arm64_neon_pmull64 V64:$Rn, V64:$Rm),
+ (PMULLv1i64 V64:$Rn, V64:$Rm)>;
+def : Pat<(int_arm64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)),
+ (vector_extract (v2i64 V128:$Rm), (i64 1))),
+ (PMULLv2i64 V128:$Rn, V128:$Rm)>;
+
+// CodeGen patterns for addhn and subhn instructions, which can actually be
+// written in LLVM IR without too much difficulty.
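+// For reference, a minimal IR sketch (value names hypothetical) of the shape
+// the first ADDHN pattern below matches:
+//   %sum = add <8 x i16> %a, %b
+//   %hi  = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8,
+//                                i16 8, i16 8, i16 8, i16 8>
+//   %res = trunc <8 x i16> %hi to <8 x i8>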
+
+// ADDHN
+def : Pat<(v8i8 (trunc (v8i16 (ARM64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
+ (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i16 (trunc (v4i32 (ARM64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 16))))),
+ (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v2i32 (trunc (v2i64 (ARM64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 32))))),
+ (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v8i8 V64:$Rd),
+ (trunc (v8i16 (ARM64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 8))))),
+ (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v4i16 V64:$Rd),
+ (trunc (v4i32 (ARM64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 16))))),
+ (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v2i32 V64:$Rd),
+ (trunc (v2i64 (ARM64vlshr (add V128:$Rn, V128:$Rm),
+ (i32 32))))),
+ (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+
+// SUBHN
+def : Pat<(v8i8 (trunc (v8i16 (ARM64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))),
+ (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i16 (trunc (v4i32 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 16))))),
+ (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v2i32 (trunc (v2i64 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 32))))),
+ (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v8i8 V64:$Rd),
+ (trunc (v8i16 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 8))))),
+ (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v4i16 V64:$Rd),
+ (trunc (v4i32 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 16))))),
+ (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v2i32 V64:$Rd),
+ (trunc (v2i64 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
+ (i32 32))))),
+ (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+ V128:$Rn, V128:$Rm)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD bitwise extract from vector instruction.
+//----------------------------------------------------------------------------
+
+defm EXT : SIMDBitwiseExtract<"ext">;
+
+def : Pat<(v4i16 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+ (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v8i16 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2i32 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+ (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v2f32 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+ (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v4i32 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v4f32 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2i64 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2f64 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+
+// We use EXT to handle extract_subvector to copy the upper 64-bits of a
+// 128-bit vector.
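+// Roughly (register names illustrative), extracting the high half becomes
+//   ext v0.16b, v0.16b, v0.16b, #8   // rotate the top 8 bytes down
+// followed by reading the D-register (dsub) view of the result.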
+def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD zip vector
+//----------------------------------------------------------------------------
+
+defm TRN1 : SIMDZipVector<0b010, "trn1", ARM64trn1>;
+defm TRN2 : SIMDZipVector<0b110, "trn2", ARM64trn2>;
+defm UZP1 : SIMDZipVector<0b001, "uzp1", ARM64uzp1>;
+defm UZP2 : SIMDZipVector<0b101, "uzp2", ARM64uzp2>;
+defm ZIP1 : SIMDZipVector<0b011, "zip1", ARM64zip1>;
+defm ZIP2 : SIMDZipVector<0b111, "zip2", ARM64zip2>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX instructions
+//----------------------------------------------------------------------------
+
+defm TBL : SIMDTableLookup< 0, "tbl">;
+defm TBX : SIMDTableLookupTied<1, "tbx">;
+
+def : Pat<(v8i8 (int_arm64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
+ (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>;
+def : Pat<(v16i8 (int_arm64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
+ (TBLv16i8One V128:$Ri, V128:$Rn)>;
+
+def : Pat<(v8i8 (int_arm64_neon_tbx1 (v8i8 V64:$Rd),
+ (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
+ (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>;
+def : Pat<(v16i8 (int_arm64_neon_tbx1 (v16i8 V128:$Rd),
+ (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
+ (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>;
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY instruction
+//----------------------------------------------------------------------------
+
+defm CPY : SIMDScalarCPY<"cpy">;
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar pairwise instructions
+//----------------------------------------------------------------------------
+
+defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">;
+defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">;
+defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">;
+defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">;
+defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">;
+defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">;
+def : Pat<(i64 (int_arm64_neon_saddv (v2i64 V128:$Rn))),
+ (ADDPv2i64p V128:$Rn)>;
+def : Pat<(i64 (int_arm64_neon_uaddv (v2i64 V128:$Rn))),
+ (ADDPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_arm64_neon_faddv (v2f32 V64:$Rn))),
+ (FADDPv2i32p V64:$Rn)>;
+def : Pat<(f32 (int_arm64_neon_faddv (v4f32 V128:$Rn))),
+ (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>;
+def : Pat<(f64 (int_arm64_neon_faddv (v2f64 V128:$Rn))),
+ (FADDPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_arm64_neon_fmaxnmv (v2f32 V64:$Rn))),
+ (FMAXNMPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_arm64_neon_fmaxnmv (v2f64 V128:$Rn))),
+ (FMAXNMPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_arm64_neon_fmaxv (v2f32 V64:$Rn))),
+ (FMAXPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_arm64_neon_fmaxv (v2f64 V128:$Rn))),
+ (FMAXPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_arm64_neon_fminnmv (v2f32 V64:$Rn))),
+ (FMINNMPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_arm64_neon_fminnmv (v2f64 V128:$Rn))),
+ (FMINNMPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_arm64_neon_fminv (v2f32 V64:$Rn))),
+ (FMINPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_arm64_neon_fminv (v2f64 V128:$Rn))),
+ (FMINPv2i64p V128:$Rn)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD INS/DUP instructions
+//----------------------------------------------------------------------------
+
+def DUPv8i8gpr : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>;
+def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>;
+def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>;
+def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>;
+def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>;
+def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>;
+def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>;
+
+def DUPv2i64lane : SIMDDup64FromElement;
+def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>;
+def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>;
+def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>;
+def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>;
+def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
+def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
+
+def : Pat<(v2f32 (ARM64dup (f32 FPR32:$Rn))),
+ (v2f32 (DUPv2i32lane
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
+ (i64 0)))>;
+def : Pat<(v4f32 (ARM64dup (f32 FPR32:$Rn))),
+ (v4f32 (DUPv4i32lane
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
+ (i64 0)))>;
+def : Pat<(v2f64 (ARM64dup (f64 FPR64:$Rn))),
+ (v2f64 (DUPv2i64lane
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
+ (i64 0)))>;
+
+def : Pat<(v2f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
+ (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
+def : Pat<(v4f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
+ (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>;
+def : Pat<(v2f64 (ARM64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
+ (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
+
+defm SMOV : SMov;
+defm UMOV : UMov;
+
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+ (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+ (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+ (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+ (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+ (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
+ (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
+
+// Extracting i8 or i16 elements will have the zero-extend transformed to
+// an 'and' mask by type legalization since neither i8 nor i16 are legal types
+// for ARM64. Match these patterns here since UMOV already zeroes out the high
+// bits of the destination register.
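+// Illustrative sketch (names hypothetical): IR along the lines of
+//   %e = extractelement <16 x i8> %v, i32 3
+//   %z = zext i8 %e to i32
+// is legalized into an i32 extract masked with 0xff, which the patterns below
+// select to a single
+//   umov w0, v0.b[3]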
+def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx),
+ (i32 0xff)),
+ (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
+ (i32 0xffff)),
+ (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>;
+
+defm INS : SIMDIns;
+
+def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
+ (INSvi8gpr (v16i8 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn)>;
+def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
+ (EXTRACT_SUBREG
+ (INSvi8gpr (v16i8 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn), dsub)>;
+
+def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
+ (INSvi16gpr (v8i16 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn)>;
+def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
+ (EXTRACT_SUBREG
+ (INSvi16gpr (v8i16 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn), dsub)>;
+
+def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
+ (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+ (i32 FPR32:$Rn), ssub))>;
+def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
+ (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+ (i32 FPR32:$Rn), ssub))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
+ (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+ (i64 FPR64:$Rn), dsub))>;
+
+def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
+
+def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
+ (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
+ (EXTRACT_SUBREG
+ (INSvi32lane
+ (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+ VectorIndexS:$imm,
+ (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
+ (i64 0)),
+ dsub)>;
+def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn),
+ (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
+ (INSvi32lane
+ V128:$Rn, VectorIndexS:$imm,
+ (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
+ (i64 0))>;
+def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn),
+ (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))),
+ (INSvi64lane
+ V128:$Rn, VectorIndexD:$imm,
+ (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
+ (i64 0))>;
+
+// Copy an element at a constant index in one vector into a constant indexed
+// element of another.
+// FIXME: refactor to a shared class/def parameterized on vector type, vector
+// index type, and INS extension.
+def : Pat<(v16i8 (int_arm64_neon_vcopy_lane
+ (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs),
+ VectorIndexB:$idx2)),
+ (v16i8 (INSvi8lane
+ V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2)
+ )>;
+def : Pat<(v8i16 (int_arm64_neon_vcopy_lane
+ (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs),
+ VectorIndexH:$idx2)),
+ (v8i16 (INSvi16lane
+ V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2)
+ )>;
+def : Pat<(v4i32 (int_arm64_neon_vcopy_lane
+ (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs),
+ VectorIndexS:$idx2)),
+ (v4i32 (INSvi32lane
+ V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2)
+ )>;
+def : Pat<(v2i64 (int_arm64_neon_vcopy_lane
+ (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs),
+ VectorIndexD:$idx2)),
+ (v2i64 (INSvi64lane
+ V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
+ )>;
+
+// Floating point vector extractions are codegen'd as a sequence of
+// subregister extractions, possibly fed by an INS if the lane number is
+// anything other than zero.
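+// For example (a sketch, registers arbitrary): lane 0 of a v4f32 is simply
+// the ssub subregister of the vector, while lane 1 is first moved into lane 0
+// with something like
+//   ins v1.s[0], v0.s[1]
+// and then read out through the same subregister path.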
+def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
+ (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
+ (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
+def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
+ (f64 (EXTRACT_SUBREG
+ (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0,
+ V128:$Rn, VectorIndexD:$idx),
+ dsub))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
+ (f32 (EXTRACT_SUBREG
+ (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0,
+ V128:$Rn, VectorIndexS:$idx),
+ ssub))>;
+
+// All concat_vectors operations are canonicalised to act on i64 vectors for
+// ARM64. In the general case we need an instruction, which may just as well
+// be INS.
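+// As a sketch of the ConcatPat expansion below (registers arbitrary), the low
+// 64 bits already hold the first operand via a subregister insert, and the
+// second operand is placed with roughly
+//   ins v0.d[1], v1.d[0]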
+class ConcatPat<ValueType DstTy, ValueType SrcTy>
+ : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
+ (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
+
+def : ConcatPat<v2i64, v1i64>;
+def : ConcatPat<v2f64, v1f64>;
+def : ConcatPat<v4i32, v2i32>;
+def : ConcatPat<v4f32, v2f32>;
+def : ConcatPat<v8i16, v4i16>;
+def : ConcatPat<v16i8, v8i8>;
+
+// If the high lanes are undef, though, we can just ignore them:
+class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
+ : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;
+
+def : ConcatUndefPat<v2i64, v1i64>;
+def : ConcatUndefPat<v2f64, v1f64>;
+def : ConcatUndefPat<v4i32, v2i32>;
+def : ConcatUndefPat<v4f32, v2f32>;
+def : ConcatUndefPat<v8i16, v4i16>;
+def : ConcatUndefPat<v16i8, v8i8>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD across lanes instructions
+//----------------------------------------------------------------------------
+
+defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
+defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
+defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
+defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
+defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
+defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
+defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
+defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_arm64_neon_fmaxnmv>;
+defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_arm64_neon_fmaxv>;
+defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_arm64_neon_fminnmv>;
+defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_arm64_neon_fminv>;
+
+multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc, Intrinsic intOp> {
+// If there is a sign extension after this intrinsic, consume it, as smov
+// already performed it.
+ def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)),
+ (i32 (SMOVvi8to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+ (i64 0)))>;
+ def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+ (i32 (SMOVvi8to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+ (i64 0)))>;
+// If there is a sign extension after this intrinsic, consume it, as smov
+// already performed it.
+def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)),
+ (i32 (SMOVvi8to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+ (i64 0)))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+ (i32 (SMOVvi8to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+ (i64 0)))>;
+// If there is a sign extension after this intrinsic, consume it, as smov
+// already performed it.
+def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+ (i64 0)))>;
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+ (i64 0)))>;
+// If there is a sign extension after this intrinsic, consume it, as smov
+// already performed it.
+def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+ (i64 0)))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+ (i64 0)))>;
+
+def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
+ ssub))>;
+}
+
+multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc, Intrinsic intOp> {
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+ def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+ ssub))>;
+ def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+ ssub))>;
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+ ssub))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+ ssub))>;
+
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+ ssub))>;
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+ ssub))>;
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+ ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+ ssub))>;
+
+def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
+ ssub))>;
+
+}
+
+multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
+ def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
+ (i64 0)))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+ (i32 (SMOVvi16to32
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
+ (i64 0)))>;
+
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
+ ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
+ ssub))>;
+
+def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
+ (i64 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
+ dsub))>;
+}
+
+multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
+ Intrinsic intOp> {
+ def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
+ ssub))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
+ ssub))>;
+
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
+ ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+ (i32 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
+ ssub))>;
+
+def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
+ (i64 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
+ dsub))>;
+}
+
+defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", int_arm64_neon_saddv>;
+// vaddv_[su]32 is special: it lowers to ADDP Vd.2S, Vn.2S, Vm.2S with
+// Vn == Vm, and the result is Vd.s[0].
+def : Pat<(i32 (int_arm64_neon_saddv (v2i32 V64:$Rn))),
+ (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", int_arm64_neon_uaddv>;
+// vaddv_[su]32 is special: it lowers to ADDP Vd.2S, Vn.2S, Vm.2S with
+// Vn == Vm, and the result is Vd.s[0].
+def : Pat<(i32 (int_arm64_neon_uaddv (v2i32 V64:$Rn))),
+ (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_arm64_neon_smaxv>;
+def : Pat<(i32 (int_arm64_neon_smaxv (v2i32 V64:$Rn))),
+ (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_arm64_neon_sminv>;
+def : Pat<(i32 (int_arm64_neon_sminv (v2i32 V64:$Rn))),
+ (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_arm64_neon_umaxv>;
+def : Pat<(i32 (int_arm64_neon_umaxv (v2i32 V64:$Rn))),
+ (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_arm64_neon_uminv>;
+def : Pat<(i32 (int_arm64_neon_uminv (v2i32 V64:$Rn))),
+ (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_arm64_neon_saddlv>;
+defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_arm64_neon_uaddlv>;
+
+// The vaddlv_s32 intrinsic gets mapped to SADDLP.
+def : Pat<(i64 (int_arm64_neon_saddlv (v2i32 V64:$Rn))),
+ (i64 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (SADDLPv2i32_v1i64 V64:$Rn), dsub),
+ dsub))>;
+// The vaddlv_u32 intrinsic gets mapped to UADDLP.
+def : Pat<(i64 (int_arm64_neon_uaddlv (v2i32 V64:$Rn))),
+ (i64 (EXTRACT_SUBREG
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+ (UADDLPv2i32_v1i64 V64:$Rn), dsub),
+ dsub))>;
+
+//------------------------------------------------------------------------------
+// AdvSIMD modified immediate instructions
+//------------------------------------------------------------------------------
+
+// AdvSIMD BIC
+defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", ARM64bici>;
+// AdvSIMD ORR
+defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", ARM64orri>;
+
+
+// AdvSIMD FMOV
+def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8,
+ "fmov", ".2d",
+ [(set (v2f64 V128:$Rd), (ARM64fmov imm0_255:$imm8))]>;
+def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8,
+ "fmov", ".2s",
+ [(set (v2f32 V64:$Rd), (ARM64fmov imm0_255:$imm8))]>;
+def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8,
+ "fmov", ".4s",
+ [(set (v4f32 V128:$Rd), (ARM64fmov imm0_255:$imm8))]>;
+
+// AdvSIMD MOVI
+
+// EDIT byte mask: scalar
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
+ [(set FPR64:$Rd, simdimmtype10:$imm8)]>;
+// The movi_edit node has the immediate value already encoded, so we use
+// a plain imm0_255 here.
+def : Pat<(f64 (ARM64movi_edit imm0_255:$shift)),
+ (MOVID imm0_255:$shift)>;
+
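+// In this byte-mask form each bit of the 8-bit immediate expands to a full
+// 0x00 or 0xff byte, e.g. imm8 = 0 gives 0x0000000000000000 and imm8 = 255
+// gives 0xffffffffffffffff, which is why the all-zeros and all-ones idioms
+// below use 0 and 255.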
+def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>;
+
+def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>;
+
+// EDIT byte mask: 2d
+
+// The movi_edit node has the immediate value already encoded, so we use
+// a plain imm0_255 in the pattern
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128,
+ simdimmtype10,
+ "movi", ".2d",
+ [(set (v2i64 V128:$Rd), (ARM64movi_edit imm0_255:$imm8))]>;
+
+
+// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing.
+// Complexity is added to break a tie with a plain MOVI.
+let AddedComplexity = 1 in {
+def : Pat<(f32 fpimm0),
+ (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>,
+ Requires<[HasZCZ]>;
+def : Pat<(f64 fpimm0),
+ (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>,
+ Requires<[HasZCZ]>;
+}
+
+def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+
+def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+
+// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
+def : Pat<(v2i32 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MOVIv2i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i32 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MOVIv4i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i16 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MOVIv4i16 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v8i16 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MOVIv8i16 imm0_255:$imm8, imm:$shift)>;
+
+// EDIT per word: 2s & 4s with MSL shifter
+def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
+ [(set (v2i32 V64:$Rd),
+ (ARM64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
+ [(set (v4i32 V128:$Rd),
+ (ARM64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+
+// Per byte: 8b & 16b
+def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255,
+ "movi", ".8b",
+ [(set (v8i8 V64:$Rd), (ARM64movi imm0_255:$imm8))]>;
+def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255,
+ "movi", ".16b",
+ [(set (v16i8 V128:$Rd), (ARM64movi imm0_255:$imm8))]>;
+
+// AdvSIMD MVNI
+
+// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
+def : Pat<(v2i32 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MVNIv2i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i32 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MVNIv4i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i16 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MVNIv4i16 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v8i16 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+ (MVNIv8i16 imm0_255:$imm8, imm:$shift)>;
+
+// EDIT per word: 2s & 4s with MSL shifter
+def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
+ [(set (v2i32 V64:$Rd),
+ (ARM64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
+ [(set (v4i32 V128:$Rd),
+ (ARM64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD indexed element
+//----------------------------------------------------------------------------
+
+let neverHasSideEffects = 1 in {
+ defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">;
+ defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">;
+}
+
+// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the
+// instruction expects the addend first, while the intrinsic expects it last.
+
+// On the other hand, there are quite a few valid combinatorial options due to
+// the commutativity of multiplication and the fact that (-x) * y = x * (-y).
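+// Worked illustration: fmls computes acc - x * y, and
+//   fma(-x, y, acc) = acc + (-x) * y = acc - x * y = fma(x, -y, acc),
+// so both negation placements (times the two operand orders from
+// commutativity) need their own TriOpFrag below.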
+defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
+ TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
+ TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
+
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+ TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+ TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+ TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+ TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
+
+multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
+ // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
+ // and DUP scalar.
+ def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+ (ARM64duplane32 (v4f32 (fneg V128:$Rm)),
+ VectorIndexS:$idx))),
+ (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+ (v2f32 (ARM64duplane32
+ (v4f32 (insert_subvector undef,
+ (v2f32 (fneg V64:$Rm)),
+ (i32 0))),
+ VectorIndexS:$idx)))),
+ (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
+ (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+ VectorIndexS:$idx)>;
+ def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+ (ARM64dup (f32 (fneg FPR32Op:$Rm))))),
+ (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+ // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit
+ // and DUP scalar.
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+ (ARM64duplane32 (v4f32 (fneg V128:$Rm)),
+ VectorIndexS:$idx))),
+ (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm,
+ VectorIndexS:$idx)>;
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+ (v4f32 (ARM64duplane32
+ (v4f32 (insert_subvector undef,
+ (v2f32 (fneg V64:$Rm)),
+ (i32 0))),
+ VectorIndexS:$idx)))),
+ (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+ VectorIndexS:$idx)>;
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+ (ARM64dup (f32 (fneg FPR32Op:$Rm))))),
+ (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+ // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar
+ // (DUPLANE from 64-bit would be trivial).
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+ (ARM64duplane64 (v2f64 (fneg V128:$Rm)),
+ VectorIndexD:$idx))),
+ (FMLSv2i64_indexed
+ V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+ (ARM64dup (f64 (fneg FPR64Op:$Rm))))),
+ (FMLSv2i64_indexed V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+ // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+ def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+ (vector_extract (v4f32 (fneg V128:$Rm)),
+ VectorIndexS:$idx))),
+ (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
+ V128:$Rm, VectorIndexS:$idx)>;
+ def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+ (vector_extract (v2f32 (fneg V64:$Rm)),
+ VectorIndexS:$idx))),
+ (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
+ (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+ // 1 variant for 64-bit scalar version: extract from .1d or from .2d
+ def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+ (vector_extract (v2f64 (fneg V128:$Rm)),
+ VectorIndexS:$idx))),
+ (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn,
+ V128:$Rm, VectorIndexS:$idx)>;
+}
+
+defm : FMLSIndexedAfterNegPatterns<
+ TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm : FMLSIndexedAfterNegPatterns<
+ TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
+
+defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_arm64_neon_fmulx>;
+defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>;
+
+def : Pat<(v2f32 (fmul V64:$Rn, (ARM64dup (f32 FPR32:$Rm)))),
+ (FMULv2i32_indexed V64:$Rn,
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
+ (i64 0))>;
+def : Pat<(v4f32 (fmul V128:$Rn, (ARM64dup (f32 FPR32:$Rm)))),
+ (FMULv4i32_indexed V128:$Rn,
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
+ (i64 0))>;
+def : Pat<(v2f64 (fmul V128:$Rn, (ARM64dup (f64 FPR64:$Rm)))),
+ (FMULv2i64_indexed V128:$Rn,
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
+ (i64 0))>;
+
+defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_arm64_neon_sqdmulh>;
+defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_arm64_neon_sqrdmulh>;
+defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
+ TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
+defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
+ TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;
+defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
+defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
+ TriOpFrag<(add node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
+ TriOpFrag<(sub node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
+ int_arm64_neon_smull>;
+defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
+ int_arm64_neon_sqadd>;
+defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
+ int_arm64_neon_sqsub>;
+defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_arm64_neon_sqdmull>;
+defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
+ TriOpFrag<(add node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
+ TriOpFrag<(sub node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
+ int_arm64_neon_umull>;
+
+// A scalar sqdmull with the second operand being a vector lane can be
+// handled directly with the indexed instruction encoding.
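+// For example (registers illustrative), multiplying a scalar by lane 1 of a
+// .4s vector can be selected directly to
+//   sqdmull d0, s1, v2.s[1]
+// rather than first extracting the lane into its own S register.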
+def : Pat<(int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+ (vector_extract (v4i32 V128:$Vm),
+ VectorIndexS:$idx)),
+ (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar shift instructions
+//----------------------------------------------------------------------------
+defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">;
+defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">;
+defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">;
+defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">;
+// Codegen patterns for the above. We don't put these directly on the
+// instructions because TableGen's type inference can't handle the truth.
+// Having the same base pattern for fp <--> int totally freaks it out.
+def : Pat<(int_arm64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm),
+ (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(int_arm64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm),
+ (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(i64 (int_arm64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)),
+ (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(i64 (int_arm64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)),
+ (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1i64 (int_arm64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn),
+ vecshiftR64:$imm)),
+ (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1i64 (int_arm64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn),
+ vecshiftR64:$imm)),
+ (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(int_arm64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
+ (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(int_arm64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm),
+ (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(f64 (int_arm64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
+ (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(f64 (int_arm64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
+ (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1f64 (int_arm64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn),
+ vecshiftR64:$imm)),
+ (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1f64 (int_arm64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
+ vecshiftR64:$imm)),
+ (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+
+defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", ARM64vshl>;
+defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
+defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn",
+ int_arm64_neon_sqrshrn>;
+defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun",
+ int_arm64_neon_sqrshrun>;
+defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", ARM64sqshlui>;
+defm SQSHL : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", ARM64sqshli>;
+defm SQSHRN : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn",
+ int_arm64_neon_sqshrn>;
+defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun",
+ int_arm64_neon_sqshrun>;
+defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">;
+defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", ARM64srshri>;
+defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra",
+ TriOpFrag<(add node:$LHS,
+ (ARM64srshri node:$MHS, node:$RHS))>>;
+defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", ARM64vashr>;
+defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra",
+ TriOpFrag<(add node:$LHS,
+ (ARM64vashr node:$MHS, node:$RHS))>>;
+defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn",
+ int_arm64_neon_uqrshrn>;
+defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", ARM64uqshli>;
+defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn",
+ int_arm64_neon_uqshrn>;
+defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", ARM64urshri>;
+defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra",
+ TriOpFrag<(add node:$LHS,
+ (ARM64urshri node:$MHS, node:$RHS))>>;
+defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", ARM64vlshr>;
+defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra",
+ TriOpFrag<(add node:$LHS,
+ (ARM64vlshr node:$MHS, node:$RHS))>>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD vector shift instructions
+//----------------------------------------------------------------------------
+defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_arm64_neon_vcvtfp2fxs>;
+defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_arm64_neon_vcvtfp2fxu>;
+defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf",
+ int_arm64_neon_vcvtfxs2fp>;
+defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
+ int_arm64_neon_rshrn>;
+defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", ARM64vshl>;
+defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn",
+ BinOpFrag<(trunc (ARM64vashr node:$LHS, node:$RHS))>>;
+defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_arm64_neon_vsli>;
+def : Pat<(v1i64 (int_arm64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+ (i32 vecshiftL64:$imm))),
+ (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
+defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
+ int_arm64_neon_sqrshrn>;
+defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun",
+ int_arm64_neon_sqrshrun>;
+defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", ARM64sqshlui>;
+defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", ARM64sqshli>;
+defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
+ int_arm64_neon_sqshrn>;
+defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
+ int_arm64_neon_sqshrun>;
+defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_arm64_neon_vsri>;
+def : Pat<(v1i64 (int_arm64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+ (i32 vecshiftR64:$imm))),
+ (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>;
+defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", ARM64srshri>;
+defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra",
+ TriOpFrag<(add node:$LHS,
+ (ARM64srshri node:$MHS, node:$RHS))> >;
+defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll",
+ BinOpFrag<(ARM64vshl (sext node:$LHS), node:$RHS)>>;
+
+defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", ARM64vashr>;
+defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
+ TriOpFrag<(add node:$LHS, (ARM64vashr node:$MHS, node:$RHS))>>;
+defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf",
+ int_arm64_neon_vcvtfxu2fp>;
+defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
+ int_arm64_neon_uqrshrn>;
+defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", ARM64uqshli>;
+defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn",
+ int_arm64_neon_uqshrn>;
+defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", ARM64urshri>;
+defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra",
+ TriOpFrag<(add node:$LHS,
+ (ARM64urshri node:$MHS, node:$RHS))> >;
+defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll",
+ BinOpFrag<(ARM64vshl (zext node:$LHS), node:$RHS)>>;
+defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", ARM64vlshr>;
+defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
+ TriOpFrag<(add node:$LHS, (ARM64vlshr node:$MHS, node:$RHS))> >;
+
+// SHRN patterns for when a logical right shift was used instead of arithmetic
+// (the immediate guarantees no sign bits actually end up in the result so it
+// doesn't matter).
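+// e.g. for a v8i16 element shifted right by s (1..8) and then truncated to
+// i8, the bits that differ between an arithmetic and a logical shift are the
+// top s bits of the 16-bit value, and the truncate discards all of them.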
+def : Pat<(v8i8 (trunc (ARM64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))),
+ (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v4i16 (trunc (ARM64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))),
+ (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v2i32 (trunc (ARM64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))),
+ (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;
+
+def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
+ (trunc (ARM64vlshr (v8i16 V128:$Rn),
+ vecshiftR16Narrow:$imm)))),
+ (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
+ (trunc (ARM64vlshr (v4i32 V128:$Rn),
+ vecshiftR32Narrow:$imm)))),
+ (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
+ (trunc (ARM64vlshr (v2i64 V128:$Rn),
+ vecshiftR64Narrow:$imm)))),
+ (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+ V128:$Rn, vecshiftR32Narrow:$imm)>;
+
+// Vector sign and zero extensions are implemented with SSHLL and USHLL.
+// Anyexts are implemented as zexts.
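+// Quick sketch (registers arbitrary): with a zero shift these act as plain
+// lane-widening moves, e.g.
+//   sshll v0.8h, v1.8b, #0    // sext  <8 x i8> to <8 x i16>
+//   ushll v0.8h, v1.8b, #0    // zext  <8 x i8> to <8 x i16>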
+def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+// Also match an extend from the upper half of a 128-bit source register.
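+// e.g. (illustrative) zero-extending the top 8 bytes of a .16b register is
+//   ushll2 v0.8h, v1.16b, #0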
+def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+ (USHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+ (USHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+ (SSHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+ (USHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+ (USHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+ (SSHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+ (USHLLv4i32_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+ (USHLLv4i32_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+ (SSHLLv4i32_shift V128:$Rn, (i32 0))>;
+
+// Vector shift sxtl aliases
+def : InstAlias<"sxtl.8h $dst, $src1",
+ (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.8h, $src1.8b",
+ (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl.4s $dst, $src1",
+ (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.4s, $src1.4h",
+ (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl.2d $dst, $src1",
+ (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.2d, $src1.2s",
+ (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+
+// Vector shift sxtl2 aliases
+def : InstAlias<"sxtl2.8h $dst, $src1",
+ (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.8h, $src1.16b",
+ (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2.4s $dst, $src1",
+ (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.4s, $src1.8h",
+ (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2.2d $dst, $src1",
+ (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.2d, $src1.4s",
+ (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+
+// Vector shift uxtl aliases
+def : InstAlias<"uxtl.8h $dst, $src1",
+ (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.8h, $src1.8b",
+ (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl.4s $dst, $src1",
+ (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.4s, $src1.4h",
+ (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl.2d $dst, $src1",
+ (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.2d, $src1.2s",
+ (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+
+// Vector shift uxtl2 aliases
+def : InstAlias<"uxtl2.8h $dst, $src1",
+ (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.8h, $src1.16b",
+ (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2.4s $dst, $src1",
+ (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.4s, $src1.8h",
+ (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2.2d $dst, $src1",
+ (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
+ (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+
+// If an integer is about to be converted to a floating point value,
+// just load it on the floating point unit.
+// These patterns are more complex because floating point loads do not
+// support sign extension.
+// The sign extension has to be explicitly added and is only supported for
+// one step: byte-to-half, half-to-word, word-to-doubleword.
+// SCVTF GPR -> FPR is 9 cycles.
+// SCVTF FPR -> FPR is 4 cycles.
+// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
+// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
+// and still be faster.
+// However, this is not good for code size.
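+// Sketch of what the first pattern below expands to (registers illustrative):
+//   ldr   b0, [x0, x1]        // load the byte straight into a SIMD register
+//   sshll v0.8h, v0.8b, #0    // sign extend 8 -> 16 bits
+//   sshll v0.4s, v0.4h, #0    // sign extend 16 -> 32 bits
+//   scvtf s0, s0              // FPR -> FPR conversion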
+// 8-bits -> float. 2 sizes step-up.
+def : Pat <(f32 (sint_to_fp (i32 (sextloadi8 ro_indexed8:$addr)))),
+ (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+ (SSHLLv4i16_shift
+ (f64
+ (EXTRACT_SUBREG
+ (SSHLLv8i8_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRBro ro_indexed8:$addr),
+ bsub),
+ 0),
+ dsub)),
+ 0),
+ ssub)))>, Requires<[NotForCodeSize]>;
+def : Pat <(f32 (sint_to_fp (i32 (sextloadi8 am_indexed8:$addr)))),
+ (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+ (SSHLLv4i16_shift
+ (f64
+ (EXTRACT_SUBREG
+ (SSHLLv8i8_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRBui am_indexed8:$addr),
+ bsub),
+ 0),
+ dsub)),
+ 0),
+ ssub)))>, Requires<[NotForCodeSize]>;
+def : Pat <(f32 (sint_to_fp (i32 (sextloadi8 am_unscaled8:$addr)))),
+ (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+ (SSHLLv4i16_shift
+ (f64
+ (EXTRACT_SUBREG
+ (SSHLLv8i8_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURBi am_unscaled8:$addr),
+ bsub),
+ 0),
+ dsub)),
+ 0),
+ ssub)))>, Requires<[NotForCodeSize]>;
+// 16-bits -> float. 1 size step-up.
+def : Pat <(f32 (sint_to_fp (i32 (sextloadi16 ro_indexed16:$addr)))),
+ (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+ (SSHLLv4i16_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRHro ro_indexed16:$addr),
+ hsub),
+ 0),
+ ssub)))>, Requires<[NotForCodeSize]>;
+def : Pat <(f32 (sint_to_fp (i32 (sextloadi16 am_indexed16:$addr)))),
+ (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+ (SSHLLv4i16_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRHui am_indexed16:$addr),
+ hsub),
+ 0),
+ ssub)))>, Requires<[NotForCodeSize]>;
+def : Pat <(f32 (sint_to_fp (i32 (sextloadi16 am_unscaled16:$addr)))),
+ (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+ (SSHLLv4i16_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURHi am_unscaled16:$addr),
+ hsub),
+ 0),
+ ssub)))>, Requires<[NotForCodeSize]>;
+// 32-bit to 32-bit conversions are handled in the target-specific dag combine:
+// performIntToFpCombine.
+// A 64-bit integer to 32-bit floating point conversion is not possible with
+// SCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double. 3 size step-up: give up.
+// 16-bits -> double. 2 size step.
+def : Pat <(f64 (sint_to_fp (i32 (sextloadi16 ro_indexed16:$addr)))),
+ (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+ (SSHLLv2i32_shift
+ (f64
+ (EXTRACT_SUBREG
+ (SSHLLv4i16_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRHro ro_indexed16:$addr),
+ hsub),
+ 0),
+ dsub)),
+ 0),
+ dsub)))>, Requires<[NotForCodeSize]>;
+def : Pat <(f64 (sint_to_fp (i32 (sextloadi16 am_indexed16:$addr)))),
+ (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+ (SSHLLv2i32_shift
+ (f64
+ (EXTRACT_SUBREG
+ (SSHLLv4i16_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRHui am_indexed16:$addr),
+ hsub),
+ 0),
+ dsub)),
+ 0),
+ dsub)))>, Requires<[NotForCodeSize]>;
+def : Pat <(f64 (sint_to_fp (i32 (sextloadi16 am_unscaled16:$addr)))),
+ (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+ (SSHLLv2i32_shift
+ (f64
+ (EXTRACT_SUBREG
+ (SSHLLv4i16_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURHi am_unscaled16:$addr),
+ hsub),
+ 0),
+ dsub)),
+ 0),
+ dsub)))>, Requires<[NotForCodeSize]>;
+// 32-bit -> double: one size step up.
+def : Pat <(f64 (sint_to_fp (i32 (load ro_indexed32:$addr)))),
+ (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+ (SSHLLv2i32_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRSro ro_indexed32:$addr),
+ ssub),
+ 0),
+ dsub)))>, Requires<[NotForCodeSize]>;
+def : Pat <(f64 (sint_to_fp (i32 (load am_indexed32:$addr)))),
+ (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+ (SSHLLv2i32_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDRSui am_indexed32:$addr),
+ ssub),
+ 0),
+ dsub)))>, Requires<[NotForCodeSize]>;
+def : Pat <(f64 (sint_to_fp (i32 (load am_unscaled32:$addr)))),
+ (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+ (SSHLLv2i32_shift
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (LDURSi am_unscaled32:$addr),
+ ssub),
+ 0),
+ dsub)))>, Requires<[NotForCodeSize]>;
+// 64-bit -> double is handled in the target-specific DAG combine
+// performIntToFpCombine.
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD Load-Store Structure
+//----------------------------------------------------------------------------
+defm LD1 : SIMDLd1Multiple<"ld1">;
+defm LD2 : SIMDLd2Multiple<"ld2">;
+defm LD3 : SIMDLd3Multiple<"ld3">;
+defm LD4 : SIMDLd4Multiple<"ld4">;
+
+defm ST1 : SIMDSt1Multiple<"st1">;
+defm ST2 : SIMDSt2Multiple<"st2">;
+defm ST3 : SIMDSt3Multiple<"st3">;
+defm ST4 : SIMDSt4Multiple<"st4">;
+
+class Ld1Pat<ValueType ty, Instruction INST>
+ : Pat<(ty (load am_simdnoindex:$vaddr)), (INST am_simdnoindex:$vaddr)>;
+
+def : Ld1Pat<v16i8, LD1Onev16b>;
+def : Ld1Pat<v8i16, LD1Onev8h>;
+def : Ld1Pat<v4i32, LD1Onev4s>;
+def : Ld1Pat<v2i64, LD1Onev2d>;
+def : Ld1Pat<v8i8, LD1Onev8b>;
+def : Ld1Pat<v4i16, LD1Onev4h>;
+def : Ld1Pat<v2i32, LD1Onev2s>;
+def : Ld1Pat<v1i64, LD1Onev1d>;
+
+class St1Pat<ValueType ty, Instruction INST>
+ : Pat<(store ty:$Vt, am_simdnoindex:$vaddr),
+ (INST ty:$Vt, am_simdnoindex:$vaddr)>;
+
+def : St1Pat<v16i8, ST1Onev16b>;
+def : St1Pat<v8i16, ST1Onev8h>;
+def : St1Pat<v4i32, ST1Onev4s>;
+def : St1Pat<v2i64, ST1Onev2d>;
+def : St1Pat<v8i8, ST1Onev8b>;
+def : St1Pat<v4i16, ST1Onev4h>;
+def : St1Pat<v2i32, ST1Onev2s>;
+def : St1Pat<v1i64, ST1Onev1d>;
+
+//---
+// Single-element
+//---
+
+defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>;
+defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>;
+defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
+defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>;
+let mayLoad = 1, neverHasSideEffects = 1 in {
+defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>;
+defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>;
+defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>;
+defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>;
+defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>;
+defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>;
+defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>;
+defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>;
+defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>;
+defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>;
+defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>;
+defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>;
+defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>;
+defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>;
+defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>;
+defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>;
+}
+
+def : Pat<(v8i8 (ARM64dup (i32 (extloadi8 am_simdnoindex:$vaddr)))),
+ (LD1Rv8b am_simdnoindex:$vaddr)>;
+def : Pat<(v16i8 (ARM64dup (i32 (extloadi8 am_simdnoindex:$vaddr)))),
+ (LD1Rv16b am_simdnoindex:$vaddr)>;
+def : Pat<(v4i16 (ARM64dup (i32 (extloadi16 am_simdnoindex:$vaddr)))),
+ (LD1Rv4h am_simdnoindex:$vaddr)>;
+def : Pat<(v8i16 (ARM64dup (i32 (extloadi16 am_simdnoindex:$vaddr)))),
+ (LD1Rv8h am_simdnoindex:$vaddr)>;
+def : Pat<(v2i32 (ARM64dup (i32 (load am_simdnoindex:$vaddr)))),
+ (LD1Rv2s am_simdnoindex:$vaddr)>;
+def : Pat<(v4i32 (ARM64dup (i32 (load am_simdnoindex:$vaddr)))),
+ (LD1Rv4s am_simdnoindex:$vaddr)>;
+def : Pat<(v2i64 (ARM64dup (i64 (load am_simdnoindex:$vaddr)))),
+ (LD1Rv2d am_simdnoindex:$vaddr)>;
+def : Pat<(v1i64 (ARM64dup (i64 (load am_simdnoindex:$vaddr)))),
+ (LD1Rv1d am_simdnoindex:$vaddr)>;
+// Grab the floating point versions too.
+def : Pat<(v2f32 (ARM64dup (f32 (load am_simdnoindex:$vaddr)))),
+ (LD1Rv2s am_simdnoindex:$vaddr)>;
+def : Pat<(v4f32 (ARM64dup (f32 (load am_simdnoindex:$vaddr)))),
+ (LD1Rv4s am_simdnoindex:$vaddr)>;
+def : Pat<(v2f64 (ARM64dup (f64 (load am_simdnoindex:$vaddr)))),
+ (LD1Rv2d am_simdnoindex:$vaddr)>;
+def : Pat<(v1f64 (ARM64dup (f64 (load am_simdnoindex:$vaddr)))),
+ (LD1Rv1d am_simdnoindex:$vaddr)>;
+
+class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
+ ValueType VTy, ValueType STy, Instruction LD1>
+ : Pat<(vector_insert (VTy VecListOne128:$Rd),
+ (STy (scalar_load am_simdnoindex:$vaddr)), VecIndex:$idx),
+ (LD1 VecListOne128:$Rd, VecIndex:$idx, am_simdnoindex:$vaddr)>;
+
+def : Ld1Lane128Pat<extloadi8, VectorIndexB, v16i8, i32, LD1i8>;
+def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>;
+def : Ld1Lane128Pat<load, VectorIndexS, v4i32, i32, LD1i32>;
+def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>;
+def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>;
+def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>;
+
+class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
+ ValueType VTy, ValueType STy, Instruction LD1>
+ : Pat<(vector_insert (VTy VecListOne64:$Rd),
+ (STy (scalar_load am_simdnoindex:$vaddr)), VecIndex:$idx),
+ (EXTRACT_SUBREG
+ (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
+ VecIndex:$idx, am_simdnoindex:$vaddr),
+ dsub)>;
+
+def : Ld1Lane64Pat<extloadi8, VectorIndexB, v8i8, i32, LD1i8>;
+def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
+def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>;
+def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>;
+
+
+defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
+defm LD2 : SIMDLdSt2SingleAliases<"ld2">;
+defm LD3 : SIMDLdSt3SingleAliases<"ld3">;
+defm LD4 : SIMDLdSt4SingleAliases<"ld4">;
+
+// Stores
+defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>;
+defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>;
+defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
+defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
+
+let AddedComplexity = 8 in
+class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
+ ValueType VTy, ValueType STy, Instruction ST1>
+ : Pat<(scalar_store
+ (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+ am_simdnoindex:$vaddr),
+ (ST1 VecListOne128:$Vt, VecIndex:$idx, am_simdnoindex:$vaddr)>;
+
+def : St1Lane128Pat<truncstorei8, VectorIndexB, v16i8, i32, ST1i8>;
+def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>;
+def : St1Lane128Pat<store, VectorIndexS, v4i32, i32, ST1i32>;
+def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>;
+def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>;
+def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>;
+
+let AddedComplexity = 8 in
+class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
+ ValueType VTy, ValueType STy, Instruction ST1>
+ : Pat<(scalar_store
+ (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+ am_simdnoindex:$vaddr),
+ (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+ VecIndex:$idx, am_simdnoindex:$vaddr)>;
+
+def : St1Lane64Pat<truncstorei8, VectorIndexB, v8i8, i32, ST1i8>;
+def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
+def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>;
+def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>;
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
+defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>;
+defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>;
+defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>;
+defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>;
+defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>;
+defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
+defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
+defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>;
+defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>;
+defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>;
+defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>;
+}
+
+defm ST1 : SIMDLdSt1SingleAliases<"st1">;
+defm ST2 : SIMDLdSt2SingleAliases<"st2">;
+defm ST3 : SIMDLdSt3SingleAliases<"st3">;
+defm ST4 : SIMDLdSt4SingleAliases<"st4">;
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+def AESErr : AESTiedInst<0b0100, "aese", int_arm64_crypto_aese>;
+def AESDrr : AESTiedInst<0b0101, "aesd", int_arm64_crypto_aesd>;
+def AESMCrr : AESInst< 0b0110, "aesmc", int_arm64_crypto_aesmc>;
+def AESIMCrr : AESInst< 0b0111, "aesimc", int_arm64_crypto_aesimc>;
+
+def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_arm64_crypto_sha1c>;
+def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_arm64_crypto_sha1p>;
+def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_arm64_crypto_sha1m>;
+def SHA1SU0rrr : SHATiedInstVVV<0b011, "sha1su0", int_arm64_crypto_sha1su0>;
+def SHA256Hrrr : SHATiedInstQQV<0b100, "sha256h", int_arm64_crypto_sha256h>;
+def SHA256H2rrr : SHATiedInstQQV<0b101, "sha256h2",int_arm64_crypto_sha256h2>;
+def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_arm64_crypto_sha256su1>;
+
+def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_arm64_crypto_sha1h>;
+def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_arm64_crypto_sha1su1>;
+def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_arm64_crypto_sha256su0>;
+
+//----------------------------------------------------------------------------
+// Compiler-pseudos
+//----------------------------------------------------------------------------
+// FIXME: Like for X86, these should go in their own separate .td file.
+
+// Any instruction that defines a 32-bit result implicitly zeroes the high
+// half of the 64-bit register. The exceptions are truncate (which can be
+// lowered to EXTRACT_SUBREG) and CopyFromReg (which may be copying from a
+// truncate), so those are excluded; any other 32-bit operation will
+// zero-extend up to 64 bits.
+// FIXME: X86 also checks for CMOV here. Do we need something similar?
+def def32 : PatLeaf<(i32 GPR32:$src), [{
+ return N->getOpcode() != ISD::TRUNCATE &&
+ N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+ N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
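+// For example, a 32-bit add already zeroes bits [63:32] of the X register,
+// so (i64 (zext (i32 (add ...)))) needs no extra instruction beyond the
+// subregister copy.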
+
+// For an anyext, we don't care what the high bits are, so we can perform an
+// INSERT_SUBREG into an IMPLICIT_DEF.
+def : Pat<(i64 (anyext GPR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+
+// When we need to explicitly zero-extend, we use an unsigned bitfield move
+// instruction (UBFM) on the enclosing super-reg.
+def : Pat<(i64 (zext GPR32:$src)),
+ (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+
+// To sign extend, we use a signed bitfield move instruction (SBFM) on the
+// containing super-reg.
+def : Pat<(i64 (sext GPR32:$src)),
+ (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>;
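+// As an illustration (register numbers are arbitrary), (i64 (sext GPR32))
+// selects to SBFMXri ..., 0, 31, which the assembler prints via its alias:
+//   sxtw x0, w0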
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
+ (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+ (i64 (i32shift_sext_i8 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
+ (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+ (i64 (i64shift_sext_i8 imm0_63:$imm)))>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
+ (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+ (i64 (i32shift_sext_i16 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
+ (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+ (i64 (i64shift_sext_i16 imm0_63:$imm)))>;
+
+def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
+ (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+ (i64 (i64shift_a imm0_63:$imm)),
+ (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
+
+// sra patterns have an AddedComplexity of 10, so make sure we have a higher
+// AddedComplexity for the following patterns since we want to match sext + sra
+// patterns before we attempt to match a single sra node.
+let AddedComplexity = 20 in {
+// We support all sext + sra combinations that preserve at least one bit of
+// the original value being sign extended, i.e. shifts of up to bitwidth-1
+// bits.
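+// As an example (illustrative; register numbers are arbitrary), an i8 value
+// arithmetically shifted right by 3 folds into a single bitfield move:
+//   (sra (sext_inreg w0, i8), 3)  ==>  SBFMWri w0, w0, 3, 7
+// which the assembler prints as sbfx w0, w0, #3, #5.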
+def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)),
+ (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)),
+ (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>;
+
+def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)),
+ (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)),
+ (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>;
+
+def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)),
+ (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+ (i64 imm0_31:$imm), 31)>;
+} // AddedComplexity = 20
+
+// To truncate, we can simply extract from a subregister.
+def : Pat<(i32 (trunc GPR64sp:$src)),
+ (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>;
+
+// __builtin_trap() uses the BRK instruction on ARM64.
+def : Pat<(trap), (BRK 1)>;
+
+// Conversions within AdvSIMD types in the same register size are free.
+
+def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+
+def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+
+def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>;
+
+def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
+
+def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
+
+def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+
+
+def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
+
+def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
+
+def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
+
+def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+
+def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+
+def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
+ (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))),
+ (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))),
+ (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))),
+ (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+
+// A 64-bit subvector insert to the first 128-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+
+// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64
+// or v2f32.
+def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)),
+ (vector_extract (v2i64 FPR128:$Rn), (i64 1)))),
+ (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>;
+def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
+ (vector_extract (v2f64 FPR128:$Rn), (i64 1)))),
+ (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>;
+// vector_extract on 64-bit vectors gets promoted to a 128-bit vector,
+// so we match on v4f32 here, not v2f32. This will also catch adding
+// the low two lanes of a true v4f32 vector.
+def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
+ (vector_extract (v4f32 FPR128:$Rn), (i64 1))),
+ (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
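+// For illustration (a sketch; register numbers are arbitrary), these lane
+// sums become the scalar pairwise instructions:
+//   addp  d0, v0.2d       // sum of the two i64 lanes
+//   faddp d0, v0.2d       // sum of the two f64 lanes
+//   faddp s0, v0.2s       // sum of the low two f32 lanes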
+
+// Scalar 64-bit shifts in FPR64 registers.
+def : Pat<(i64 (int_arm64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_arm64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_arm64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_arm64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+
+// Tail call return handling. These are all compiler pseudo-instructions,
+// so no encoding information or anything like that.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+ def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst), []>;
+ def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst), []>;
+}
+
+def : Pat<(ARM64tcret tcGPR64:$dst), (TCRETURNri tcGPR64:$dst)>;
+def : Pat<(ARM64tcret (i64 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
+def : Pat<(ARM64tcret (i64 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
+
+include "ARM64InstrAtomics.td"
diff --git a/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp b/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp
new file mode 100644
index 0000000..c0031a4
--- /dev/null
+++ b/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp
@@ -0,0 +1,947 @@
+//===-- ARM64LoadStoreOptimizer.cpp - ARM64 load/store opt. pass --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm64-ldst-opt"
+#include "ARM64InstrInfo.h"
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+/// ARM64LoadStoreOpt - Post-register allocation pass to combine
+/// load / store instructions to form ldp / stp instructions.
+
+STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
+STATISTIC(NumPostFolded, "Number of post-index updates folded");
+STATISTIC(NumPreFolded, "Number of pre-index updates folded");
+STATISTIC(NumUnscaledPairCreated,
+ "Number of load/store from unscaled generated");
+
+static cl::opt<bool> DoLoadStoreOpt("arm64-load-store-opt", cl::init(true),
+ cl::Hidden);
+static cl::opt<unsigned> ScanLimit("arm64-load-store-scan-limit", cl::init(20),
+ cl::Hidden);
+
+// Placeholder while testing unscaled load/store combining
+static cl::opt<bool>
+EnableARM64UnscaledMemOp("arm64-unscaled-mem-op", cl::Hidden,
+ cl::desc("Allow ARM64 unscaled load/store combining"),
+ cl::init(true));
+
+namespace {
+struct ARM64LoadStoreOpt : public MachineFunctionPass {
+ static char ID;
+ ARM64LoadStoreOpt() : MachineFunctionPass(ID) {}
+
+ const ARM64InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ // Scan the instructions looking for a load/store that can be combined
+ // with the current instruction into a load/store pair.
+ // Return the matching instruction if one is found, else MBB->end().
+ // If a matching instruction is found, mergeForward is set to true if the
+ // merge is to remove the first instruction and replace the second with
+ // a pair-wise insn, and false if the reverse is true.
+ MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
+ bool &mergeForward,
+ unsigned Limit);
+ // Merge the two instructions indicated into a single pair-wise instruction.
+ // If mergeForward is true, erase the first instruction and fold its
+ // operation into the second. If false, the reverse. Return the instruction
+ // following the first instruction (which may change during processing).
+ MachineBasicBlock::iterator
+ mergePairedInsns(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired, bool mergeForward);
+
+ // Scan the instruction list to find a base register update that can
+ // be combined with the current instruction (a load or store) using
+ // pre or post indexed addressing with writeback. Scan forwards.
+ MachineBasicBlock::iterator
+ findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
+ int Value);
+
+ // Scan the instruction list to find a base register update that can
+ // be combined with the current instruction (a load or store) using
+ // pre or post indexed addressing with writeback. Scan backwards.
+ MachineBasicBlock::iterator
+ findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
+
+ // Merge a pre-index base register update into a ld/st instruction.
+ MachineBasicBlock::iterator
+ mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update);
+
+ // Merge a post-index base register update into a ld/st instruction.
+ MachineBasicBlock::iterator
+ mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update);
+
+ bool optimizeBlock(MachineBasicBlock &MBB);
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual const char *getPassName() const {
+ return "ARM64 load / store optimization pass";
+ }
+
+private:
+ int getMemSize(MachineInstr *MemMI);
+};
+char ARM64LoadStoreOpt::ID = 0;
+}
+
+static bool isUnscaledLdst(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return false;
+ case ARM64::STURSi:
+ case ARM64::STURDi:
+ case ARM64::STURQi:
+ case ARM64::STURWi:
+ case ARM64::STURXi:
+ case ARM64::LDURSi:
+ case ARM64::LDURDi:
+ case ARM64::LDURQi:
+ case ARM64::LDURWi:
+ case ARM64::LDURXi:
+ return true;
+ }
+}
+
+// Size in bytes of the data moved by a scaled or unscaled load or store.
+int ARM64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
+ switch (MemMI->getOpcode()) {
+ default:
+ llvm_unreachable("Opcode has has unknown size!");
+ case ARM64::STRSui:
+ case ARM64::STURSi:
+ return 4;
+ case ARM64::STRDui:
+ case ARM64::STURDi:
+ return 8;
+ case ARM64::STRQui:
+ case ARM64::STURQi:
+ return 16;
+ case ARM64::STRWui:
+ case ARM64::STURWi:
+ return 4;
+ case ARM64::STRXui:
+ case ARM64::STURXi:
+ return 8;
+ case ARM64::LDRSui:
+ case ARM64::LDURSi:
+ return 4;
+ case ARM64::LDRDui:
+ case ARM64::LDURDi:
+ return 8;
+ case ARM64::LDRQui:
+ case ARM64::LDURQi:
+ return 16;
+ case ARM64::LDRWui:
+ case ARM64::LDURWi:
+ return 4;
+ case ARM64::LDRXui:
+ case ARM64::LDURXi:
+ return 8;
+ }
+}
+
+static unsigned getMatchingPairOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no pairwise equivalent!");
+ case ARM64::STRSui:
+ case ARM64::STURSi:
+ return ARM64::STPSi;
+ case ARM64::STRDui:
+ case ARM64::STURDi:
+ return ARM64::STPDi;
+ case ARM64::STRQui:
+ case ARM64::STURQi:
+ return ARM64::STPQi;
+ case ARM64::STRWui:
+ case ARM64::STURWi:
+ return ARM64::STPWi;
+ case ARM64::STRXui:
+ case ARM64::STURXi:
+ return ARM64::STPXi;
+ case ARM64::LDRSui:
+ case ARM64::LDURSi:
+ return ARM64::LDPSi;
+ case ARM64::LDRDui:
+ case ARM64::LDURDi:
+ return ARM64::LDPDi;
+ case ARM64::LDRQui:
+ case ARM64::LDURQi:
+ return ARM64::LDPQi;
+ case ARM64::LDRWui:
+ case ARM64::LDURWi:
+ return ARM64::LDPWi;
+ case ARM64::LDRXui:
+ case ARM64::LDURXi:
+ return ARM64::LDPXi;
+ }
+}
+
+static unsigned getPreIndexedOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no pre-indexed equivalent!");
+ case ARM64::STRSui: return ARM64::STRSpre;
+ case ARM64::STRDui: return ARM64::STRDpre;
+ case ARM64::STRQui: return ARM64::STRQpre;
+ case ARM64::STRWui: return ARM64::STRWpre;
+ case ARM64::STRXui: return ARM64::STRXpre;
+ case ARM64::LDRSui: return ARM64::LDRSpre;
+ case ARM64::LDRDui: return ARM64::LDRDpre;
+ case ARM64::LDRQui: return ARM64::LDRQpre;
+ case ARM64::LDRWui: return ARM64::LDRWpre;
+ case ARM64::LDRXui: return ARM64::LDRXpre;
+ }
+}
+
+static unsigned getPostIndexedOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no post-indexed wise equivalent!");
+ case ARM64::STRSui:
+ return ARM64::STRSpost;
+ case ARM64::STRDui:
+ return ARM64::STRDpost;
+ case ARM64::STRQui:
+ return ARM64::STRQpost;
+ case ARM64::STRWui:
+ return ARM64::STRWpost;
+ case ARM64::STRXui:
+ return ARM64::STRXpost;
+ case ARM64::LDRSui:
+ return ARM64::LDRSpost;
+ case ARM64::LDRDui:
+ return ARM64::LDRDpost;
+ case ARM64::LDRQui:
+ return ARM64::LDRQpost;
+ case ARM64::LDRWui:
+ return ARM64::LDRWpost;
+ case ARM64::LDRXui:
+ return ARM64::LDRXpost;
+ }
+}
+
+MachineBasicBlock::iterator
+ARM64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired,
+ bool mergeForward) {
+ MachineBasicBlock::iterator NextI = I;
+ ++NextI;
+ // If NextI is the second of the two instructions to be merged, we need
+ // to skip one further. Whichever way we merge, the iterator will be
+ // invalidated, and we don't need to scan the new instruction, as it's a
+ // pairwise instruction, which we're not considering for further action
+ // anyway.
+ if (NextI == Paired)
+ ++NextI;
+
+ bool IsUnscaled = isUnscaledLdst(I->getOpcode());
+ int OffsetStride = IsUnscaled && EnableARM64UnscaledMemOp ? getMemSize(I) : 1;
+
+ unsigned NewOpc = getMatchingPairOpcode(I->getOpcode());
+ // Insert our new paired instruction after whichever of the paired
+ // instructions mergeForward indicates.
+ MachineBasicBlock::iterator InsertionPoint = mergeForward ? Paired : I;
+ // mergeForward also determines which instruction we copy the base register
+ // operand from, so that its flags stay compatible with the input code.
+ MachineOperand &BaseRegOp =
+ mergeForward ? Paired->getOperand(1) : I->getOperand(1);
+
+ // Which register is Rt and which is Rt2 depends on the offset order.
+ MachineInstr *RtMI, *Rt2MI;
+ if (I->getOperand(2).getImm() ==
+ Paired->getOperand(2).getImm() + OffsetStride) {
+ RtMI = Paired;
+ Rt2MI = I;
+ } else {
+ RtMI = I;
+ Rt2MI = Paired;
+ }
+ // For unscaled instructions, convert the byte offset into an element
+ // offset for the paired instruction.
+ int OffsetImm = RtMI->getOperand(2).getImm();
+ if (IsUnscaled && EnableARM64UnscaledMemOp)
+ OffsetImm /= OffsetStride;
+
+ // Construct the new instruction.
+ MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
+ I->getDebugLoc(), TII->get(NewOpc))
+ .addOperand(RtMI->getOperand(0))
+ .addOperand(Rt2MI->getOperand(0))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm);
+ (void)MIB;
+
+ // FIXME: Do we need/want to copy the mem operands from the source
+ // instructions? Probably. What uses them after this?
+
+ DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(Paired->print(dbgs()));
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions.
+ I->eraseFromParent();
+ Paired->eraseFromParent();
+
+ return NextI;
+}
+
+/// trackRegDefsUses - Remember what registers the specified instruction uses
+/// and modifies.
+static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
+ BitVector &UsedRegs,
+ const TargetRegisterInfo *TRI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isRegMask())
+ ModifiedRegs.setBitsNotInMask(MO.getRegMask());
+
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (MO.isDef()) {
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ ModifiedRegs.set(*AI);
+ } else {
+ assert(MO.isUse() && "Reg operand not a def and not a use?!?");
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ UsedRegs.set(*AI);
+ }
+ }
+}
+
+static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
+ if (!IsUnscaled && (Offset > 63 || Offset < -64))
+ return false;
+ if (IsUnscaled) {
+ // Convert the byte-offset used by unscaled into an "element" offset used
+ // by the scaled pair load/store instructions.
+ int elemOffset = Offset / OffsetStride;
+ if (elemOffset > 63 || elemOffset < -64)
+ return false;
+ }
+ return true;
+}
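+// For example (illustrative numbers): with 8-byte unscaled accesses
+// (OffsetStride == 8), a byte offset of 504 maps to element offset 63 and is
+// pairable, while 512 maps to 64 and is rejected.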
+
+// Do alignment, specialized to power of 2 and for signed ints,
+// avoiding having to do a C-style cast from uint64_t to int when
+// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
+// FIXME: Move this function to include/MathExtras.h?
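+// A quick sanity check of the formula (illustrative values only):
+//   alignTo(17, 8) == 24  and  alignTo(16, 8) == 16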
+static int alignTo(int Num, int PowOf2) {
+ return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
+}
+
+/// findMatchingInsn - Scan the instructions looking for a load/store that can
+/// be combined with the current instruction into a load/store pair.
+MachineBasicBlock::iterator
+ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
+ bool &mergeForward, unsigned Limit) {
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator MBBI = I;
+ MachineInstr *FirstMI = I;
+ ++MBBI;
+
+ int Opc = FirstMI->getOpcode();
+ bool mayLoad = FirstMI->mayLoad();
+ bool IsUnscaled = isUnscaledLdst(Opc);
+ unsigned Reg = FirstMI->getOperand(0).getReg();
+ unsigned BaseReg = FirstMI->getOperand(1).getReg();
+ int Offset = FirstMI->getOperand(2).getImm();
+
+ // Early exit if the first instruction modifies the base register.
+ // e.g., ldr x0, [x0]
+ // Early exit if the offset is not possible to match (6 bits of positive
+ // range, plus allow an extra one in case we find a later insn that matches
+ // with Offset-1).
+ if (FirstMI->modifiesRegister(BaseReg, TRI))
+ return E;
+ int OffsetStride =
+ IsUnscaled && EnableARM64UnscaledMemOp ? getMemSize(FirstMI) : 1;
+ if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+ return E;
+
+ // Track which registers have been modified and used between the first insn
+ // (inclusive) and the second insn.
+ BitVector ModifiedRegs, UsedRegs;
+ ModifiedRegs.resize(TRI->getNumRegs());
+ UsedRegs.resize(TRI->getNumRegs());
+ for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+ MachineInstr *MI = MBBI;
+ // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+ // optimization by changing how far we scan.
+ if (MI->isDebugValue())
+ continue;
+
+ // Now that we know this is a real instruction, count it.
+ ++Count;
+
+ if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) {
+ // If we've found another instruction with the same opcode, check to see
+ // if the base and offset are compatible with our starting instruction.
+ // These instructions all have scaled immediate operands, so we just
+ // check for +1/-1. Make sure to check the new instruction offset is
+ // actually an immediate and not a symbolic reference destined for
+ // a relocation.
+ //
+ // Pairwise instructions have a 7-bit signed offset field. Single insns
+ // have a 12-bit unsigned offset field. To be a valid combine, the
+ // final offset must be in range.
+ unsigned MIBaseReg = MI->getOperand(1).getReg();
+ int MIOffset = MI->getOperand(2).getImm();
+ if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
+ (Offset + OffsetStride == MIOffset))) {
+ int MinOffset = Offset < MIOffset ? Offset : MIOffset;
+ // If this is a volatile load/store that otherwise matched, stop looking
+ // as something is going on that we don't have enough information to
+ // safely transform. Similarly, stop if we see a hint to avoid pairs.
+ if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
+ return E;
+ // If the resultant immediate offset of merging these instructions
+ // is out of range for a pairwise instruction, bail and keep looking.
+ bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode());
+ if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ continue;
+ }
+ // If the alignment requirements of the paired (scaled) instruction
+ // can't express the offset of the unscaled input, bail and keep
+ // looking.
+ if (IsUnscaled && EnableARM64UnscaledMemOp &&
+ (alignTo(MinOffset, OffsetStride) != MinOffset)) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ continue;
+ }
+ // If the destination register of the loads is the same register, bail
+ // and keep looking. A load-pair instruction with both destination
+ // registers the same is UNPREDICTABLE and will result in an exception.
+ if (mayLoad && Reg == MI->getOperand(0).getReg()) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ continue;
+ }
+
+ // If the Rt of the second instruction was not modified or used between
+ // the two instructions, we can combine the second into the first.
+ if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
+ !UsedRegs[MI->getOperand(0).getReg()]) {
+ mergeForward = false;
+ return MBBI;
+ }
+
+ // Likewise, if the Rt of the first instruction is not modified or used
+ // between the two instructions, we can combine the first into the
+ // second.
+ if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
+ !UsedRegs[FirstMI->getOperand(0).getReg()]) {
+ mergeForward = true;
+ return MBBI;
+ }
+ // Unable to combine these instructions due to interference in between.
+ // Keep looking.
+ }
+ }
+
+ // If the instruction wasn't a matching load or store, but does (or can)
+ // modify memory, stop searching, as we don't have alias analysis or
+ // anything like that to tell us whether the access is tromping on the
+ // locations we care about. The big one we want to catch is calls.
+ //
+ // FIXME: Theoretically, we can do better than that for SP and FP based
+ // references since we can effectively know where those are touching. It's
+ // unclear if it's worth the extra code, though. Most paired instructions
+ // will be sequential, perhaps with a few intervening non-memory related
+ // instructions.
+ if (MI->mayStore() || MI->isCall())
+ return E;
+ // Likewise, if we're matching a store instruction, we don't want to
+ // move across a load, as it may be reading the same location.
+ if (FirstMI->mayStore() && MI->mayLoad())
+ return E;
+
+ // Update modified / uses register lists.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg])
+ return E;
+ }
+ return E;
+}
+
+MachineBasicBlock::iterator
+ARM64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update) {
+ assert((Update->getOpcode() == ARM64::ADDXri ||
+ Update->getOpcode() == ARM64::SUBXri) &&
+ "Unexpected base register update instruction to merge!");
+ MachineBasicBlock::iterator NextI = I;
+ // Return the instruction following the merged instruction, which is
+ // the instruction following our unmerged load. Unless that's the add/sub
+ // instruction we're merging, in which case it's the one after that.
+ if (++NextI == Update)
+ ++NextI;
+
+ int Value = Update->getOperand(2).getImm();
+ assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
+ "Can't merge 1 << 12 offset into pre-indexed load / store");
+ if (Update->getOpcode() == ARM64::SUBXri)
+ Value = -Value;
+
+ unsigned NewOpc = getPreIndexedOpcode(I->getOpcode());
+ MachineInstrBuilder MIB =
+ BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .addOperand(I->getOperand(0))
+ .addOperand(I->getOperand(1))
+ .addImm(Value);
+ (void)MIB;
+
+ DEBUG(dbgs() << "Creating pre-indexed load/store.");
+ DEBUG(dbgs() << " Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(Update->print(dbgs()));
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions for the block.
+ I->eraseFromParent();
+ Update->eraseFromParent();
+
+ return NextI;
+}
+
+MachineBasicBlock::iterator
+ARM64LoadStoreOpt::mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update) {
+ assert((Update->getOpcode() == ARM64::ADDXri ||
+ Update->getOpcode() == ARM64::SUBXri) &&
+ "Unexpected base register update instruction to merge!");
+ MachineBasicBlock::iterator NextI = I;
+ // Return the instruction following the merged instruction, which is
+ // the instruction following our unmerged load. Unless that's the add/sub
+ // instruction we're merging, in which case it's the one after that.
+ if (++NextI == Update)
+ ++NextI;
+
+ int Value = Update->getOperand(2).getImm();
+ assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
+ "Can't merge 1 << 12 offset into post-indexed load / store");
+ if (Update->getOpcode() == ARM64::SUBXri)
+ Value = -Value;
+
+ unsigned NewOpc = getPostIndexedOpcode(I->getOpcode());
+ MachineInstrBuilder MIB =
+ BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .addOperand(I->getOperand(0))
+ .addOperand(I->getOperand(1))
+ .addImm(Value);
+ (void)MIB;
+
+ DEBUG(dbgs() << "Creating post-indexed load/store.");
+ DEBUG(dbgs() << " Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(Update->print(dbgs()));
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions for the block.
+ I->eraseFromParent();
+ Update->eraseFromParent();
+
+ return NextI;
+}
+
+static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
+ int Offset) {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case ARM64::SUBXri:
+ // Negate the offset for a SUB instruction.
+ Offset *= -1;
+ // FALLTHROUGH
+ case ARM64::ADDXri:
+ // Make sure it's a vanilla immediate operand, not a relocation or
+ // anything else we can't handle.
+ if (!MI->getOperand(2).isImm())
+ break;
+ // Watch out for 1 << 12 shifted value.
+ if (ARM64_AM::getShiftValue(MI->getOperand(3).getImm()))
+ break;
+ // If the instruction has the base register as source and dest and the
+ // immediate will fit in a signed 9-bit integer, then we have a match.
+ if (MI->getOperand(0).getReg() == BaseReg &&
+ MI->getOperand(1).getReg() == BaseReg &&
+ MI->getOperand(2).getImm() <= 255 &&
+ MI->getOperand(2).getImm() >= -256) {
+ // If we have a non-zero Offset, we check that it matches the amount
+ // we're adding to the register.
+ if (!Offset || Offset == MI->getOperand(2).getImm())
+ return true;
+ }
+ break;
+ }
+ return false;
+}
+
+MachineBasicBlock::iterator
+ARM64LoadStoreOpt::findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
+ unsigned Limit, int Value) {
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineInstr *MemMI = I;
+ MachineBasicBlock::iterator MBBI = I;
+ const MachineFunction &MF = *MemMI->getParent()->getParent();
+
+ unsigned DestReg = MemMI->getOperand(0).getReg();
+ unsigned BaseReg = MemMI->getOperand(1).getReg();
+ int Offset = MemMI->getOperand(2).getImm() *
+ TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
+
+ // If the base register overlaps the destination register, we can't
+ // merge the update.
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+
+ // Scan forward looking for post-index opportunities.
+ // Updating instructions can't be formed if the memory insn already
+ // has an offset other than the value we're looking for.
+ if (Offset != Value)
+ return E;
+
+ // Track which registers have been modified and used between the first insn
+ // (inclusive) and the second insn.
+ BitVector ModifiedRegs, UsedRegs;
+ ModifiedRegs.resize(TRI->getNumRegs());
+ UsedRegs.resize(TRI->getNumRegs());
+ ++MBBI;
+ for (unsigned Count = 0; MBBI != E; ++MBBI) {
+ MachineInstr *MI = MBBI;
+ // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+ // optimization by changing how far we scan.
+ if (MI->isDebugValue())
+ continue;
+
+ // Now that we know this is a real instruction, count it.
+ ++Count;
+
+ // If we found a match, return it.
+ if (isMatchingUpdateInsn(MI, BaseReg, Value))
+ return MBBI;
+
+ // Update the status of what the instruction clobbered and used.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is used or modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+ return E;
+ }
+ return E;
+}
+
+MachineBasicBlock::iterator
+ARM64LoadStoreOpt::findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I,
+ unsigned Limit) {
+ MachineBasicBlock::iterator B = I->getParent()->begin();
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineInstr *MemMI = I;
+ MachineBasicBlock::iterator MBBI = I;
+ const MachineFunction &MF = *MemMI->getParent()->getParent();
+
+ unsigned DestReg = MemMI->getOperand(0).getReg();
+ unsigned BaseReg = MemMI->getOperand(1).getReg();
+ int Offset = MemMI->getOperand(2).getImm();
+ unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
+
+ // If the load/store is the first instruction in the block, there's obviously
+ // not any matching update. Ditto if the memory offset isn't zero.
+ if (MBBI == B || Offset != 0)
+ return E;
+ // If the base register overlaps the destination register, we can't
+ // merge the update.
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+
+ // Track which registers have been modified and used between the first insn
+ // (inclusive) and the second insn.
+ BitVector ModifiedRegs, UsedRegs;
+ ModifiedRegs.resize(TRI->getNumRegs());
+ UsedRegs.resize(TRI->getNumRegs());
+ --MBBI;
+ for (unsigned Count = 0; MBBI != B; --MBBI) {
+ MachineInstr *MI = MBBI;
+ // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+ // optimization by changing how far we scan.
+ if (MI->isDebugValue())
+ continue;
+
+ // Now that we know this is a real instruction, count it.
+ ++Count;
+
+ // If we found a match, return it.
+ if (isMatchingUpdateInsn(MI, BaseReg, RegSize))
+ return MBBI;
+
+ // Update the status of what the instruction clobbered and used.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is used or modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+ return E;
+ }
+ return E;
+}
+
+bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+ bool Modified = false;
+ // Two transformations to do here:
+ // 1) Find loads and stores that can be merged into a single load or store
+ // pair instruction.
+ // e.g.,
+ // ldr x0, [x2]
+ // ldr x1, [x2, #8]
+ // ; becomes
+ // ldp x0, x1, [x2]
+ // 2) Find base register updates that can be merged into the load or store
+ // as a base-reg writeback.
+ // e.g.,
+ // ldr x0, [x2]
+ // add x2, x2, #4
+ // ; becomes
+ // ldr x0, [x2], #4
+
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ MachineInstr *MI = MBBI;
+ switch (MI->getOpcode()) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ case ARM64::STRSui:
+ case ARM64::STRDui:
+ case ARM64::STRQui:
+ case ARM64::STRXui:
+ case ARM64::STRWui:
+ case ARM64::LDRSui:
+ case ARM64::LDRDui:
+ case ARM64::LDRQui:
+ case ARM64::LDRXui:
+ case ARM64::LDRWui:
+ // do the unscaled versions as well
+ case ARM64::STURSi:
+ case ARM64::STURDi:
+ case ARM64::STURQi:
+ case ARM64::STURWi:
+ case ARM64::STURXi:
+ case ARM64::LDURSi:
+ case ARM64::LDURDi:
+ case ARM64::LDURQi:
+ case ARM64::LDURWi:
+ case ARM64::LDURXi: {
+ // If this is a volatile load/store, don't mess with it.
+ if (MI->hasOrderedMemoryRef()) {
+ ++MBBI;
+ break;
+ }
+ // Make sure this is a reg+imm (as opposed to an address reloc).
+ if (!MI->getOperand(2).isImm()) {
+ ++MBBI;
+ break;
+ }
+ // Check if this load/store has a hint to avoid pair formation.
+ // MachineMemOperands hints are set by the ARM64StorePairSuppress pass.
+ if (TII->isLdStPairSuppressed(MI)) {
+ ++MBBI;
+ break;
+ }
+ // Look ahead up to ScanLimit instructions for a pairable instruction.
+ bool mergeForward = false;
+ MachineBasicBlock::iterator Paired =
+ findMatchingInsn(MBBI, mergeForward, ScanLimit);
+ if (Paired != E) {
+ // Merge the loads into a pair. Keeping the iterator straight is a
+ // pain, so we let the merge routine tell us what the next instruction
+ // is after it's done mucking about.
+ MBBI = mergePairedInsns(MBBI, Paired, mergeForward);
+
+ Modified = true;
+ ++NumPairCreated;
+ if (isUnscaledLdst(MI->getOpcode()))
+ ++NumUnscaledPairCreated;
+ break;
+ }
+ ++MBBI;
+ break;
+ }
+ // FIXME: Do the other instructions.
+ }
+ }
+
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ MachineInstr *MI = MBBI;
+ // Do update merging. It's simpler to keep this separate from the above
+ // switch, though not strictly necessary.
+ int Opc = MI->getOpcode();
+ switch (Opc) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ case ARM64::STRSui:
+ case ARM64::STRDui:
+ case ARM64::STRQui:
+ case ARM64::STRXui:
+ case ARM64::STRWui:
+ case ARM64::LDRSui:
+ case ARM64::LDRDui:
+ case ARM64::LDRQui:
+ case ARM64::LDRXui:
+ case ARM64::LDRWui:
+ // do the unscaled versions as well
+ case ARM64::STURSi:
+ case ARM64::STURDi:
+ case ARM64::STURQi:
+ case ARM64::STURWi:
+ case ARM64::STURXi:
+ case ARM64::LDURSi:
+ case ARM64::LDURDi:
+ case ARM64::LDURQi:
+ case ARM64::LDURWi:
+ case ARM64::LDURXi: {
+ // Make sure this is a reg+imm (as opposed to an address reloc).
+ if (!MI->getOperand(2).isImm()) {
+ ++MBBI;
+ break;
+ }
+ // Look ahead up to ScanLimit instructions for a mergeable instruction.
+ MachineBasicBlock::iterator Update =
+ findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
+ if (Update != E) {
+ // Merge the update into the ld/st.
+ MBBI = mergePostIdxUpdateInsn(MBBI, Update);
+ Modified = true;
+ ++NumPostFolded;
+ break;
+ }
+ // Don't know how to handle pre/post-index versions, so move to the next
+ // instruction.
+ if (isUnscaledLdst(Opc)) {
+ ++MBBI;
+ break;
+ }
+
+ // Look back to try to find a pre-index instruction. For example,
+ // add x0, x0, #8
+ // ldr x1, [x0]
+ // merged into:
+ // ldr x1, [x0, #8]!
+ Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
+ if (Update != E) {
+ // Merge the update into the ld/st.
+ MBBI = mergePreIdxUpdateInsn(MBBI, Update);
+ Modified = true;
+ ++NumPreFolded;
+ break;
+ }
+
+ // Look forward to try to find a post-index instruction. For example,
+ // ldr x1, [x0, #64]
+ // add x0, x0, #64
+ // merged into:
+ // ldr x1, [x0], #64
+
+ // The immediate in the load/store is scaled by the size of the register
+ // being loaded. The immediate in the add we're looking for,
+ // however, is not, so adjust here.
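+ // (Illustration: for the ldr x1, [x0, #64] above, the operand immediate is
+ // 8 and the access size is 8 bytes, so Value becomes 64.)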
+ int Value = MI->getOperand(2).getImm() *
+ TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
+ ->getSize();
+ Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
+ if (Update != E) {
+ // Merge the update into the ld/st.
+ MBBI = mergePreIdxUpdateInsn(MBBI, Update);
+ Modified = true;
+ ++NumPreFolded;
+ break;
+ }
+
+ // Nothing found. Just move to the next instruction.
+ ++MBBI;
+ break;
+ }
+ // FIXME: Do the other instructions.
+ }
+ }
+
+ return Modified;
+}
+
+bool ARM64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ // Early exit if pass disabled.
+ if (!DoLoadStoreOpt)
+ return false;
+
+ const TargetMachine &TM = Fn.getTarget();
+ TII = static_cast<const ARM64InstrInfo *>(TM.getInstrInfo());
+ TRI = TM.getRegisterInfo();
+
+ bool Modified = false;
+ for (auto &MBB : Fn)
+ Modified |= optimizeBlock(MBB);
+
+ return Modified;
+}
+
+// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
+// loads and stores near one another?
+
+/// createARM64LoadStoreOptimizationPass - returns an instance of the
+/// load / store optimization pass.
+FunctionPass *llvm::createARM64LoadStoreOptimizationPass() {
+ return new ARM64LoadStoreOpt();
+}
diff --git a/lib/Target/ARM64/ARM64MCInstLower.cpp b/lib/Target/ARM64/ARM64MCInstLower.cpp
new file mode 100644
index 0000000..01dc229
--- /dev/null
+++ b/lib/Target/ARM64/ARM64MCInstLower.cpp
@@ -0,0 +1,201 @@
+//===-- ARM64MCInstLower.cpp - Convert ARM64 MachineInstr to an MCInst---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower ARM64 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM64MCInstLower.h"
+#include "MCTargetDesc/ARM64BaseInfo.h"
+#include "MCTargetDesc/ARM64MCExpr.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+ARM64MCInstLower::ARM64MCInstLower(MCContext &ctx, Mangler &mang,
+ AsmPrinter &printer)
+ : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {}
+
+MCSymbol *
+ARM64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+ return Printer.getSymbol(MO.getGlobal());
+}
+
+MCSymbol *
+ARM64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+ return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCOperand ARM64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+ if ((MO.getTargetFlags() & ARM64II::MO_GOT) != 0) {
+ if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE)
+ RefKind = MCSymbolRefExpr::VK_GOTPAGE;
+ else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) ==
+ ARM64II::MO_PAGEOFF)
+ RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF;
+ else
+ assert(0 && "Unexpected target flags with MO_GOT on GV operand");
+ } else if ((MO.getTargetFlags() & ARM64II::MO_TLS) != 0) {
+ if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE)
+ RefKind = MCSymbolRefExpr::VK_TLVPPAGE;
+ else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) ==
+ ARM64II::MO_PAGEOFF)
+ RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF;
+ else
+ llvm_unreachable("Unexpected target flags with MO_TLS on GV operand");
+ } else {
+ if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE)
+ RefKind = MCSymbolRefExpr::VK_PAGE;
+ else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) ==
+ ARM64II::MO_PAGEOFF)
+ RefKind = MCSymbolRefExpr::VK_PAGEOFF;
+ }
+ const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
+ if (!MO.isJTI() && MO.getOffset())
+ Expr = MCBinaryExpr::CreateAdd(
+ Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
+ return MCOperand::CreateExpr(Expr);
+}
+
+MCOperand ARM64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ uint32_t RefFlags = 0;
+
+ if (MO.getTargetFlags() & ARM64II::MO_GOT)
+ RefFlags |= ARM64MCExpr::VK_GOT;
+ else if (MO.getTargetFlags() & ARM64II::MO_TLS) {
+ TLSModel::Model Model;
+ if (MO.isGlobal()) {
+ const GlobalValue *GV = MO.getGlobal();
+ Model = Printer.TM.getTLSModel(GV);
+ } else {
+ assert(MO.isSymbol() &&
+ StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" &&
+ "unexpected external TLS symbol");
+ Model = TLSModel::GeneralDynamic;
+ }
+ switch (Model) {
+ case TLSModel::InitialExec:
+ RefFlags |= ARM64MCExpr::VK_GOTTPREL;
+ break;
+ case TLSModel::LocalExec:
+ RefFlags |= ARM64MCExpr::VK_TPREL;
+ break;
+ case TLSModel::LocalDynamic:
+ RefFlags |= ARM64MCExpr::VK_DTPREL;
+ break;
+ case TLSModel::GeneralDynamic:
+ RefFlags |= ARM64MCExpr::VK_TLSDESC;
+ break;
+ }
+ } else {
+ // No modifier means this is a generic reference, classified as absolute for
+ // the cases where it matters (:abs_g0: etc).
+ RefFlags |= ARM64MCExpr::VK_ABS;
+ }
+
+ if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE)
+ RefFlags |= ARM64MCExpr::VK_PAGE;
+ else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGEOFF)
+ RefFlags |= ARM64MCExpr::VK_PAGEOFF;
+ else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G3)
+ RefFlags |= ARM64MCExpr::VK_G3;
+ else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G2)
+ RefFlags |= ARM64MCExpr::VK_G2;
+ else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G1)
+ RefFlags |= ARM64MCExpr::VK_G1;
+ else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G0)
+ RefFlags |= ARM64MCExpr::VK_G0;
+
+ if (MO.getTargetFlags() & ARM64II::MO_NC)
+ RefFlags |= ARM64MCExpr::VK_NC;
+
+ const MCExpr *Expr =
+ MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+ if (!MO.isJTI() && MO.getOffset())
+ Expr = MCBinaryExpr::CreateAdd(
+ Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
+
+ ARM64MCExpr::VariantKind RefKind;
+ RefKind = static_cast<ARM64MCExpr::VariantKind>(RefFlags);
+ Expr = ARM64MCExpr::Create(Expr, RefKind, Ctx);
+
+ return MCOperand::CreateExpr(Expr);
+}
+
+MCOperand ARM64MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ if (TargetTriple.isOSDarwin())
+ return lowerSymbolOperandDarwin(MO, Sym);
+
+ assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target");
+ return lowerSymbolOperandELF(MO, Sym);
+}
+
+bool ARM64MCInstLower::lowerOperand(const MachineOperand &MO,
+ MCOperand &MCOp) const {
+ switch (MO.getType()) {
+ default:
+ assert(0 && "unknown operand type");
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
+ return false;
+ MCOp = MCOperand::CreateReg(MO.getReg());
+ break;
+ case MachineOperand::MO_RegisterMask:
+ // Regmasks are like implicit defs.
+ return false;
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::CreateImm(MO.getImm());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = MCOperand::CreateExpr(
+ MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx));
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));
+ break;
+ case MachineOperand::MO_BlockAddress:
+ MCOp = LowerSymbolOperand(
+ MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress()));
+ break;
+ }
+ return true;
+}
+
+void ARM64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MCOperand MCOp;
+ if (lowerOperand(MI->getOperand(i), MCOp))
+ OutMI.addOperand(MCOp);
+ }
+}
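
The entry points above are driven from the target AsmPrinter as each MachineInstr is emitted. A minimal sketch of such a call site follows; the free helper is hypothetical, and the use of the public OutContext and Mang members is an assumption based on the 3.5-era AsmPrinter interface rather than on this file.

    // Hypothetical helper (sketch): lower one MachineInstr via the class above.
    static MCInst lowerToMCInst(const MachineInstr *MI, AsmPrinter &Printer) {
      ARM64MCInstLower MCInstLowering(Printer.OutContext, *Printer.Mang, Printer);
      MCInst TmpInst;
      MCInstLowering.Lower(MI, TmpInst); // each operand goes through lowerOperand()
      return TmpInst;
    }

The target AsmPrinter would typically call something like this from its EmitInstruction override and hand the resulting MCInst to the output streamer.
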
diff --git a/lib/Target/ARM64/ARM64MCInstLower.h b/lib/Target/ARM64/ARM64MCInstLower.h
new file mode 100644
index 0000000..7e3a2c8
--- /dev/null
+++ b/lib/Target/ARM64/ARM64MCInstLower.h
@@ -0,0 +1,52 @@
+//===-- ARM64MCInstLower.h - Lower MachineInstr to MCInst ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64_MCINSTLOWER_H
+#define ARM64_MCINSTLOWER_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCAsmInfo;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+class MachineInstr;
+class MachineModuleInfoMachO;
+class MachineOperand;
+class Mangler;
+
+/// ARM64MCInstLower - This class is used to lower a MachineInstr
+/// into an MCInst.
+class LLVM_LIBRARY_VISIBILITY ARM64MCInstLower {
+ MCContext &Ctx;
+ AsmPrinter &Printer;
+ Triple TargetTriple;
+
+public:
+ ARM64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer);
+
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO,
+ MCSymbol *Sym) const;
+ MCOperand lowerSymbolOperandELF(const MachineOperand &MO,
+ MCSymbol *Sym) const;
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+ MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+};
+}
+
+#endif
diff --git a/lib/Target/ARM64/ARM64MachineFunctionInfo.h b/lib/Target/ARM64/ARM64MachineFunctionInfo.h
new file mode 100644
index 0000000..02bf7cf
--- /dev/null
+++ b/lib/Target/ARM64/ARM64MachineFunctionInfo.h
@@ -0,0 +1,139 @@
+//===- ARM64MachineFunctionInfo.h - ARM64 machine function info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares ARM64-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64MACHINEFUNCTIONINFO_H
+#define ARM64MACHINEFUNCTIONINFO_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
+
+namespace llvm {
+
+/// ARM64FunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private ARM64-specific information for each MachineFunction.
+class ARM64FunctionInfo : public MachineFunctionInfo {
+
+ /// HasStackFrame - True if this function has a stack frame. Set by
+ /// processFunctionBeforeCalleeSavedScan().
+ bool HasStackFrame;
+
+ /// \brief Size of the local stack frame, not including callee-saved registers.
+ unsigned LocalStackSize;
+
+ /// \brief Number of TLS accesses using the special (combinable)
+ /// _TLS_MODULE_BASE_ symbol.
+ unsigned NumLocalDynamicTLSAccesses;
+
+ /// \brief FrameIndex for start of varargs area for arguments passed on the
+ /// stack.
+ int VarArgsStackIndex;
+
+ /// \brief FrameIndex for start of varargs area for arguments passed in
+ /// general purpose registers.
+ int VarArgsGPRIndex;
+
+ /// \brief Size of the varargs area for arguments passed in general purpose
+ /// registers.
+ unsigned VarArgsGPRSize;
+
+ /// \brief FrameIndex for start of varargs area for arguments passed in
+ /// floating-point registers.
+ int VarArgsFPRIndex;
+
+ /// \brief Size of the varargs area for arguments passed in floating-point
+ /// registers.
+ unsigned VarArgsFPRSize;
+
+public:
+ ARM64FunctionInfo()
+ : HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
+ VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
+ VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
+
+ explicit ARM64FunctionInfo(MachineFunction &MF)
+ : HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
+ VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
+ VarArgsFPRIndex(0), VarArgsFPRSize(0) {
+ (void)MF;
+ }
+
+ bool hasStackFrame() const { return HasStackFrame; }
+ void setHasStackFrame(bool s) { HasStackFrame = s; }
+
+ void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
+ unsigned getLocalStackSize() const { return LocalStackSize; }
+
+ void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
+ unsigned getNumLocalDynamicTLSAccesses() const {
+ return NumLocalDynamicTLSAccesses;
+ }
+
+ int getVarArgsStackIndex() const { return VarArgsStackIndex; }
+ void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; }
+
+ int getVarArgsGPRIndex() const { return VarArgsGPRIndex; }
+ void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; }
+
+ unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; }
+ void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; }
+
+ int getVarArgsFPRIndex() const { return VarArgsFPRIndex; }
+ void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; }
+
+ unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; }
+ void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; }
+
+ typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions;
+
+ const SetOfInstructions &getLOHRelated() const { return LOHRelated; }
+
+ // Shortcuts for LOH related types.
+ class MILOHDirective {
+ MCLOHType Kind;
+
+ /// Arguments of this directive. Order matters.
+ SmallVector<const MachineInstr *, 3> Args;
+
+ public:
+ typedef SmallVectorImpl<const MachineInstr *> LOHArgs;
+
+ MILOHDirective(MCLOHType Kind, const LOHArgs &Args)
+ : Kind(Kind), Args(Args.begin(), Args.end()) {
+ assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!");
+ }
+
+ MCLOHType getKind() const { return Kind; }
+ const LOHArgs &getArgs() const { return Args; }
+ };
+
+ typedef MILOHDirective::LOHArgs MILOHArgs;
+ typedef SmallVector<MILOHDirective, 32> MILOHContainer;
+
+ const MILOHContainer &getLOHContainer() const { return LOHContainerSet; }
+
+ /// Add a LOH directive of kind \p Kind with arguments \p Args.
+ void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) {
+ LOHContainerSet.push_back(MILOHDirective(Kind, Args));
+ LOHRelated.insert(Args.begin(), Args.end());
+ }
+
+private:
+ // Hold the lists of LOHs.
+ MILOHContainer LOHContainerSet;
+ SetOfInstructions LOHRelated;
+};
+} // End llvm namespace
+
+#endif // ARM64MACHINEFUNCTIONINFO_H
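
The LOH bookkeeping above is populated by a collector pass and later replayed by the AsmPrinter as .loh directives. A minimal usage sketch of addLOHDirective follows; the free function, its arguments, and the choice of MCLOH_AdrpAdd are hypothetical, shown only to illustrate the interface.

    // Hypothetical: record an ADRP/ADD pair for a later .loh AdrpAdd directive.
    static void recordAdrpAddPair(MachineFunction &MF, const MachineInstr *Adrp,
                                  const MachineInstr *Add) {
      ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
      SmallVector<const MachineInstr *, 2> Args;
      Args.push_back(Adrp);
      Args.push_back(Add);
      AFI->addLOHDirective(MCLOH_AdrpAdd, Args); // stored in LOHContainerSet/LOHRelated
    }
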
diff --git a/lib/Target/ARM64/ARM64PerfectShuffle.h b/lib/Target/ARM64/ARM64PerfectShuffle.h
new file mode 100644
index 0000000..6759236
--- /dev/null
+++ b/lib/Target/ARM64/ARM64PerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- ARM64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using AdvSIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
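+
+// A minimal decoding sketch (an assumption about the consumer, which lives in
+// the target's ISelLowering rather than in this header): each of the four
+// shuffle mask elements is reduced to a digit in [0,8], the digits form a
+// base-9 index, and the low bits of an entry identify the operation and the
+// shuffles feeding its two operands:
+//   unsigned Idx   = I0*9*9*9 + I1*9*9 + I2*9 + I3;   // 0 .. 6560
+//   unsigned Entry = PerfectShuffleTable[Idx];
+//   unsigned OpNum = (Entry >> 26) & 0xF;             // which AdvSIMD op
+//   unsigned LHSID = (Entry >> 13) & 0x1FFF;          // operand shuffle ids
+//   unsigned RHSID = Entry & 0x1FFF;
+// The remaining top bits carry the cost summarized above.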
+static const unsigned PerfectShuffleTable[6561+1] = {
+ 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+ 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+ 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+ 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+ 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+ 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+ 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+ 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+ 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+ 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+ 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+ 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+ 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+ 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+ 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+ 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+ 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+ 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+ 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+ 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+ 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+ 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+ 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+ 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+ 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+ 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+ 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+ 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+ 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+ 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+ 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+ 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+ 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+ 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+ 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+ 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+ 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+ 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+ 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+ 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+ 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+ 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+ 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+ 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+ 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+ 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+ 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+ 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+ 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+ 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+ 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+ 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+ 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+ 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+ 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+ 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+ 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+ 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+ 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+ 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+ 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+ 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+ 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+ 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+ 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+ 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+ 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+ 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+ 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+ 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+ 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+ 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+ 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+ 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+ 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+ 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+ 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+ 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+ 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+ 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+ 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+ 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+ 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+ 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+ 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+ 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+ 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+ 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+ 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+ 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+ 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+ 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+ 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+ 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+ 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+ 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+ 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+ 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+ 835584U, // <0,1,2,3>: Cost 0 copy LHS
+ 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+ 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+ 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+ 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+ 835584U, // <0,1,2,u>: Cost 0 copy LHS
+ 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+ 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+ 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+ 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+ 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+ 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+ 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+ 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+ 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+ 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+ 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+ 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+ 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+ 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+ 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+ 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+ 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+ 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+ 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+ 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+ 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+ 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+ 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+ 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+ 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+ 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+ 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+ 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+ 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+ 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+ 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+ 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+ 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+ 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+ 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+ 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+ 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+ 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+ 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+ 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+ 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+ 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+ 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+ 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+ 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+ 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+ 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+ 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+ 835584U, // <0,1,u,3>: Cost 0 copy LHS
+ 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+ 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+ 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+ 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+ 835584U, // <0,1,u,u>: Cost 0 copy LHS
+ 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+ 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+ 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+ 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+ 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+ 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+ 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+ 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+ 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+ 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+ 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+ 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+ 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+ 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+ 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+ 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+ 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+ 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+ 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+ 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+ 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+ 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+ 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+ 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+ 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+ 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+ 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+ 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+ 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+ 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+ 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+ 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+ 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+ 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+ 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+ 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+ 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+ 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+ 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+ 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+ 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+ 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+ 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+ 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+ 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+ 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+ 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+ 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+ 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+ 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+ 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+ 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+ 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+ 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+ 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+ 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+ 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+ 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+ 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+ 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+ 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+ 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+ 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+ 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+ 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+ 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+ 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+ 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+ 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+ 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+ 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+ 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+ 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+ 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+ 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+ 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+ 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+ 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+ 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+ 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+ 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+ 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+ 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+ 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+ 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+ 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+ 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+ 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+ 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+ 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+ 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+ 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+ 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+ 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+ 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+ 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+ 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+ 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+ 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+ 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+ 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+ 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+ 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+ 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+ 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+ 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+ 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+ 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+ 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+ 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+ 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+ 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+ 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+ 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+ 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+ 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+ 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+ 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+ 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+ 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+ 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+ 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+ 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+ 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+ 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+ 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+ 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+ 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+ 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+ 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+ 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+ 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+ 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+ 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+ 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+ 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+ 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+ 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+ 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+ 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+ 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+ 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+ 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+ 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+ 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+ 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+ 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+ 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+ 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+ 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+ 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+ 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+ 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+ 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+ 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+ 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+ 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+ 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+ 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+ 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+ 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+ 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+ 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+ 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+ 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+ 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+ 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+ 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+ 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+ 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+ 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+ 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+ 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+ 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+ 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+ 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+ 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+ 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+ 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+ 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+ 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+ 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+ 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+ 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+ 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+ 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+ 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+ 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+ 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+ 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+ 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+ 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+ 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+ 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+ 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+ 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+ 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+ 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+ 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+ 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+ 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+ 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+ 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+ 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+ 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+ 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+ 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+ 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+ 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+ 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+ 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+ 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+ 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+ 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+ 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+ 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+ 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+ 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+ 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+ 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+ 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+ 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+ 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+ 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+ 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+ 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+ 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+ 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+ 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+ 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+ 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+ 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+ 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+ 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+ 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+ 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+ 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+ 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+ 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+ 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+ 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+ 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+ 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+ 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+ 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+ 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+ 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+ 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+ 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+ 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+ 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+ 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+ 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+ 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+ 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+ 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+ 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+ 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+ 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+ 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+ 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+ 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+ 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+ 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+ 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+ 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+ 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+ 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+ 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+ 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+ 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+ 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+ 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+ 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+ 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+ 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+ 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+ 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+ 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+ 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+ 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+ 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+ 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+ 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+ 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+ 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+ 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+ 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+ 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+ 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+ 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+ 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+ 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+ 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+ 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+ 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+ 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+ 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+ 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+ 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+ 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+ 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+ 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+ 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+ 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+ 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+ 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+ 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+ 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+ 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+ 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+ 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+ 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+ 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+ 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+ 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+ 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+ 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+ 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+ 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+ 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+ 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+ 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+ 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+ 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+ 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+ 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+ 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+ 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+ 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+ 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+ 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+ 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+ 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+ 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+ 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+ 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+ 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+ 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+ 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+ 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+ 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+ 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+ 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+ 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+ 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+ 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+ 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+ 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+ 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+ 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+ 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+ 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+ 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+ 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+ 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+ 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+ 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+ 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+ 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+ 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+ 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+ 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+ 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+ 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+ 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+ 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+ 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+ 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+ 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+ 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+ 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+ 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+ 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+ 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+ 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+ 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+ 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+ 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+ 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+ 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+ 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+ 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+ 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+ 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+ 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+ 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+ 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+ 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+ 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+ 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+ 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+ 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+ 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+ 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+ 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+ 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+ 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+ 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+ 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+ 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+ 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+ 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+ 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+ 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+ 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+ 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+ 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+ 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+ 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+ 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+ 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+ 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+ 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+ 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+ 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+ 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+ 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+ 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+ 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+ 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+ 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+ 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+ 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+ 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+ 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+ 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+ 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+ 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+ 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+ 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+ 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+ 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+ 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+ 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+ 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+ 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+ 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+ 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+ 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+ 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+ 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+ 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+ 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+ 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+ 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+ 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+ 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+ 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+ 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+ 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+ 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+ 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+ 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+ 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+ 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+ 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+ 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+ 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+ 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+ 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+ 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+ 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+ 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+ 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+ 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+ 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+ 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+ 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+ 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+ 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+ 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+ 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+ 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+ 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+ 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+ 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+ 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+ 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+ 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+ 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+ 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+ 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+ 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+ 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+ 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+ 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+ 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+ 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+ 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+ 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+ 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+ 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+ 835584U, // <0,u,2,3>: Cost 0 copy LHS
+ 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+ 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+ 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+ 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+ 835584U, // <0,u,2,u>: Cost 0 copy LHS
+ 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+ 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+ 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+ 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+ 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+ 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+ 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+ 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+ 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+ 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+ 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+ 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+ 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+ 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+ 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+ 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+ 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+ 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+ 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+ 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+ 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+ 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+ 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+ 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+ 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+ 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+ 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+ 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+ 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+ 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+ 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+ 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+ 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+ 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+ 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+ 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+ 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+ 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+ 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+ 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+ 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+ 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+ 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+ 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+ 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+ 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+ 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+ 835584U, // <0,u,u,3>: Cost 0 copy LHS
+ 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+ 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+ 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+ 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+ 835584U, // <0,u,u,u>: Cost 0 copy LHS
+ 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+ 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+ 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+ 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+ 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+ 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+ 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+ 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+ 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+ 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+ 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+ 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+ 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+ 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+ 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+ 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+ 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+ 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+ 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+ 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+ 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+ 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+ 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+ 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+ 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+ 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+ 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+ 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+ 67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+ 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+ 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+ 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+ 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+ 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+ 68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+ 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+ 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+ 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+ 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+ 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+ 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+ 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+ 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+ 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+ 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+ 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+ 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+ 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+ 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+ 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+ 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+ 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+ 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+ 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+ 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+ 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+ 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+ 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+ 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+ 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+ 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+ 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+ 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+ 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+ 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+ 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+ 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+ 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+ 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+ 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+ 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+ 67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+ 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+ 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+ 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+ 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+ 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+ 68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+ 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+ 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+ 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+ 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+ 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+ 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+ 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+ 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+ 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+ 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+ 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+ 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+ 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+ 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+ 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+ 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+ 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+ 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+ 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+ 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+ 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+ 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+ 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+ 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+ 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+ 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+ 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+ 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+ 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+ 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+ 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+ 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+ 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+ 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+ 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+ 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+ 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+ 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+ 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+ 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+ 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+ 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+ 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+ 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+ 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+ 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+ 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+ 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+ 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+ 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+ 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+ 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+ 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+ 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+ 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+ 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+ 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+ 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+ 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+ 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+ 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+ 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+ 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+ 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+ 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+ 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+ 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+ 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+ 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+ 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+ 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+ 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+ 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+ 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+ 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+ 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+ 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+ 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+ 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+ 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+ 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+ 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+ 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+ 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+ 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+ 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+ 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+ 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+ 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+ 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+ 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+ 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+ 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+ 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+ 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+ 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+ 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+ 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+ 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+ 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+ 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+ 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+ 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+ 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+ 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+ 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+ 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+ 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+ 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+ 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+ 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+ 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+ 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+ 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+ 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+ 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+ 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+ 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+ 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+ 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+ 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+ 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+ 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+ 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+ 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+ 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+ 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+ 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+ 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+ 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+ 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+ 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+ 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+ 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+ 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+ 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+ 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+ 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+ 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+ 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+ 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+ 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+ 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+ 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+ 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+ 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+ 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+ 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+ 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+ 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+ 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+ 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+ 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+ 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+ 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+ 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+ 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+ 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+ 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+ 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+ 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+ 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+ 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+ 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+ 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+ 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+ 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+ 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+ 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+ 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+ 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+ 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+ 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+ 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+ 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+ 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+ 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+ 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+ 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+ 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+ 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+ 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+ 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+ 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+ 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+ 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+ 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+ 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+ 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+ 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+ 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+ 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+ 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+ 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+ 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+ 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+ 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+ 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+ 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+ 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+ 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+ 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+ 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+ 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+ 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+ 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+ 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+ 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+ 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+ 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+ 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+ 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+ 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+ 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+ 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+ 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+ 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+ 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+ 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+ 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+ 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+ 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+ 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+ 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+ 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+ 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+ 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+ 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+ 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+ 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+ 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+ 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+ 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+ 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+ 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+ 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+ 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+ 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+ 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+ 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+ 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+ 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+ 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+ 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+ 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+ 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+ 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+ 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+ 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+ 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+ 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+ 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+ 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+ 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+ 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+ 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+ 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+ 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+ 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+ 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+ 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+ 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+ 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+ 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+ 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+ 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+ 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+ 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+ 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+ 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+ 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+ 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+ 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+ 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+ 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+ 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+ 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+ 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+ 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+ 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+ 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+ 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+ 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+ 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+ 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+ 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+ 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+ 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+ 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+ 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+ 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+ 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+ 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+ 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+ 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+ 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+ 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+ 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+ 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+ 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+ 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+ 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+ 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+ 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+ 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+ 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+ 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+ 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+ 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+ 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+ 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+ 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+ 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+ 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+ 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+ 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+ 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+ 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+ 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+ 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+ 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+ 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+ 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+ 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+ 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+ 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+ 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+ 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+ 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+ 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+ 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+ 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+ 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+ 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+ 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+ 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+ 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+ 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+ 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+ 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+ 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+ 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+ 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+ 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+ 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+ 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+ 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+ 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+ 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+ 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+ 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+ 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+ 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+ 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+ 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+ 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+ 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+ 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+ 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+ 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+ 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+ 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+ 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+ 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+ 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+ 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+ 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+ 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+ 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+ 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+ 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+ 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+ 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+ 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+ 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+ 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+ 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+ 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+ 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+ 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+ 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+ 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+ 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+ 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+ 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+ 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+ 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+ 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+ 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+ 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+ 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+ 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+ 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+ 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+ 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+ 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+ 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+ 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+ 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+ 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+ 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+ 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+ 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+ 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+ 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+ 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+ 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+ 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+ 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+ 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+ 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+ 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+ 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+ 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+ 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+ 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+ 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+ 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+ 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+ 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+ 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+ 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+ 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+ 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+ 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+ 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+ 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+ 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+ 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+ 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+ 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+ 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+ 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+ 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+ 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+ 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+ 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+ 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+ 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+ 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+ 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+ 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+ 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+ 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+ 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+ 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+ 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+ 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+ 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+ 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+ 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+ 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+ 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+ 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+ 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+ 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+ 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+ 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+ 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+ 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+ 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+ 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+ 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+ 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+ 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+ 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+ 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+ 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+ 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+ 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+ 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+ 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+ 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+ 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+ 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+ 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+ 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+ 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+ 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+ 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+ 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+ 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+ 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+ 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+ 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+ 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+ 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+ 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+ 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+ 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+ 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+ 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+ 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+ 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+ 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+ 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+ 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+ 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+ 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+ 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+ 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+ 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+ 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+ 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+ 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+ 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+ 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+ 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+ 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+ 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+ 115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+ 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+ 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+ 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+ 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+ 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+ 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+ 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+ 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+ 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+ 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+ 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+ 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+ 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+ 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+ 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+ 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+ 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+ 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+ 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+ 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+ 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+ 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+ 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+ 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+ 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+ 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+ 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+ 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+ 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+ 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+ 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+ 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+ 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+ 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+ 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+ 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+ 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+ 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+ 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+ 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+ 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+ 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+ 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+ 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+ 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+ 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+ 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+ 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+ 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+ 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+ 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+ 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+ 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+ 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+ 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+ 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+ 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+ 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+ 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+ 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+ 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+ 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+ 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+ 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+ 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+ 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+ 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+ 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+ 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+ 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+ 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+ 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+ 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+ 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+ 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+ 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+ 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+ 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+ 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+ 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+ 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+ 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+ 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+ 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+ 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+ 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+ 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+ 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+ 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+ 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+ 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+ 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+ 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+ 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+ 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+ 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+ 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+ 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+ 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+ 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+ 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+ 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+ 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+ 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+ 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+ 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+ 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+ 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+ 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+ 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+ 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+ 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+ 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+ 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+ 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+ 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+ 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+ 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+ 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+ 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+ 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+ 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+ 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+ 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+ 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+ 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+ 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+ 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+ 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+ 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+ 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+ 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+ 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+ 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+ 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+ 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+ 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+ 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+ 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+ 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+ 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+ 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+ 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+ 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+ 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+ 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+ 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+ 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+ 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+ 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+ 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+ 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+ 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+ 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+ 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+ 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+ 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+ 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+ 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+ 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+ 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+ 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+ 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+ 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+ 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+ 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+ 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+ 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+ 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+ 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+ 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+ 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+ 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+ 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+ 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+ 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+ 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+ 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+ 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+ 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+ 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+ 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+ 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+ 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+ 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+ 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+ 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+ 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+ 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+ 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+ 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+ 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+ 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+ 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+ 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+ 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+ 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+ 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+ 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+ 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+ 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+ 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+ 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+ 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+ 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+ 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+ 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+ 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+ 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+ 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+ 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+ 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+ 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+ 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+ 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+ 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+ 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+ 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+ 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+ 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+ 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+ 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+ 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+ 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+ 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+ 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+ 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+ 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+ 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+ 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+ 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+ 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+ 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+ 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+ 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+ 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+ 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+ 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+ 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+ 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+ 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+ 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+ 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+ 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+ 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+ 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+ 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+ 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+ 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+ 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+ 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+ 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+ 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+ 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+ 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+ 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+ 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+ 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+ 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+ 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+ 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+ 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+ 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+ 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+ 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+ 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+ 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+ 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+ 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+ 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+ 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+ 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+ 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+ 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+ 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+ 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+ 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+ 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+ 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+ 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+ 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+ 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+ 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+ 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+ 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+ 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+ 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+ 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+ 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+ 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+ 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+ 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+ 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+ 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+ 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+ 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+ 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+ 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+ 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+ 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+ 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+ 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+ 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+ 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+ 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+ 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+ 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+ 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+ 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+ 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+ 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+ 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+ 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+ 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+ 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+ 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+ 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+ 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+ 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+ 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+ 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+ 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+ 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+ 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+ 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+ 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+ 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+ 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+ 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+ 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+ 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+ 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+ 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+ 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+ 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+ 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+ 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+ 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+ 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+ 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+ 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+ 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+ 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+ 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+ 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+ 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+ 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+ 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+ 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+ 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+ 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+ 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+ 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+ 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+ 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+ 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+ 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+ 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+ 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+ 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+ 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+ 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+ 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+ 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+ 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+ 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+ 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+ 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+ 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+ 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+ 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+ 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+ 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+ 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+ 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+ 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+ 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+ 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+ 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+ 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+ 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+ 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+ 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+ 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+ 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+ 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+ 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+ 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+ 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+ 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+ 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+ 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+ 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+ 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+ 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+ 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+ 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+ 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+ 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+ 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+ 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+ 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+ 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+ 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+ 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+ 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+ 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+ 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+ 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+ 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+ 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+ 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+ 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+ 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+ 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+ 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+ 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+ 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+ 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+ 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+ 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+ 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+ 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+ 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+ 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+ 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+ 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+ 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+ 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+ 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+ 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+ 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+ 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+ 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+ 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+ 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+ 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+ 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+ 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+ 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+ 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+ 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+ 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+ 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+ 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+ 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+ 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+ 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+ 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+ 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+ 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+ 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+ 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+ 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+ 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+ 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+ 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+ 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+ 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+ 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+ 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+ 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+ 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+ 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+ 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+ 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+ 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+ 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+ 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+ 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+ 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+ 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+ 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+ 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+ 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+ 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+ 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+ 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+ 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+ 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+ 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+ 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+ 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+ 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+ 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+ 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+ 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+ 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+ 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+ 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+ 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+ 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+ 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+ 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+ 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+ 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+ 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+ 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+ 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+ 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+ 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+ 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+ 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+ 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+ 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+ 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+ 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+ 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+ 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+ 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+ 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+ 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+ 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+ 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+ 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+ 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+ 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+ 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+ 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+ 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+ 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+ 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+ 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+ 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+ 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+ 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+ 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+ 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+ 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+ 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+ 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+ 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+ 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+ 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+ 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+ 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+ 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+ 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+ 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+ 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+ 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+ 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+ 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+ 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+ 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+ 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+ 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+ 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+ 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+ 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+ 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+ 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+ 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+ 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+ 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+ 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+ 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+ 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+ 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+ 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+ 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+ 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+ 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+ 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+ 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+ 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+ 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+ 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+ 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+ 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+ 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+ 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+ 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+ 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+ 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+ 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+ 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+ 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+ 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+ 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+ 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+ 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+ 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+ 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+ 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+ 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+ 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+ 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+ 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+ 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+ 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+ 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+ 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+ 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+ 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+ 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+ 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+ 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+ 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+ 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+ 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+ 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+ 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+ 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+ 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+ 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+ 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+ 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+ 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+ 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+ 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+ 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+ 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+ 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+ 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+ 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+ 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+ 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+ 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+ 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+ 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+ 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+ 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+ 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+ 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+ 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+ 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+ 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+ 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+ 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+ 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+ 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+ 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+ 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+ 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+ 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+ 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+ 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+ 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+ 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+ 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+ 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+ 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+ 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+ 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+ 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+ 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+ 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+ 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+ 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+ 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+ 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+ 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+ 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+ 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+ 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+ 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+ 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+ 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+ 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+ 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+ 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+ 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+ 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+ 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+ 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+ 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+ 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+ 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+ 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+ 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+ 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+ 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+ 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+ 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+ 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+ 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+ 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+ 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+ 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+ 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+ 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+ 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+ 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+ 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+ 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+ 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+ 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+ 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+ 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+ 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+ 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+ 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+ 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+ 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+ 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+ 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+ 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+ 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+ 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+ 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+ 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+ 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+ 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+ 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+ 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+ 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+ 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+ 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+ 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+ 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+ 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+ 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+ 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+ 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+ 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+ 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+ 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+ 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+ 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+ 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+ 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+ 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+ 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+ 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+ 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+ 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+ 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+ 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+ 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+ 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+ 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+ 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+ 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+ 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+ 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+ 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+ 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+ 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+ 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+ 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+ 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+ 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+ 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+ 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+ 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+ 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+ 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+ 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+ 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+ 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+ 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+ 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+ 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+ 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+ 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+ 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+ 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+ 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+ 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+ 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+ 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+ 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+ 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+ 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+ 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+ 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+ 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+ 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+ 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+ 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+ 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+ 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+ 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+ 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+ 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+ 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+ 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+ 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+ 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+ 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+ 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+ 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+ 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+ 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+ 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+ 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+ 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+ 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+ 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+ 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+ 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+ 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+ 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+ 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+ 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+ 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+ 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+ 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+ 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+ 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+ 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+ 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+ 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+ 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+ 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+ 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+ 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+ 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+ 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+ 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+ 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+ 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+ 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+ 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+ 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+ 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+ 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+ 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+ 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+ 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+ 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+ 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+ 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+ 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+ 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+ 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+ 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+ 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+ 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+ 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+ 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+ 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+ 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+ 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+ 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+ 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+ 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+ 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+ 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+ 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+ 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+ 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+ 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+ 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+ 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+ 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+ 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+ 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+ 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+ 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+ 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+ 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+ 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+ 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+ 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+ 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+ 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+ 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+ 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+ 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+ 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+ 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+ 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+ 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+ 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+ 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+ 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+ 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+ 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+ 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+ 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+ 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+ 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+ 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+ 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+ 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+ 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+ 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+ 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+ 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+ 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+ 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+ 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+ 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+ 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+ 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+ 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+ 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+ 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+ 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+ 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+ 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+ 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+ 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+ 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+ 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+ 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+ 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+ 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+ 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+ 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+ 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+ 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+ 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+ 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+ 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+ 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+ 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+ 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+ 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+ 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+ 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+ 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+ 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+ 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+ 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+ 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+ 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+ 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+ 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+ 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+ 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+ 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+ 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+ 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+ 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+ 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+ 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+ 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+ 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+ 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+ 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+ 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+ 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+ 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+ 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+ 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+ 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+ 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+ 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+ 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+ 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+ 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+ 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+ 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+ 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+ 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+ 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+ 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+ 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+ 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+ 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+ 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+ 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+ 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+ 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+ 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+ 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+ 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+ 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+ 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+ 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+ 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+ 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+ 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+ 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+ 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+ 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+ 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+ 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+ 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+ 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+ 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+ 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+ 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+ 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+ 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+ 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+ 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+ 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+ 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+ 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+ 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+ 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+ 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+ 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+ 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+ 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+ 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+ 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+ 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+ 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+ 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+ 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+ 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+ 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+ 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+ 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+ 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+ 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+ 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+ 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+ 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+ 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+ 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+ 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+ 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+ 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+ 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+ 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+ 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+ 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+ 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+ 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+ 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+ 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+ 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+ 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+ 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+ 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+ 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+ 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+ 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+ 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+ 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+ 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+ 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+ 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+ 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+ 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+ 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+ 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+ 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+ 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+ 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+ 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+ 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+ 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+ 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+ 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+ 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+ 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+ 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+ 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+ 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+ 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+ 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+ 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+ 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+ 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+ 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+ 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+ 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+ 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+ 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+ 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+ 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+ 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+ 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+ 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+ 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+ 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+ 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+ 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+ 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+ 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+ 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+ 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+ 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+ 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+ 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+ 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+ 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+ 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+ 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+ 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+ 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+ 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+ 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+ 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+ 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+ 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+ 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+ 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+ 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+ 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+ 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+ 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+ 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+ 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+ 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+ 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+ 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+ 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+ 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+ 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+ 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+ 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+ 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+ 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+ 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+ 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+ 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+ 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+ 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+ 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+ 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+ 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+ 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+ 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+ 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+ 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+ 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+ 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+ 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+ 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+ 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+ 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+ 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+ 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+ 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+ 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+ 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+ 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+ 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+ 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+ 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+ 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+ 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+ 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+ 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+ 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+ 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+ 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+ 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+ 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+ 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+ 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+ 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+ 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+ 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+ 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+ 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+ 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+ 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+ 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+ 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+ 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+ 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+ 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+ 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+ 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+ 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+ 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+ 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+ 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+ 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+ 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+ 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+ 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+ 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+ 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+ 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+ 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+ 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+ 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+ 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+ 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+ 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+ 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+ 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+ 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+ 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+ 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+ 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+ 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+ 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+ 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+ 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+ 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+ 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+ 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+ 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+ 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+ 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+ 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+ 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+ 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+ 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+ 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+ 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+ 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+ 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+ 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+ 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+ 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+ 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+ 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+ 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+ 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+ 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+ 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+ 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+ 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+ 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+ 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+ 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+ 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+ 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+ 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+ 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+ 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+ 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+ 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+ 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+ 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+ 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+ 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+ 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+ 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+ 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+ 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+ 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+ 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+ 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+ 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+ 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+ 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+ 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+ 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+ 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+ 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+ 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+ 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+ 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+ 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+ 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+ 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+ 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+ 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+ 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+ 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+ 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+ 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+ 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+ 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+ 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+ 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+ 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+ 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+ 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+ 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+ 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+ 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+ 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+ 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+ 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+ 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+ 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+ 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+ 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+ 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+ 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+ 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+ 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+ 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+ 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+ 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+ 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+ 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+ 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+ 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+ 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+ 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+ 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+ 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+ 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+ 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+ 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+ 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+ 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+ 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+ 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+ 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+ 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+ 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+ 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+ 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+ 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+ 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+ 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+ 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+ 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+ 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+ 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+ 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+ 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+ 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+ 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+ 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+ 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+ 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+ 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+ 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+ 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+ 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+ 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+ 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+ 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+ 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+ 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+ 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+ 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+ 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+ 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+ 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+ 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+ 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+ 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+ 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+ 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+ 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+ 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+ 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+ 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+ 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+ 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+ 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+ 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+ 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+ 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+ 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+ 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+ 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+ 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+ 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+ 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+ 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+ 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+ 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+ 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+ 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+ 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+ 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+ 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+ 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+ 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+ 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+ 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+ 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+ 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+ 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+ 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+ 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+ 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+ 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+ 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+ 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+ 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+ 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+ 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+ 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+ 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+ 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+ 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+ 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+ 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+ 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+ 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+ 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+ 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+ 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+ 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+ 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+ 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+ 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+ 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+ 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+ 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+ 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+ 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+ 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+ 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+ 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+ 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+ 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+ 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+ 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+ 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+ 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+ 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+ 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+ 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+ 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+ 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+ 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+ 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+ 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+ 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+ 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+ 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+ 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+ 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+ 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+ 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+ 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+ 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+ 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+ 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+ 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+ 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+ 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+ 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+ 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+ 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+ 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+ 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+ 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+ 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+ 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+ 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+ 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+ 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+ 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+ 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+ 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+ 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+ 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+ 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+ 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+ 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+ 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+ 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+ 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+ 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+ 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+ 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+ 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+ 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+ 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+ 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+ 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+ 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+ 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+ 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+ 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+ 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+ 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+ 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+ 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+ 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+ 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+ 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+ 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+ 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+ 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+ 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+ 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+ 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+ 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+ 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+ 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+ 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+ 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+ 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+ 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+ 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+ 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+ 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+ 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+ 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+ 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+ 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+ 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+ 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+ 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+ 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+ 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+ 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+ 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+ 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+ 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+ 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+ 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+ 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+ 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+ 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+ 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+ 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+ 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+ 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+ 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+ 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+ 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+ 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+ 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+ 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+ 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+ 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+ 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+ 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+ 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+ 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+ 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+ 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+ 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+ 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+ 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+ 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+ 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+ 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+ 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+ 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+ 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+ 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+ 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+ 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+ 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+ 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+ 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+ 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+ 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+ 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+ 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+ 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+ 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+ 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+ 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+ 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+ 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+ 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+ 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+ 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+ 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+ 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+ 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+ 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+ 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+ 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+ 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+ 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+ 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+ 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+ 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+ 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+ 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+ 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+ 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+ 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+ 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+ 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+ 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+ 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+ 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+ 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+ 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+ 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+ 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+ 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+ 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+ 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+ 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+ 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+ 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+ 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+ 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+ 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+ 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+ 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+ 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+ 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+ 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+ 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+ 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+ 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+ 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+ 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+ 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+ 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+ 27705344U, // <4,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,6,u>: Cost 0 copy RHS
+ 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+ 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+ 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+ 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+ 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+ 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+ 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+ 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+ 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+ 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+ 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+ 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+ 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+ 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+ 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+ 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+ 27705344U, // <4,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,5,u,u>: Cost 0 copy RHS
+ 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+ 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+ 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+ 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+ 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+ 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+ 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+ 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+ 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+ 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+ 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+ 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+ 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+ 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+ 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+ 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+ 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+ 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+ 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+ 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+ 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+ 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+ 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+ 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+ 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+ 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+ 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+ 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+ 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+ 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+ 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+ 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+ 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+ 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+ 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+ 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+ 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+ 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+ 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+ 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+ 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+ 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+ 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+ 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+ 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+ 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+ 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+ 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+ 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+ 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+ 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+ 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+ 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+ 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+ 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+ 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+ 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+ 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+ 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+ 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+ 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+ 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+ 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+ 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+ 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+ 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+ 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+ 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+ 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+ 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+ 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+ 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+ 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+ 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+ 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+ 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+ 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+ 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+ 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+ 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+ 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+ 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+ 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+ 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+ 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+ 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+ 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+ 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+ 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+ 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+ 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+ 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+ 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+ 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+ 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+ 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+ 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+ 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+ 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+ 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+ 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+ 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+ 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+ 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+ 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+ 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+ 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+ 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+ 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+ 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+ 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+ 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+ 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+ 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+ 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+ 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+ 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+ 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+ 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+ 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+ 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+ 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+ 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+ 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+ 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+ 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+ 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+ 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+ 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+ 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+ 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+ 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+ 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+ 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+ 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+ 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+ 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+ 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+ 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+ 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+ 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+ 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+ 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+ 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+ 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+ 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+ 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+ 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+ 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+ 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+ 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+ 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+ 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+ 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+ 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+ 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+ 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+ 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+ 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+ 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+ 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+ 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+ 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+ 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+ 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+ 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+ 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+ 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+ 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+ 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+ 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+ 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+ 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+ 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+ 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+ 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+ 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+ 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+ 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+ 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+ 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+ 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+ 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+ 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+ 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+ 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+ 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+ 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+ 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+ 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+ 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+ 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+ 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+ 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+ 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+ 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+ 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+ 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+ 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+ 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+ 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+ 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+ 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+ 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+ 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+ 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+ 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+ 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+ 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+ 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+ 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+ 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+ 27705344U, // <4,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,6,u>: Cost 0 copy RHS
+ 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+ 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+ 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+ 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+ 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+ 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+ 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+ 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+ 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+ 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+ 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+ 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+ 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+ 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+ 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+ 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+ 27705344U, // <4,u,u,7>: Cost 0 copy RHS
+ 27705344U, // <4,u,u,u>: Cost 0 copy RHS
+ 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+ 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+ 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+ 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+ 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+ 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+ 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+ 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+ 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+ 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+ 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+ 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+ 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+ 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+ 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+ 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+ 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+ 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+ 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+ 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+ 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+ 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+ 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+ 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+ 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+ 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+ 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+ 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+ 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+ 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+ 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+ 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+ 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+ 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+ 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+ 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+ 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+ 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+ 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+ 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+ 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+ 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+ 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+ 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+ 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+ 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+ 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+ 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+ 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+ 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+ 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+ 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+ 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+ 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+ 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+ 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+ 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+ 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+ 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+ 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+ 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+ 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+ 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+ 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+ 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+ 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+ 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+ 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+ 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+ 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+ 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+ 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+ 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+ 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+ 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+ 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+ 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+ 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+ 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+ 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+ 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+ 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+ 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+ 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+ 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+ 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+ 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+ 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+ 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+ 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+ 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+ 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+ 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+ 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+ 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+ 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+ 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+ 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+ 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+ 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+ 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+ 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+ 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+ 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+ 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+ 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+ 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+ 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+ 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+ 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+ 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+ 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+ 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+ 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+ 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+ 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+ 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+ 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+ 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+ 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+ 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+ 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+ 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+ 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+ 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+ 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+ 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+ 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+ 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+ 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+ 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+ 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+ 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+ 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+ 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+ 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+ 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+ 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+ 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+ 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+ 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+ 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+ 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+ 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+ 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+ 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+ 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+ 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+ 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+ 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+ 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+ 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+ 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+ 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+ 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+ 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+ 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+ 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+ 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+ 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+ 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+ 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+ 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+ 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+ 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+ 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+ 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+ 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+ 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+ 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+ 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+ 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+ 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+ 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+ 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+ 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+ 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+ 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+ 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+ 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+ 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+ 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+ 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+ 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+ 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+ 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+ 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+ 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+ 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+ 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+ 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+ 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+ 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+ 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+ 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+ 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+ 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+ 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+ 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+ 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+ 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+ 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+ 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+ 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+ 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+ 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+ 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+ 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+ 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+ 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+ 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+ 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+ 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+ 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+ 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+ 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+ 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+ 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+ 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+ 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+ 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+ 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+ 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+ 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+ 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+ 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+ 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+ 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+ 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+ 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+ 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+ 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+ 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+ 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+ 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+ 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+ 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+ 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+ 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+ 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+ 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+ 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+ 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+ 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+ 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+ 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+ 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+ 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+ 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+ 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+ 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+ 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+ 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+ 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+ 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+ 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+ 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+ 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+ 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+ 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+ 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+ 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+ 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+ 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+ 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+ 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+ 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+ 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+ 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+ 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+ 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+ 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+ 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+ 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+ 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+ 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+ 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+ 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+ 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+ 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+ 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+ 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+ 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+ 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+ 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+ 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+ 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+ 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+ 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+ 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+ 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+ 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+ 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+ 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+ 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+ 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+ 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+ 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+ 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+ 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+ 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+ 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+ 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+ 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+ 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+ 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+ 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+ 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+ 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+ 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+ 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+ 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+ 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+ 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+ 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+ 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+ 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+ 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+ 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+ 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+ 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+ 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+ 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+ 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+ 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+ 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+ 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+ 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+ 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+ 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+ 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+ 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+ 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+ 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+ 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+ 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+ 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+ 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+ 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+ 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+ 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+ 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+ 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+ 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+ 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+ 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+ 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+ 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+ 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+ 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+ 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+ 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+ 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+ 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+ 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+ 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+ 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+ 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+ 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+ 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+ 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+ 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+ 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+ 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+ 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+ 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+ 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+ 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+ 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+ 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+ 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+ 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+ 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+ 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+ 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+ 94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+ 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+ 94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+ 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+ 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+ 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+ 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+ 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+ 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+ 94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+ 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+ 94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+ 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+ 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+ 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+ 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+ 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+ 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+ 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+ 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+ 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+ 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+ 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+ 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+ 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+ 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+ 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+ 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+ 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+ 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+ 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+ 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+ 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+ 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+ 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+ 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+ 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+ 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+ 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+ 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+ 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+ 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+ 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+ 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+ 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+ 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+ 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+ 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+ 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+ 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+ 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+ 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+ 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+ 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+ 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+ 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+ 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+ 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+ 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+ 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+ 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+ 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+ 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+ 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+ 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+ 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+ 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+ 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+ 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+ 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+ 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+ 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+ 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+ 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+ 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+ 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+ 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+ 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+ 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+ 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+ 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+ 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+ 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+ 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+ 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+ 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+ 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+ 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+ 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+ 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+ 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+ 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+ 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+ 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+ 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+ 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+ 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+ 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+ 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+ 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+ 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+ 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+ 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+ 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+ 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+ 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+ 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+ 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+ 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+ 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+ 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+ 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+ 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+ 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+ 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+ 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+ 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+ 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+ 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+ 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+ 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+ 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+ 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+ 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+ 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+ 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+ 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+ 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+ 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+ 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+ 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+ 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+ 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+ 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+ 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+ 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+ 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+ 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+ 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+ 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+ 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+ 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+ 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+ 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+ 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+ 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+ 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+ 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+ 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+ 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+ 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+ 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+ 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+ 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+ 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+ 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+ 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+ 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+ 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+ 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+ 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+ 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+ 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+ 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+ 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+ 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+ 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+ 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+ 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+ 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+ 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+ 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+ 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+ 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+ 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+ 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+ 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+ 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+ 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+ 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+ 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+ 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+ 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+ 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+ 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+ 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+ 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+ 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+ 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+ 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+ 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+ 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+ 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+ 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+ 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+ 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+ 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+ 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+ 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+ 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+ 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+ 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+ 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+ 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+ 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+ 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+ 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+ 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+ 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+ 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+ 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+ 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+ 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+ 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+ 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+ 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+ 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+ 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+ 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+ 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+ 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+ 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+ 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+ 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+ 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+ 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+ 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+ 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+ 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+ 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+ 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+ 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+ 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+ 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+ 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+ 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+ 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+ 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+ 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+ 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+ 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+ 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+ 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+ 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+ 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+ 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+ 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+ 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+ 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+ 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+ 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+ 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+ 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+ 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+ 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+ 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+ 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+ 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+ 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+ 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+ 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+ 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+ 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+ 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+ 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+ 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+ 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+ 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+ 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+ 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+ 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+ 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+ 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+ 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+ 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+ 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+ 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+ 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+ 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+ 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+ 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+ 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+ 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+ 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+ 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+ 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+ 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+ 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+ 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+ 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+ 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+ 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+ 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+ 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+ 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+ 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+ 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+ 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+ 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+ 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+ 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+ 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+ 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+ 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+ 118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+ 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+ 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+ 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+ 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+ 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+ 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+ 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+ 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+ 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+ 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+ 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+ 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+ 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+ 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+ 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+ 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+ 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+ 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+ 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+ 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+ 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+ 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+ 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+ 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+ 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+ 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+ 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+ 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+ 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+ 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+ 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+ 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+ 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+ 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+ 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+ 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+ 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+ 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+ 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+ 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+ 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+ 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+ 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+ 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+ 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+ 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+ 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+ 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+ 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+ 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+ 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+ 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+ 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+ 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+ 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+ 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+ 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+ 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+ 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+ 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+ 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+ 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+ 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+ 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+ 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+ 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+ 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+ 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+ 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+ 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+ 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+ 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+ 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+ 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+ 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+ 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+ 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+ 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+ 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+ 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+ 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+ 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+ 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+ 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+ 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+ 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+ 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+ 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+ 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+ 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+ 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+ 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+ 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+ 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+ 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+ 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+ 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+ 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+ 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+ 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+ 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+ 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+ 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+ 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+ 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+ 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+ 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+ 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+ 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+ 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+ 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+ 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+ 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+ 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+ 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+ 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+ 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+ 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+ 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+ 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+ 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+ 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+ 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+ 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+ 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+ 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+ 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+ 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+ 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+ 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+ 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+ 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+ 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+ 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+ 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+ 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+ 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+ 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+ 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+ 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+ 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+ 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+ 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+ 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+ 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+ 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+ 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+ 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+ 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+ 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+ 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+ 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+ 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+ 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+ 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+ 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+ 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+ 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+ 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+ 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+ 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+ 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+ 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+ 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+ 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+ 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+ 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+ 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+ 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+ 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+ 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+ 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+ 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+ 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+ 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+ 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+ 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+ 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+ 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+ 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+ 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+ 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+ 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+ 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+ 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+ 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+ 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+ 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+ 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+ 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+ 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+ 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+ 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+ 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+ 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+ 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+ 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+ 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+ 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+ 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+ 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+ 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+ 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+ 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+ 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+ 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+ 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+ 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+ 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+ 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+ 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+ 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+ 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+ 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+ 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+ 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+ 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+ 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+ 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+ 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+ 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+ 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+ 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+ 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+ 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+ 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+ 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+ 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+ 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+ 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+ 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+ 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+ 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+ 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+ 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+ 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+ 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+ 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+ 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+ 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+ 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+ 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+ 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+ 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+ 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+ 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+ 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+ 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+ 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+ 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+ 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+ 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+ 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+ 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+ 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+ 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+ 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+ 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+ 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+ 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+ 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+ 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+ 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+ 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+ 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+ 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+ 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+ 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+ 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+ 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+ 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+ 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+ 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+ 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+ 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+ 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+ 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+ 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+ 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+ 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+ 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+ 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+ 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+ 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+ 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+ 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+ 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+ 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+ 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+ 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+ 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+ 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+ 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+ 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+ 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+ 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+ 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+ 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+ 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+ 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+ 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+ 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+ 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+ 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+ 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+ 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+ 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+ 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+ 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+ 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+ 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+ 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+ 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+ 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+ 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+ 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+ 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+ 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+ 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+ 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+ 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+ 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+ 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+ 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+ 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+ 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+ 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+ 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+ 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+ 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+ 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+ 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+ 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+ 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+ 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+ 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+ 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+ 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+ 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+ 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+ 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+ 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+ 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+ 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+ 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+ 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+ 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+ 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+ 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+ 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+ 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+ 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+ 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+ 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+ 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+ 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+ 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+ 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+ 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+ 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+ 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+ 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+ 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+ 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+ 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+ 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+ 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+ 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+ 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+ 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+ 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+ 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+ 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+ 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+ 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+ 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+ 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+ 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+ 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+ 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+ 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+ 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+ 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+ 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+ 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+ 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+ 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+ 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+ 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+ 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+ 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+ 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+ 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+ 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+ 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+ 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+ 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+ 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+ 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+ 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+ 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+ 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+ 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+ 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+ 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+ 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+ 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+ 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+ 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+ 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+ 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+ 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+ 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+ 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+ 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+ 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+ 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+ 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+ 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+ 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+ 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+ 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+ 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+ 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+ 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+ 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+ 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+ 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+ 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+ 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+ 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+ 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+ 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+ 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+ 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+ 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+ 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+ 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+ 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+ 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+ 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+ 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+ 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+ 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+ 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+ 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+ 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+ 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+ 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+ 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+ 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+ 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+ 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+ 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+ 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+ 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+ 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+ 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+ 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+ 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+ 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+ 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+ 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+ 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+ 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+ 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+ 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+ 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+ 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+ 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+ 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+ 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+ 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+ 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+ 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+ 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+ 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+ 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+ 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+ 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+ 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+ 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+ 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+ 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+ 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+ 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+ 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+ 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+ 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+ 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+ 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+ 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+ 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+ 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+ 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+ 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+ 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+ 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+ 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+ 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+ 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+ 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+ 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+ 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+ 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+ 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+ 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+ 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+ 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+ 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+ 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+ 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+ 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+ 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+ 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+ 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+ 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+ 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+ 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+ 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+ 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+ 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+ 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+ 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+ 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+ 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+ 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+ 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+ 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+ 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+ 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+ 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+ 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+ 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+ 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+ 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+ 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+ 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+ 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+ 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+ 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+ 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+ 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+ 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+ 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+ 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+ 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+ 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+ 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+ 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+ 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+ 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+ 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+ 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+ 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+ 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+ 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+ 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+ 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+ 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+ 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+ 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+ 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+ 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+ 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+ 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+ 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+ 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+ 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+ 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+ 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+ 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+ 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+ 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+ 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+ 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+ 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+ 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+ 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+ 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+ 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+ 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+ 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+ 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+ 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+ 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+ 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+ 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+ 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+ 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+ 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+ 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+ 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+ 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+ 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+ 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+ 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+ 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+ 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+ 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+ 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+ 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+ 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+ 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+ 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+ 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+ 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+ 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+ 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+ 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+ 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+ 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+ 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+ 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+ 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+ 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+ 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+ 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+ 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+ 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+ 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+ 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+ 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+ 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+ 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+ 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+ 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+ 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+ 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+ 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+ 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+ 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+ 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+ 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+ 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+ 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+ 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+ 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+ 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+ 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+ 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+ 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+ 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+ 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+ 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+ 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+ 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+ 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+ 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+ 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+ 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+ 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+ 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+ 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+ 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+ 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+ 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+ 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+ 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+ 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+ 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+ 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+ 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+ 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+ 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+ 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+ 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+ 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+ 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+ 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+ 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+ 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+ 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+ 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+ 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+ 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+ 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+ 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+ 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+ 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+ 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+ 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+ 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+ 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+ 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+ 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+ 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+ 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+ 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+ 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+ 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+ 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+ 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+ 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+ 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+ 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+ 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+ 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+ 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+ 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+ 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+ 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+ 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+ 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+ 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+ 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+ 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+ 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+ 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+ 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+ 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+ 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+ 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+ 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+ 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+ 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+ 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+ 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+ 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+ 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+ 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+ 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+ 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+ 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+ 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+ 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+ 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+ 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+ 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+ 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+ 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+ 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+ 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+ 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+ 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+ 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+ 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+ 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+ 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+ 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+ 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+ 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+ 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+ 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+ 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+ 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+ 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+ 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+ 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+ 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+ 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+ 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+ 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+ 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+ 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+ 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+ 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+ 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+ 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+ 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+ 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+ 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+ 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+ 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+ 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+ 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+ 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+ 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+ 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+ 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+ 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+ 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+ 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+ 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+ 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+ 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+ 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+ 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+ 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+ 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+ 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+ 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+ 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+ 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+ 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+ 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+ 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+ 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+ 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+ 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+ 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+ 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+ 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+ 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+ 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+ 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+ 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+ 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+ 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+ 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+ 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+ 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+ 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+ 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+ 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+ 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+ 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+ 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+ 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+ 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+ 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+ 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+ 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+ 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+ 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+ 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+ 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+ 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+ 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+ 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+ 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+ 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+ 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+ 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+ 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+ 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+ 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+ 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+ 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+ 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+ 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+ 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+ 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+ 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+ 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+ 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+ 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+ 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+ 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+ 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+ 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+ 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+ 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+ 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+ 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+ 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+ 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+ 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+ 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+ 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+ 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+ 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+ 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+ 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+ 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+ 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+ 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+ 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+ 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+ 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+ 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+ 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+ 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+ 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+ 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+ 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+ 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+ 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+ 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+ 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+ 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+ 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+ 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+ 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+ 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+ 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+ 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+ 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+ 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+ 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+ 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+ 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+ 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+ 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+ 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+ 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+ 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+ 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+ 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+ 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+ 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+ 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+ 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+ 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+ 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+ 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+ 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+ 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+ 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+ 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+ 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+ 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+ 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+ 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+ 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+ 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+ 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+ 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+ 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+ 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+ 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+ 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+ 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+ 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+ 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+ 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+ 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+ 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+ 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+ 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+ 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+ 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+ 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+ 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+ 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+ 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+ 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+ 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+ 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+ 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+ 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+ 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+ 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+ 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+ 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+ 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+ 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+ 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+ 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+ 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+ 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+ 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+ 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+ 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+ 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+ 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+ 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+ 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+ 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+ 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+ 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+ 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+ 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+ 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+ 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+ 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+ 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+ 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+ 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+ 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+ 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+ 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+ 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+ 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+ 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+ 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+ 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+ 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+ 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+ 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+ 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+ 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+ 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+ 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+ 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+ 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+ 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+ 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+ 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+ 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+ 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+ 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+ 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+ 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+ 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+ 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+ 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+ 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+ 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+ 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+ 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+ 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+ 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+ 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+ 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+ 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+ 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+ 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+ 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+ 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+ 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+ 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+ 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+ 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+ 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+ 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+ 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+ 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+ 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+ 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+ 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+ 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+ 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+ 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+ 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+ 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+ 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+ 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+ 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+ 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+ 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+ 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+ 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+ 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+ 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+ 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+ 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+ 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+ 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+ 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+ 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+ 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+ 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+ 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+ 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+ 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+ 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+ 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+ 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+ 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+ 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+ 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+ 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+ 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+ 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+ 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+ 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+ 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+ 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+ 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+ 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+ 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+ 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+ 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+ 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+ 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+ 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+ 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+ 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+ 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+ 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+ 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+ 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+ 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+ 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+ 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+ 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+ 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+ 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+ 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+ 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+ 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+ 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+ 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+ 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+ 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+ 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+ 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+ 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+ 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+ 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+ 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+ 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+ 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+ 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+ 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+ 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+ 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+ 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+ 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+ 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+ 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+ 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+ 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+ 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+ 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+ 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+ 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+ 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+ 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+ 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+ 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+ 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+ 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+ 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+ 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+ 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+ 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+ 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+ 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+ 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+ 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+ 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+ 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+ 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+ 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+ 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+ 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+ 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+ 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+ 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+ 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+ 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+ 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+ 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+ 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+ 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+ 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+ 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+ 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+ 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+ 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+ 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+ 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+ 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+ 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+ 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+ 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+ 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+ 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+ 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+ 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+ 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+ 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+ 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+ 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+ 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+ 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+ 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+ 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+ 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+ 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+ 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+ 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+ 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+ 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+ 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+ 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+ 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+ 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+ 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+ 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+ 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+ 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+ 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+ 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+ 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+ 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+ 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+ 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+ 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+ 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+ 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+ 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+ 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+ 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+ 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+ 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+ 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+ 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+ 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+ 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+ 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+ 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+ 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+ 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+ 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+ 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+ 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+ 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+ 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+ 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+ 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+ 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+ 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+ 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+ 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+ 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+ 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+ 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+ 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+ 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+ 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+ 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+ 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+ 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+ 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+ 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+ 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+ 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+ 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+ 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+ 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+ 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+ 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+ 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+ 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+ 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+ 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+ 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+ 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+ 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+ 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+ 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+ 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+ 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+ 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+ 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+ 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+ 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+ 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+ 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+ 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+ 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+ 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+ 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+ 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+ 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+ 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+ 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+ 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+ 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+ 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+ 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+ 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+ 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+ 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+ 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+ 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+ 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+ 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+ 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+ 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+ 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+ 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+ 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+ 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+ 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+ 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+ 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+ 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+ 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+ 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+ 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+ 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+ 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+ 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+ 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+ 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+ 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+ 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+ 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+ 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+ 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+ 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+ 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+ 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+ 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+ 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+ 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+ 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+ 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+ 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+ 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+ 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+ 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+ 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+ 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+ 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+ 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+ 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+ 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+ 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+ 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+ 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+ 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+ 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+ 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+ 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+ 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+ 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+ 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+ 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+ 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+ 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+ 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+ 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+ 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+ 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+ 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+ 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+ 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+ 73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+ 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+ 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+ 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+ 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+ 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+ 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+ 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+ 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+ 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+ 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+ 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+ 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+ 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+ 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+ 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+ 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+ 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+ 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+ 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+ 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+ 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+ 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+ 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+ 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+ 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+ 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+ 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+ 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+ 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+ 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+ 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+ 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+ 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+ 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+ 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+ 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+ 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+ 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+ 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+ 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+ 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+ 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+ 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+ 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+ 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+ 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+ 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+ 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+ 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+ 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+ 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+ 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+ 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+ 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+ 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+ 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+ 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+ 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+ 835584U, // <u,1,2,3>: Cost 0 copy LHS
+ 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+ 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+ 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+ 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+ 835584U, // <u,1,2,u>: Cost 0 copy LHS
+ 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+ 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+ 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+ 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+ 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+ 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+ 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+ 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+ 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+ 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+ 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+ 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+ 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+ 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+ 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+ 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+ 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+ 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+ 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+ 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+ 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+ 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+ 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+ 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+ 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+ 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+ 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+ 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+ 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+ 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+ 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+ 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+ 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+ 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+ 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+ 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+ 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+ 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+ 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+ 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+ 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+ 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+ 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+ 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+ 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+ 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+ 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+ 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+ 835584U, // <u,1,u,3>: Cost 0 copy LHS
+ 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+ 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+ 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+ 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+ 835584U, // <u,1,u,u>: Cost 0 copy LHS
+ 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+ 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+ 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+ 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+ 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+ 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+ 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+ 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+ 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+ 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+ 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+ 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+ 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+ 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+ 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+ 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+ 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+ 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+ 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+ 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+ 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+ 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+ 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+ 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+ 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+ 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+ 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+ 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+ 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+ 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+ 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+ 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+ 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+ 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+ 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+ 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+ 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+ 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+ 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+ 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+ 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+ 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+ 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+ 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+ 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+ 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+ 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+ 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+ 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+ 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+ 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+ 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+ 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+ 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+ 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+ 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+ 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+ 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+ 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+ 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+ 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+ 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+ 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+ 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+ 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+ 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+ 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+ 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+ 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+ 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+ 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+ 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+ 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+ 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+ 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+ 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+ 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+ 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+ 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+ 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+ 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+ 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+ 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+ 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+ 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+ 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+ 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+ 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+ 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+ 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+ 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+ 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+ 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+ 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+ 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+ 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+ 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+ 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+ 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+ 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+ 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+ 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+ 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+ 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+ 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+ 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+ 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+ 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+ 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+ 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+ 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+ 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+ 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+ 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+ 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+ 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+ 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+ 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+ 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+ 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+ 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+ 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+ 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+ 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+ 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+ 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+ 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+ 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+ 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+ 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+ 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+ 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+ 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+ 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+ 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+ 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+ 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+ 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+ 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+ 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+ 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+ 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+ 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+ 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+ 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+ 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+ 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+ 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+ 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+ 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+ 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+ 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+ 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+ 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+ 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+ 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+ 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+ 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+ 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+ 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+ 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+ 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+ 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+ 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+ 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+ 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+ 96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+ 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+ 96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+ 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+ 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+ 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+ 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+ 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+ 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+ 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+ 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+ 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+ 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+ 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+ 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+ 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+ 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+ 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+ 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+ 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+ 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+ 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+ 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+ 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+ 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+ 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+ 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+ 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+ 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+ 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+ 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+ 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+ 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+ 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+ 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+ 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+ 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+ 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+ 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+ 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+ 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+ 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+ 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+ 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+ 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+ 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+ 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+ 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+ 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+ 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+ 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+ 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+ 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+ 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+ 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+ 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+ 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+ 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+ 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+ 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+ 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+ 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+ 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+ 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+ 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+ 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+ 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+ 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+ 27705344U, // <u,5,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,6,u>: Cost 0 copy RHS
+ 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+ 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+ 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+ 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+ 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+ 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+ 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+ 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+ 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+ 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+ 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+ 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+ 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+ 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+ 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 27705344U, // <u,5,u,7>: Cost 0 copy RHS
+ 27705344U, // <u,5,u,u>: Cost 0 copy RHS
+ 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+ 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+ 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+ 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+ 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+ 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+ 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+ 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+ 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+ 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+ 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+ 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+ 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+ 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+ 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+ 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+ 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+ 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+ 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+ 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+ 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+ 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+ 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+ 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+ 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+ 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+ 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+ 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+ 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+ 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+ 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+ 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+ 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+ 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+ 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+ 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+ 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+ 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+ 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+ 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+ 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+ 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+ 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+ 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+ 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+ 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+ 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+ 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+ 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+ 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+ 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+ 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+ 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+ 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+ 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+ 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+ 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+ 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+ 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+ 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+ 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+ 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+ 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+ 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+ 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+ 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+ 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+ 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+ 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+ 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+ 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+ 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+ 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+ 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+ 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+ 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+ 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+ 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+ 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+ 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+ 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+ 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+ 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+ 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+ 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+ 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+ 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+ 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+ 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+ 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+ 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+ 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+ 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+ 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+ 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+ 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+ 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+ 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+ 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+ 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+ 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+ 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+ 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+ 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+ 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+ 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+ 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+ 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+ 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+ 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+ 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+ 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+ 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+ 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+ 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+ 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+ 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+ 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+ 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+ 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+ 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+ 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+ 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+ 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+ 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+ 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+ 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+ 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+ 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+ 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+ 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+ 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+ 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+ 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+ 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+ 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+ 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+ 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+ 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+ 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+ 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+ 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+ 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+ 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+ 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+ 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+ 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+ 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+ 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+ 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+ 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+ 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+ 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+ 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+ 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+ 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+ 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+ 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+ 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+ 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+ 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+ 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+ 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+ 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+ 835584U, // <u,u,2,3>: Cost 0 copy LHS
+ 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+ 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+ 835584U, // <u,u,2,u>: Cost 0 copy LHS
+ 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+ 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+ 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+ 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+ 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+ 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+ 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+ 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+ 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+ 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+ 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+ 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+ 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+ 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+ 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+ 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+ 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+ 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+ 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+ 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+ 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+ 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+ 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+ 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+ 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+ 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+ 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+ 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+ 27705344U, // <u,u,6,7>: Cost 0 copy RHS
+ 27705344U, // <u,u,6,u>: Cost 0 copy RHS
+ 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+ 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+ 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+ 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+ 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+ 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+ 120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+ 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+ 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+ 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+ 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+ 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+ 835584U, // <u,u,u,3>: Cost 0 copy LHS
+ 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+ 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+ 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+ 27705344U, // <u,u,u,7>: Cost 0 copy RHS
+ 835584U, // <u,u,u,u>: Cost 0 copy LHS
+ 0
+};
diff --git a/lib/Target/ARM64/ARM64PromoteConstant.cpp b/lib/Target/ARM64/ARM64PromoteConstant.cpp
new file mode 100644
index 0000000..9fbaedb
--- /dev/null
+++ b/lib/Target/ARM64/ARM64PromoteConstant.cpp
@@ -0,0 +1,585 @@
+
+//===-- ARM64PromoteConstant.cpp --- Promote constant to global for ARM64 -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM64PromoteConstant pass, which promotes constants
+// to global variables when this is likely to be more efficient.
+// Currently only types related to constant vectors (i.e., constant vectors,
+// arrays of constant vectors, constant structures with constant vector fields,
+// etc.) are promoted to global variables.
+// Indeed, constant vectors are likely to be lowered into the target constant
+// pool during instruction selection.
+// Therefore, the access remains the same (a memory load), but structure types
+// are not split into a separate constant pool access for each field.
+// A bonus side effect is that the created globals may be merged by the global
+// merge pass.
+//
+// FIXME: This pass may be useful for other targets too.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm64-promote-const"
+#include "ARM64.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("arm64-stress-promote-const", cl::Hidden,
+ cl::desc("Promote all vector constants"));
+
+STATISTIC(NumPromoted, "Number of promoted constants");
+STATISTIC(NumPromotedUses, "Number of promoted constants uses");
+
+//===----------------------------------------------------------------------===//
+// ARM64PromoteConstant
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Promotes interesting constants into global variables.
+/// The motivating example is:
+/// static const uint16_t TableA[32] = {
+/// 41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768,
+/// 31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215,
+/// 25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846,
+/// 21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725,
+/// };
+///
+/// uint16x8x4_t LoadStatic(void) {
+///   uint16x8x4_t ret;
+/// ret.val[0] = vld1q_u16(TableA + 0);
+/// ret.val[1] = vld1q_u16(TableA + 8);
+/// ret.val[2] = vld1q_u16(TableA + 16);
+/// ret.val[3] = vld1q_u16(TableA + 24);
+/// return ret;
+/// }
+///
+/// The constants in that example are folded into the uses. Thus, 4 different
+/// constants are created.
+/// As their type is a vector, the cheapest way to materialize them is to load
+/// them from memory.
+/// Therefore the final assembly ends up with 4 different loads.
+/// With this pass enabled, only one load is issued for the constants.
+class ARM64PromoteConstant : public ModulePass {
+
+public:
+ static char ID;
+ ARM64PromoteConstant() : ModulePass(ID) {}
+
+ virtual const char *getPassName() const { return "ARM64 Promote Constant"; }
+
+ /// Iterate over the functions and promote the interesting constants into
+ /// global variables with module scope.
+ bool runOnModule(Module &M) {
+ DEBUG(dbgs() << getPassName() << '\n');
+ bool Changed = false;
+ for (auto &MF: M) {
+ Changed |= runOnFunction(MF);
+ }
+ return Changed;
+ }
+
+private:
+ /// Look for interesting constants used within the given function.
+ /// Promote them into global variables, load these global variables within
+  /// the related function, so that the number of inserted loads is minimal.
+ bool runOnFunction(Function &F);
+
+ // This transformation requires dominator info
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ /// Type to store a list of User
+ typedef SmallVector<Value::user_iterator, 4> Users;
+ /// Map an insertion point to all the uses it dominates.
+ typedef DenseMap<Instruction *, Users> InsertionPoints;
+  /// Map a function to the required insertion points of loads for a
+  /// global variable.
+ typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
+
+ /// Find the closest point that dominates the given Use.
+ Instruction *findInsertionPoint(Value::user_iterator &Use);
+
+ /// Check if the given insertion point is dominated by an existing
+ /// insertion point.
+ /// If true, the given use is added to the list of dominated uses for
+ /// the related existing point.
+ /// \param NewPt the insertion point to be checked
+ /// \param UseIt the use to be added into the list of dominated uses
+ /// \param InsertPts existing insertion points
+  /// \pre NewPt and all instructions in InsertPts belong to the same function
+  /// \return true if one of the insertion points in InsertPts dominates NewPt,
+ /// false otherwise
+ bool isDominated(Instruction *NewPt, Value::user_iterator &UseIt,
+ InsertionPoints &InsertPts);
+
+ /// Check if the given insertion point can be merged with an existing
+ /// insertion point in a common dominator.
+ /// If true, the given use is added to the list of the created insertion
+ /// point.
+ /// \param NewPt the insertion point to be checked
+ /// \param UseIt the use to be added into the list of dominated uses
+ /// \param InsertPts existing insertion points
+  /// \pre NewPt and all instructions in InsertPts belong to the same function
+  /// \pre isDominated returns false for the exact same parameters.
+  /// \return true if there exists an insertion point in InsertPts that could
+ /// have been merged with NewPt in a common dominator,
+ /// false otherwise
+ bool tryAndMerge(Instruction *NewPt, Value::user_iterator &UseIt,
+ InsertionPoints &InsertPts);
+
+  /// Compute the minimal insertion points to dominate all the interesting
+  /// uses of Val.
+  /// Insertion points are grouped per function, and each insertion point
+  /// contains a list of all the uses it dominates within the related function.
+ /// \param Val constant to be examined
+ /// \param[out] InsPtsPerFunc output storage of the analysis
+ void computeInsertionPoints(Constant *Val,
+ InsertionPointsPerFunc &InsPtsPerFunc);
+
+ /// Insert a definition of a new global variable at each point contained in
+ /// InsPtsPerFunc and update the related uses (also contained in
+ /// InsPtsPerFunc).
+ bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc);
+
+ /// Compute the minimal insertion points to dominate all the interesting
+ /// uses of Val and insert a definition of a new global variable
+ /// at these points.
+ /// Also update the uses of Val accordingly.
+ /// Currently a use of Val is considered interesting if:
+ /// - Val is not UndefValue
+  /// - Val is not zero-initialized
+  /// - Replacing Val by a load of a global variable is valid.
+ /// \see shouldConvert for more details
+ bool computeAndInsertDefinitions(Constant *Val);
+
+ /// Promote the given constant into a global variable if it is expected to
+ /// be profitable.
+ /// \return true if Cst has been promoted
+ bool promoteConstant(Constant *Cst);
+
+ /// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
+ /// Append UseIt to this list and delete the entry of IPI in InsertPts.
+ static void appendAndTransferDominatedUses(Instruction *NewPt,
+ Value::user_iterator &UseIt,
+ InsertionPoints::iterator &IPI,
+ InsertionPoints &InsertPts) {
+ // Record the dominated use
+ IPI->second.push_back(UseIt);
+ // Transfer the dominated uses of IPI to NewPt
+ // Inserting into the DenseMap may invalidate existing iterator.
+ // Keep a copy of the key to find the iterator to erase.
+ Instruction *OldInstr = IPI->first;
+ InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second));
+ // Erase IPI
+ IPI = InsertPts.find(OldInstr);
+ InsertPts.erase(IPI);
+ }
+};
+} // end anonymous namespace
+
+char ARM64PromoteConstant::ID = 0;
+
+namespace llvm {
+void initializeARM64PromoteConstantPass(PassRegistry &);
+}
+
+INITIALIZE_PASS_BEGIN(ARM64PromoteConstant, "arm64-promote-const",
+ "ARM64 Promote Constant Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ARM64PromoteConstant, "arm64-promote-const",
+ "ARM64 Promote Constant Pass", false, false)
+
+ModulePass *llvm::createARM64PromoteConstantPass() {
+ return new ARM64PromoteConstant();
+}
+
+/// Check if the given type uses a vector type.
+static bool isConstantUsingVectorTy(const Type *CstTy) {
+ if (CstTy->isVectorTy())
+ return true;
+ if (CstTy->isStructTy()) {
+ for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements();
+ EltIdx < EndEltIdx; ++EltIdx)
+ if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx)))
+ return true;
+ } else if (CstTy->isArrayTy())
+ return isConstantUsingVectorTy(CstTy->getArrayElementType());
+ return false;
+}
+
+/// Check if the given use (Instruction + OpIdx) of Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A use should be converted if it is legal to do so.
+/// For instance, it is not legal to turn the mask operand of a shuffle vector
+/// into a load of a global variable.
+static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
+ unsigned OpIdx) {
+ // shufflevector instruction expects a const for the mask argument, i.e., the
+ // third argument. Do not promote this use in that case.
+ if (isa<const ShuffleVectorInst>(Instr) && OpIdx == 2)
+ return false;
+
+ // extractvalue instruction expects a const idx
+ if (isa<const ExtractValueInst>(Instr) && OpIdx > 0)
+ return false;
+
+  // insertvalue instruction expects a const idx
+ if (isa<const InsertValueInst>(Instr) && OpIdx > 1)
+ return false;
+
+ if (isa<const AllocaInst>(Instr) && OpIdx > 0)
+ return false;
+
+ // Alignment argument must be constant
+ if (isa<const LoadInst>(Instr) && OpIdx > 0)
+ return false;
+
+ // Alignment argument must be constant
+ if (isa<const StoreInst>(Instr) && OpIdx > 1)
+ return false;
+
+ // Index must be constant
+ if (isa<const GetElementPtrInst>(Instr) && OpIdx > 0)
+ return false;
+
+ // Personality function and filters must be constant.
+ // Give up on that instruction.
+ if (isa<const LandingPadInst>(Instr))
+ return false;
+
+ // switch instruction expects constants to compare to
+ if (isa<const SwitchInst>(Instr))
+ return false;
+
+ // Expected address must be a constant
+ if (isa<const IndirectBrInst>(Instr))
+ return false;
+
+  // Do not mess with intrinsics
+ if (isa<const IntrinsicInst>(Instr))
+ return false;
+
+ // Do not mess with inline asm
+ const CallInst *CI = dyn_cast<const CallInst>(Instr);
+ if (CI && isa<const InlineAsm>(CI->getCalledValue()))
+ return false;
+
+ return true;
+}
+
+/// Check if the given Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A constant should be converted if it is likely that the materialization of
+/// the constant will be tricky. Thus, we give up on zero or undef values.
+///
+/// \todo Currently, accept only vector-related types.
+/// Also, we give up on all simple vector types to keep the existing
+/// behavior. Otherwise, we would have to replicate here all the checks of the
+/// BUILD_VECTOR lowering. By giving up, we lose the potential benefit of
+/// merging constants via the global merge pass and the fact that the same
+/// constant is stored only once with this method (versus one copy per function
+/// that uses the constant with the regular approach, even for floats).
+/// Again, the simplest solution would be to promote every
+/// constant and rematerialize them when they are actually cheap to create.
+static bool shouldConvert(const Constant *Cst) {
+ if (isa<const UndefValue>(Cst))
+ return false;
+
+  // FIXME: In some cases, it may be interesting to promote a zero-initialized
+  // constant to memory.
+  // E.g., when materializing the type of Cst requires more instructions than
+  // the adrp/add/load sequence, or when this sequence can be shared by several
+  // instances of Cst.
+  // Ideally, we could promote this into a global and rematerialize the constant
+  // when that turns out to be a bad idea.
+ if (Cst->isZeroValue())
+ return false;
+
+ if (Stress)
+ return true;
+
+  // FIXME: see the \todo in this function's comment.
+ if (Cst->getType()->isVectorTy())
+ return false;
+ return isConstantUsingVectorTy(Cst->getType());
+}
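+
+// For example, a use of an aggregate constant containing vectors, such as
+//   ret { <4 x i32>, <4 x i32> } { <4 x i32> <i32 1, i32 2, i32 3, i32 4>,
+//                                  <4 x i32> <i32 5, i32 6, i32 7, i32 8> }
+// would, roughly, be rewritten by this pass into a load of an internal global
+// initialized with that constant:
+//   %promoted = load { <4 x i32>, <4 x i32> }* @_PromotedConst
+//   ret { <4 x i32>, <4 x i32> } %promoted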
+
+Instruction *
+ARM64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) {
+ // If this user is a phi, the insertion point is in the related
+ // incoming basic block
+ PHINode *PhiInst = dyn_cast<PHINode>(*Use);
+ Instruction *InsertionPoint;
+ if (PhiInst)
+ InsertionPoint =
+ PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator();
+ else
+ InsertionPoint = dyn_cast<Instruction>(*Use);
+ assert(InsertionPoint && "User is not an instruction!");
+ return InsertionPoint;
+}
+
+bool ARM64PromoteConstant::isDominated(Instruction *NewPt,
+ Value::user_iterator &UseIt,
+ InsertionPoints &InsertPts) {
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+ *NewPt->getParent()->getParent()).getDomTree();
+
+  // Traverse all the existing insertion points and check if one dominates
+  // NewPt.
+ for (InsertionPoints::iterator IPI = InsertPts.begin(),
+ EndIPI = InsertPts.end();
+ IPI != EndIPI; ++IPI) {
+ if (NewPt == IPI->first || DT.dominates(IPI->first, NewPt) ||
+ // When IPI->first is a terminator instruction, DT may think that
+ // the result is defined on the edge.
+ // Here we are testing the insertion point, not the definition.
+ (IPI->first->getParent() != NewPt->getParent() &&
+ DT.dominates(IPI->first->getParent(), NewPt->getParent()))) {
+ // No need to insert this point
+ // Record the dominated use
+ DEBUG(dbgs() << "Insertion point dominated by:\n");
+ DEBUG(IPI->first->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ IPI->second.push_back(UseIt);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool ARM64PromoteConstant::tryAndMerge(Instruction *NewPt,
+ Value::user_iterator &UseIt,
+ InsertionPoints &InsertPts) {
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+ *NewPt->getParent()->getParent()).getDomTree();
+ BasicBlock *NewBB = NewPt->getParent();
+
+  // Traverse all the existing insertion points and check if one is dominated
+  // by NewPt and thus useless, or can be combined with NewPt into a common
+  // dominator.
+ for (InsertionPoints::iterator IPI = InsertPts.begin(),
+ EndIPI = InsertPts.end();
+ IPI != EndIPI; ++IPI) {
+ BasicBlock *CurBB = IPI->first->getParent();
+ if (NewBB == CurBB) {
+ // Instructions are in the same block.
+ // By construction, NewPt is dominating the other.
+ // Indeed, isDominated returned false with the exact same arguments.
+ DEBUG(dbgs() << "Merge insertion point with:\n");
+ DEBUG(IPI->first->print(dbgs()));
+ DEBUG(dbgs() << "\nat considered insertion point.\n");
+ appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+ return true;
+ }
+
+ // Look for a common dominator
+ BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB);
+ // If none exists, we cannot merge these two points
+ if (!CommonDominator)
+ continue;
+
+ if (CommonDominator != NewBB) {
+ // By construction, the CommonDominator cannot be CurBB
+ assert(CommonDominator != CurBB &&
+ "Instruction has not been rejected during isDominated check!");
+ // Take the last instruction of the CommonDominator as insertion point
+ NewPt = CommonDominator->getTerminator();
+ }
+    // Otherwise, CommonDominator is NewBB itself, hence NewPt is already the
+    // latest possible insertion point in that block.
+ DEBUG(dbgs() << "Merge insertion point with:\n");
+ DEBUG(IPI->first->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ DEBUG(NewPt->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+ return true;
+ }
+ return false;
+}
+
+void ARM64PromoteConstant::computeInsertionPoints(
+ Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) {
+ DEBUG(dbgs() << "** Compute insertion points **\n");
+ for (Value::user_iterator UseIt = Val->user_begin(),
+ EndUseIt = Val->user_end();
+ UseIt != EndUseIt; ++UseIt) {
+ // If the user is not an Instruction, we cannot modify it
+ if (!isa<Instruction>(*UseIt))
+ continue;
+
+ // Filter out uses that should not be converted
+ if (!shouldConvertUse(Val, cast<Instruction>(*UseIt), UseIt.getOperandNo()))
+ continue;
+
+ DEBUG(dbgs() << "Considered use, opidx " << UseIt.getOperandNo() << ":\n");
+ DEBUG((*UseIt)->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+
+ Instruction *InsertionPoint = findInsertionPoint(UseIt);
+
+ DEBUG(dbgs() << "Considered insertion point:\n");
+ DEBUG(InsertionPoint->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+
+ // Check if the current insertion point is useless, i.e., it is dominated
+ // by another one.
+ InsertionPoints &InsertPts =
+ InsPtsPerFunc[InsertionPoint->getParent()->getParent()];
+ if (isDominated(InsertionPoint, UseIt, InsertPts))
+ continue;
+ // This insertion point is useful, check if we can merge some insertion
+ // point in a common dominator or if NewPt dominates an existing one.
+ if (tryAndMerge(InsertionPoint, UseIt, InsertPts))
+ continue;
+
+ DEBUG(dbgs() << "Keep considered insertion point\n");
+
+    // It is definitely useful on its own
+ InsertPts[InsertionPoint].push_back(UseIt);
+ }
+}
+
+bool
+ARM64PromoteConstant::insertDefinitions(Constant *Cst,
+ InsertionPointsPerFunc &InsPtsPerFunc) {
+ // We will create one global variable per Module
+ DenseMap<Module *, GlobalVariable *> ModuleToMergedGV;
+ bool HasChanged = false;
+
+  // Traverse all insertion points in all the functions.
+ for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(),
+ EndIt = InsPtsPerFunc.end();
+ FctToInstPtsIt != EndIt; ++FctToInstPtsIt) {
+ InsertionPoints &InsertPts = FctToInstPtsIt->second;
+// Do more checks for debug purposes.
+#ifndef NDEBUG
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+ *FctToInstPtsIt->first).getDomTree();
+#endif
+ GlobalVariable *PromotedGV;
+ assert(!InsertPts.empty() && "Empty uses does not need a definition");
+
+ Module *M = FctToInstPtsIt->first->getParent();
+ DenseMap<Module *, GlobalVariable *>::iterator MapIt =
+ ModuleToMergedGV.find(M);
+ if (MapIt == ModuleToMergedGV.end()) {
+ PromotedGV = new GlobalVariable(
+ *M, Cst->getType(), true, GlobalValue::InternalLinkage, 0,
+ "_PromotedConst", 0, GlobalVariable::NotThreadLocal);
+ PromotedGV->setInitializer(Cst);
+ ModuleToMergedGV[M] = PromotedGV;
+ DEBUG(dbgs() << "Global replacement: ");
+ DEBUG(PromotedGV->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ ++NumPromoted;
+ HasChanged = true;
+ } else {
+ PromotedGV = MapIt->second;
+ }
+
+ for (InsertionPoints::iterator IPI = InsertPts.begin(),
+ EndIPI = InsertPts.end();
+ IPI != EndIPI; ++IPI) {
+ // Create the load of the global variable
+ IRBuilder<> Builder(IPI->first->getParent(), IPI->first);
+ LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
+ DEBUG(dbgs() << "**********\n");
+ DEBUG(dbgs() << "New def: ");
+ DEBUG(LoadedCst->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+
+ // Update the dominated uses
+ Users &DominatedUsers = IPI->second;
+ for (Users::iterator UseIt = DominatedUsers.begin(),
+ EndIt = DominatedUsers.end();
+ UseIt != EndIt; ++UseIt) {
+#ifndef NDEBUG
+ assert((DT.dominates(LoadedCst, cast<Instruction>(**UseIt)) ||
+ (isa<PHINode>(**UseIt) &&
+ DT.dominates(LoadedCst, findInsertionPoint(*UseIt)))) &&
+ "Inserted definition does not dominate all its uses!");
+#endif
+ DEBUG(dbgs() << "Use to update " << UseIt->getOperandNo() << ":");
+ DEBUG((*UseIt)->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+ (*UseIt)->setOperand(UseIt->getOperandNo(), LoadedCst);
+ ++NumPromotedUses;
+ }
+ }
+ }
+ return HasChanged;
+}
+
+bool ARM64PromoteConstant::computeAndInsertDefinitions(Constant *Val) {
+ InsertionPointsPerFunc InsertPtsPerFunc;
+ computeInsertionPoints(Val, InsertPtsPerFunc);
+ return insertDefinitions(Val, InsertPtsPerFunc);
+}
+
+bool ARM64PromoteConstant::promoteConstant(Constant *Cst) {
+ assert(Cst && "Given variable is not a valid constant.");
+
+ if (!shouldConvert(Cst))
+ return false;
+
+ DEBUG(dbgs() << "******************************\n");
+ DEBUG(dbgs() << "Candidate constant: ");
+ DEBUG(Cst->print(dbgs()));
+ DEBUG(dbgs() << '\n');
+
+ return computeAndInsertDefinitions(Cst);
+}
+
+bool ARM64PromoteConstant::runOnFunction(Function &F) {
+  // Look for instructions using constant vectors.
+  // Promote each such constant to a global variable.
+  // Create as few loads of this variable as possible and update the uses
+  // accordingly.
+ bool LocalChange = false;
+ SmallSet<Constant *, 8> AlreadyChecked;
+
+ for (auto &MBB : F) {
+ for (auto &MI: MBB) {
+      // Traverse the operands, looking for constant vectors.
+      // Replace them by a load of a global variable of constant vector type.
+ for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands();
+ OpIdx != EndOpIdx; ++OpIdx) {
+ Constant *Cst = dyn_cast<Constant>(MI.getOperand(OpIdx));
+        // There is no point in promoting global values; they are already
+        // global. Do not promote constant expressions, as they may require
+        // some code expansion.
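+        // AlreadyChecked makes sure each distinct constant is considered at
+        // most once: insert returns true only the first time Cst is added.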
+ if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
+ AlreadyChecked.insert(Cst))
+ LocalChange |= promoteConstant(Cst);
+ }
+ }
+ }
+ return LocalChange;
+}
diff --git a/lib/Target/ARM64/ARM64RegisterInfo.cpp b/lib/Target/ARM64/ARM64RegisterInfo.cpp
new file mode 100644
index 0000000..4c7fc8a
--- /dev/null
+++ b/lib/Target/ARM64/ARM64RegisterInfo.cpp
@@ -0,0 +1,400 @@
+//===- ARM64RegisterInfo.cpp - ARM64 Register Information -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM64 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM64RegisterInfo.h"
+#include "ARM64FrameLowering.h"
+#include "ARM64InstrInfo.h"
+#include "ARM64Subtarget.h"
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "ARM64GenRegisterInfo.inc"
+
+using namespace llvm;
+
+ARM64RegisterInfo::ARM64RegisterInfo(const ARM64InstrInfo *tii,
+ const ARM64Subtarget *sti)
+ : ARM64GenRegisterInfo(ARM64::LR), TII(tii), STI(sti) {}
+
+const uint16_t *
+ARM64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ assert(MF && "Invalid MachineFunction pointer.");
+ if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
+ return CSR_ARM64_AllRegs_SaveList;
+ else
+ return CSR_ARM64_AAPCS_SaveList;
+}
+
+const uint32_t *
+ARM64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+ if (CC == CallingConv::AnyReg)
+ return CSR_ARM64_AllRegs_RegMask;
+ else
+ return CSR_ARM64_AAPCS_RegMask;
+}
+
+const uint32_t *ARM64RegisterInfo::getTLSCallPreservedMask() const {
+ if (STI->isTargetDarwin())
+ return CSR_ARM64_TLS_Darwin_RegMask;
+
+ assert(STI->isTargetELF() && "only expect Darwin or ELF TLS");
+ return CSR_ARM64_TLS_ELF_RegMask;
+}
+
+const uint32_t *
+ARM64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
+ // This should return a register mask that is the same as that returned by
+ // getCallPreservedMask but that additionally preserves the register used for
+ // the first i64 argument (which must also be the register used to return a
+ // single i64 return value)
+ //
+ // In case that the calling convention does not use the same register for
+ // both, the function should return NULL (does not currently apply)
+ return CSR_ARM64_AAPCS_ThisReturn_RegMask;
+}
+
+BitVector ARM64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  // FIXME: avoid re-calculating this every time.
+ BitVector Reserved(getNumRegs());
+ Reserved.set(ARM64::SP);
+ Reserved.set(ARM64::XZR);
+ Reserved.set(ARM64::WSP);
+ Reserved.set(ARM64::WZR);
+
+ if (TFI->hasFP(MF) || STI->isTargetDarwin()) {
+ Reserved.set(ARM64::FP);
+ Reserved.set(ARM64::W29);
+ }
+
+ if (STI->isTargetDarwin()) {
+ Reserved.set(ARM64::X18); // Platform register
+ Reserved.set(ARM64::W18);
+ }
+
+ if (hasBasePointer(MF)) {
+ Reserved.set(ARM64::X19);
+ Reserved.set(ARM64::W19);
+ }
+
+ return Reserved;
+}
+
+bool ARM64RegisterInfo::isReservedReg(const MachineFunction &MF,
+ unsigned Reg) const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+ switch (Reg) {
+ default:
+ break;
+ case ARM64::SP:
+ case ARM64::XZR:
+ case ARM64::WSP:
+ case ARM64::WZR:
+ return true;
+ case ARM64::X18:
+ case ARM64::W18:
+ return STI->isTargetDarwin();
+ case ARM64::FP:
+ case ARM64::W29:
+ return TFI->hasFP(MF) || STI->isTargetDarwin();
+ case ARM64::W19:
+ case ARM64::X19:
+ return hasBasePointer(MF);
+ }
+
+ return false;
+}
+
+const TargetRegisterClass *
+ARM64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
+ return &ARM64::GPR64RegClass;
+}
+
+const TargetRegisterClass *
+ARM64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+ if (RC == &ARM64::CCRRegClass)
+ return NULL; // Can't copy CPSR.
+ return RC;
+}
+
+unsigned ARM64RegisterInfo::getBaseRegister() const { return ARM64::X19; }
+
+bool ARM64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // In the presence of variable sized objects, if the fixed stack size is
+  // large enough that references from the FP will often be out of range, we
+  // can use a base pointer to allow access from the other direction, the way
+  // the SP normally works.
+ if (MFI->hasVarSizedObjects()) {
+ // Conservatively estimate whether the negative offset from the frame
+ // pointer will be sufficient to reach. If a function has a smallish
+ // frame, it's less likely to have lots of spills and callee saved
+ // space, so it's all more likely to be within range of the frame pointer.
+ // If it's wrong, we'll materialize the constant and still get to the
+ // object; it's just suboptimal. Negative offsets use the unscaled
+ // load/store instructions, which have a 9-bit signed immediate.
+ if (MFI->getLocalFrameSize() < 256)
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+unsigned ARM64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+ return TFI->hasFP(MF) ? ARM64::FP : ARM64::SP;
+}
+
+bool
+ARM64RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
+ return true;
+}
+
+bool ARM64RegisterInfo::requiresVirtualBaseRegisters(const MachineFunction &MF)
+ const {
+ return true;
+}
+
+bool
+ARM64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ // ARM64FrameLowering::resolveFrameIndexReference() can always fall back
+ // to the stack pointer, so only put the emergency spill slot next to the
+ // FP when there's no better way to access it (SP or base pointer).
+ return MFI->hasVarSizedObjects() && !hasBasePointer(MF);
+}
+
+bool ARM64RegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF)
+ const {
+ return true;
+}
+
+bool ARM64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ // Only consider eliminating leaf frames.
+ if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) &&
+ MFI->adjustsStack()))
+ return true;
+ return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+}
+
+/// needsFrameBaseReg - Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool ARM64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
+ int64_t Offset) const {
+ for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i)
+ assert(i < MI->getNumOperands() &&
+ "Instr doesn't have FrameIndex operand!");
+
+ // It's the load/store FI references that cause issues, as it can be difficult
+ // to materialize the offset if it won't fit in the literal field. Estimate
+ // based on the size of the local frame and some conservative assumptions
+ // about the rest of the stack frame (note, this is pre-regalloc, so
+ // we don't know everything for certain yet) whether this offset is likely
+ // to be out of range of the immediate. Return true if so.
+
+ // We only generate virtual base registers for loads and stores, so
+ // return false for everything else.
+ if (!MI->mayLoad() && !MI->mayStore())
+ return false;
+
+  // Without a virtual base register, if the function has variable sized
+  // objects, all fixed-size local references will be via the frame pointer.
+  // Approximate the offset and see if it's legal for the instruction.
+ // Note that the incoming offset is based on the SP value at function entry,
+ // so it'll be negative.
+ MachineFunction &MF = *MI->getParent()->getParent();
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // Estimate an offset from the frame pointer.
+  // Conservatively assume all callee-saved registers get pushed:
+  // FP, LR, X19-X28, D8-D15; 64 bits each.
+ int64_t FPOffset = Offset - 16 * 20;
+ // Estimate an offset from the stack pointer.
+  // The incoming offset is relative to the SP at the start of the function,
+ // but when we access the local it'll be relative to the SP after local
+ // allocation, so adjust our SP-relative offset by that allocation size.
+ Offset += MFI->getLocalFrameSize();
+ // Assume that we'll have at least some spill slots allocated.
+ // FIXME: This is a total SWAG number. We should run some statistics
+ // and pick a real one.
+ Offset += 128; // 128 bytes of spill slots
+
+ // If there is a frame pointer, try using it.
+ // The FP is only available if there is no dynamic realignment. We
+ // don't know for sure yet whether we'll need that, so we guess based
+ // on whether there are any local variables that would trigger it.
+ if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset))
+ return false;
+
+ // If we can reference via the stack pointer or base pointer, try that.
+ // FIXME: This (and the code that resolves the references) can be improved
+ // to only disallow SP relative references in the live range of
+ // the VLA(s). In practice, it's unclear how much difference that
+ // would make, but it may be worth doing.
+ if (isFrameOffsetLegal(MI, Offset))
+ return false;
+
+ // The offset likely isn't legal; we want to allocate a virtual base register.
+ return true;
+}
+
+bool ARM64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+ int64_t Offset) const {
+ assert(Offset <= INT_MAX && "Offset too big to fit in int.");
+ assert(MI && "Unable to get the legal offset for nil instruction.");
+ int SaveOffset = Offset;
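+  // isARM64FrameOffsetLegal returns a bitmask of status flags; masking with
+  // ARM64FrameOffsetIsLegal extracts the "offset is encodable" bit.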
+ return isARM64FrameOffsetLegal(*MI, SaveOffset) & ARM64FrameOffsetIsLegal;
+}
+
+/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
+/// at the beginning of the basic block.
+void ARM64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ unsigned BaseReg,
+ int FrameIdx,
+ int64_t Offset) const {
+ MachineBasicBlock::iterator Ins = MBB->begin();
+ DebugLoc DL; // Defaults to "unknown"
+ if (Ins != MBB->end())
+ DL = Ins->getDebugLoc();
+
+ const MCInstrDesc &MCID = TII->get(ARM64::ADDXri);
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ const MachineFunction &MF = *MBB->getParent();
+ MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF));
+ unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0);
+
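+  // Emit "ADD BaseReg, <frame-index>, #Offset, lsl #0"; the frame index
+  // operand is resolved to FP/SP plus a constant later, during frame index
+  // elimination.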
+ BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+ .addFrameIndex(FrameIdx)
+ .addImm(Offset)
+ .addImm(Shifter);
+}
+
+void ARM64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const {
+ int Off = Offset; // ARM doesn't need the general 64-bit offsets
+ unsigned i = 0;
+
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+ bool Done = rewriteARM64FrameIndex(MI, i, BaseReg, Off, TII);
+ assert(Done && "Unable to resolve frame index!");
+ (void)Done;
+}
+
+void ARM64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const ARM64FrameLowering *TFI = static_cast<const ARM64FrameLowering *>(
+ MF.getTarget().getFrameLowering());
+
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ unsigned FrameReg;
+ int Offset;
+
+ // Special handling of dbg_value, stackmap and patchpoint instructions.
+ if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+ Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
+ /*PreferFP=*/true);
+ Offset += MI.getOperand(FIOperandNum + 1).getImm();
+ MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ return;
+ }
+
+ // Modify MI as necessary to handle as much of 'Offset' as possible
+ Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg);
+ if (rewriteARM64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
+ return;
+
+ assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
+ "Emergency spill slot is out of reach");
+
+ // If we get here, the immediate doesn't fit into the instruction. We folded
+ // as much as possible above. Handle the rest, providing a register that is
+ // SP+LargeImm.
+ unsigned ScratchReg =
+ MF.getRegInfo().createVirtualRegister(&ARM64::GPR64RegClass);
+ emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
+ MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
+}
+
+namespace llvm {
+
+unsigned ARM64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+ switch (RC->getID()) {
+ default:
+ return 0;
+ case ARM64::GPR32RegClassID:
+ case ARM64::GPR32spRegClassID:
+ case ARM64::GPR32allRegClassID:
+ case ARM64::GPR64spRegClassID:
+ case ARM64::GPR64allRegClassID:
+ case ARM64::GPR64RegClassID:
+ case ARM64::GPR32commonRegClassID:
+ case ARM64::GPR64commonRegClassID:
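+    // Start from the 32 GPRs and subtract one for each reserved register
+    // below; each boolean converts to 1 when the corresponding register is
+    // reserved for this function.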
+ return 32 - 1 // XZR/SP
+ - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP
+ - STI->isTargetDarwin() // X18 reserved as platform register
+ - hasBasePointer(MF); // X19
+ case ARM64::FPR8RegClassID:
+ case ARM64::FPR16RegClassID:
+ case ARM64::FPR32RegClassID:
+ case ARM64::FPR64RegClassID:
+ case ARM64::FPR128RegClassID:
+ return 32;
+
+ case ARM64::DDRegClassID:
+ case ARM64::DDDRegClassID:
+ case ARM64::DDDDRegClassID:
+ case ARM64::QQRegClassID:
+ case ARM64::QQQRegClassID:
+ case ARM64::QQQQRegClassID:
+ return 32;
+
+ case ARM64::FPR128_loRegClassID:
+ return 16;
+ }
+}
+
+} // namespace llvm
diff --git a/lib/Target/ARM64/ARM64RegisterInfo.h b/lib/Target/ARM64/ARM64RegisterInfo.h
new file mode 100644
index 0000000..31d9242
--- /dev/null
+++ b/lib/Target/ARM64/ARM64RegisterInfo.h
@@ -0,0 +1,101 @@
+//===- ARM64RegisterInfo.h - ARM64 Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM64 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM64REGISTERINFO_H
+#define LLVM_TARGET_ARM64REGISTERINFO_H
+
+#define GET_REGINFO_HEADER
+#include "ARM64GenRegisterInfo.inc"
+
+namespace llvm {
+
+class ARM64InstrInfo;
+class ARM64Subtarget;
+class MachineFunction;
+class RegScavenger;
+class TargetRegisterClass;
+
+struct ARM64RegisterInfo : public ARM64GenRegisterInfo {
+private:
+ const ARM64InstrInfo *TII;
+ const ARM64Subtarget *STI;
+
+public:
+ ARM64RegisterInfo(const ARM64InstrInfo *tii, const ARM64Subtarget *sti);
+
+ bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+
+ /// Code Generation virtual methods...
+ const uint16_t *
+ getCalleeSavedRegs(const MachineFunction *MF = 0) const override;
+ const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
+
+ unsigned getCSRFirstUseCost() const {
+ // The cost will be compared against BlockFrequency where entry has the
+ // value of 1 << 14. A value of 5 will choose to spill or split really
+ // cold path instead of using a callee-saved register.
+ return 5;
+ }
+
+ // Calls involved in thread-local variable lookup save more registers than
+ // normal calls, so they need a different mask to represent this.
+ const uint32_t *getTLSCallPreservedMask() const;
+
+ /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
+ /// case that 'returned' is on an i64 first argument if the calling convention
+ /// is one that can (partially) model this attribute with a preserved mask
+ /// (i.e. it is a calling convention that uses the same register for the first
+ /// i64 argument and an i64 return value)
+ ///
+ /// Should return NULL in the case that the calling convention does not have
+ /// this property
+ const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+ bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+ bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+ bool isFrameOffsetLegal(const MachineInstr *MI,
+ int64_t Offset) const override;
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg,
+ int FrameIdx,
+ int64_t Offset) const override;
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const override;
+ void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = NULL) const override;
+ bool cannotEliminateFrame(const MachineFunction &MF) const;
+
+ bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
+ bool hasBasePointer(const MachineFunction &MF) const;
+ unsigned getBaseRegister() const;
+
+ // Debug information queries.
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_ARM64REGISTERINFO_H
diff --git a/lib/Target/ARM64/ARM64RegisterInfo.td b/lib/Target/ARM64/ARM64RegisterInfo.td
new file mode 100644
index 0000000..96001c5
--- /dev/null
+++ b/lib/Target/ARM64/ARM64RegisterInfo.td
@@ -0,0 +1,561 @@
+//===- ARM64RegisterInfo.td - Describe the ARM64 Registers -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+
+class ARM64Reg<bits<16> enc, string n, list<Register> subregs = [],
+ list<string> altNames = []>
+ : Register<n, altNames> {
+ let HWEncoding = enc;
+ let Namespace = "ARM64";
+ let SubRegs = subregs;
+}
+
+let Namespace = "ARM64" in {
+ def sub_32 : SubRegIndex<32>;
+
+ def bsub : SubRegIndex<8>;
+ def hsub : SubRegIndex<16>;
+ def ssub : SubRegIndex<32>;
+ def dsub : SubRegIndex<32>;
+ def qhisub : SubRegIndex<64>;
+ def qsub : SubRegIndex<64>;
+ // Note: Code depends on these having consecutive numbers
+ def dsub0 : SubRegIndex<64>;
+ def dsub1 : SubRegIndex<64>;
+ def dsub2 : SubRegIndex<64>;
+ def dsub3 : SubRegIndex<64>;
+ // Note: Code depends on these having consecutive numbers
+ def qsub0 : SubRegIndex<128>;
+ def qsub1 : SubRegIndex<128>;
+ def qsub2 : SubRegIndex<128>;
+ def qsub3 : SubRegIndex<128>;
+}
+
+let Namespace = "ARM64" in {
+ def vreg : RegAltNameIndex;
+ def vlist1 : RegAltNameIndex;
+}
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+def W0 : ARM64Reg<0, "w0" >, DwarfRegNum<[0]>;
+def W1 : ARM64Reg<1, "w1" >, DwarfRegNum<[1]>;
+def W2 : ARM64Reg<2, "w2" >, DwarfRegNum<[2]>;
+def W3 : ARM64Reg<3, "w3" >, DwarfRegNum<[3]>;
+def W4 : ARM64Reg<4, "w4" >, DwarfRegNum<[4]>;
+def W5 : ARM64Reg<5, "w5" >, DwarfRegNum<[5]>;
+def W6 : ARM64Reg<6, "w6" >, DwarfRegNum<[6]>;
+def W7 : ARM64Reg<7, "w7" >, DwarfRegNum<[7]>;
+def W8 : ARM64Reg<8, "w8" >, DwarfRegNum<[8]>;
+def W9 : ARM64Reg<9, "w9" >, DwarfRegNum<[9]>;
+def W10 : ARM64Reg<10, "w10">, DwarfRegNum<[10]>;
+def W11 : ARM64Reg<11, "w11">, DwarfRegNum<[11]>;
+def W12 : ARM64Reg<12, "w12">, DwarfRegNum<[12]>;
+def W13 : ARM64Reg<13, "w13">, DwarfRegNum<[13]>;
+def W14 : ARM64Reg<14, "w14">, DwarfRegNum<[14]>;
+def W15 : ARM64Reg<15, "w15">, DwarfRegNum<[15]>;
+def W16 : ARM64Reg<16, "w16">, DwarfRegNum<[16]>;
+def W17 : ARM64Reg<17, "w17">, DwarfRegNum<[17]>;
+def W18 : ARM64Reg<18, "w18">, DwarfRegNum<[18]>;
+def W19 : ARM64Reg<19, "w19">, DwarfRegNum<[19]>;
+def W20 : ARM64Reg<20, "w20">, DwarfRegNum<[20]>;
+def W21 : ARM64Reg<21, "w21">, DwarfRegNum<[21]>;
+def W22 : ARM64Reg<22, "w22">, DwarfRegNum<[22]>;
+def W23 : ARM64Reg<23, "w23">, DwarfRegNum<[23]>;
+def W24 : ARM64Reg<24, "w24">, DwarfRegNum<[24]>;
+def W25 : ARM64Reg<25, "w25">, DwarfRegNum<[25]>;
+def W26 : ARM64Reg<26, "w26">, DwarfRegNum<[26]>;
+def W27 : ARM64Reg<27, "w27">, DwarfRegNum<[27]>;
+def W28 : ARM64Reg<28, "w28">, DwarfRegNum<[28]>;
+def W29 : ARM64Reg<29, "w29">, DwarfRegNum<[29]>;
+def W30 : ARM64Reg<30, "w30">, DwarfRegNum<[30]>;
+def WSP : ARM64Reg<31, "wsp">, DwarfRegNum<[31]>;
+def WZR : ARM64Reg<31, "wzr">, DwarfRegAlias<WSP>;
+
+let SubRegIndices = [sub_32] in {
+def X0 : ARM64Reg<0, "x0", [W0]>, DwarfRegAlias<W0>;
+def X1 : ARM64Reg<1, "x1", [W1]>, DwarfRegAlias<W1>;
+def X2 : ARM64Reg<2, "x2", [W2]>, DwarfRegAlias<W2>;
+def X3 : ARM64Reg<3, "x3", [W3]>, DwarfRegAlias<W3>;
+def X4 : ARM64Reg<4, "x4", [W4]>, DwarfRegAlias<W4>;
+def X5 : ARM64Reg<5, "x5", [W5]>, DwarfRegAlias<W5>;
+def X6 : ARM64Reg<6, "x6", [W6]>, DwarfRegAlias<W6>;
+def X7 : ARM64Reg<7, "x7", [W7]>, DwarfRegAlias<W7>;
+def X8 : ARM64Reg<8, "x8", [W8]>, DwarfRegAlias<W8>;
+def X9 : ARM64Reg<9, "x9", [W9]>, DwarfRegAlias<W9>;
+def X10 : ARM64Reg<10, "x10", [W10]>, DwarfRegAlias<W10>;
+def X11 : ARM64Reg<11, "x11", [W11]>, DwarfRegAlias<W11>;
+def X12 : ARM64Reg<12, "x12", [W12]>, DwarfRegAlias<W12>;
+def X13 : ARM64Reg<13, "x13", [W13]>, DwarfRegAlias<W13>;
+def X14 : ARM64Reg<14, "x14", [W14]>, DwarfRegAlias<W14>;
+def X15 : ARM64Reg<15, "x15", [W15]>, DwarfRegAlias<W15>;
+def X16 : ARM64Reg<16, "x16", [W16]>, DwarfRegAlias<W16>;
+def X17 : ARM64Reg<17, "x17", [W17]>, DwarfRegAlias<W17>;
+def X18 : ARM64Reg<18, "x18", [W18]>, DwarfRegAlias<W18>;
+def X19 : ARM64Reg<19, "x19", [W19]>, DwarfRegAlias<W19>;
+def X20 : ARM64Reg<20, "x20", [W20]>, DwarfRegAlias<W20>;
+def X21 : ARM64Reg<21, "x21", [W21]>, DwarfRegAlias<W21>;
+def X22 : ARM64Reg<22, "x22", [W22]>, DwarfRegAlias<W22>;
+def X23 : ARM64Reg<23, "x23", [W23]>, DwarfRegAlias<W23>;
+def X24 : ARM64Reg<24, "x24", [W24]>, DwarfRegAlias<W24>;
+def X25 : ARM64Reg<25, "x25", [W25]>, DwarfRegAlias<W25>;
+def X26 : ARM64Reg<26, "x26", [W26]>, DwarfRegAlias<W26>;
+def X27 : ARM64Reg<27, "x27", [W27]>, DwarfRegAlias<W27>;
+def X28 : ARM64Reg<28, "x28", [W28]>, DwarfRegAlias<W28>;
+def FP : ARM64Reg<29, "fp", [W29]>, DwarfRegAlias<W29>;
+def LR : ARM64Reg<30, "lr", [W30]>, DwarfRegAlias<W30>;
+def SP : ARM64Reg<31, "sp", [WSP]>, DwarfRegAlias<WSP>;
+def XZR : ARM64Reg<31, "xzr", [WZR]>, DwarfRegAlias<WSP>;
+}
+
+// Condition code register.
+def CPSR : ARM64Reg<0, "cpsr">;
+
+// GPR register classes with the intersections of GPR32/GPR32sp and
+// GPR64/GPR64sp for use by the coalescer.
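+// The AltOrders below rotate the allocation order by 8 registers, so the
+// allocator prefers W8/X8 and up first; AltOrderSelect returning 1 selects
+// this alternative order unconditionally (presumably to keep the low
+// argument registers free for longer).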
+def GPR32common : RegisterClass<"ARM64", [i32], 32, (sequence "W%u", 0, 30)> {
+ let AltOrders = [(rotl GPR32common, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+def GPR64common : RegisterClass<"ARM64", [i64], 64,
+ (add (sequence "X%u", 0, 28), FP, LR)> {
+ let AltOrders = [(rotl GPR64common, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+// GPR register classes which exclude SP/WSP.
+def GPR32 : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR)> {
+ let AltOrders = [(rotl GPR32, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+def GPR64 : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR)> {
+ let AltOrders = [(rotl GPR64, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+
+// GPR register classes which include SP/WSP.
+def GPR32sp : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WSP)> {
+ let AltOrders = [(rotl GPR32sp, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+def GPR64sp : RegisterClass<"ARM64", [i64], 64, (add GPR64common, SP)> {
+ let AltOrders = [(rotl GPR64sp, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+
+// GPR register classes which include WZR/XZR AND SP/WSP. This is not a
+// constraint used by any instructions; it is used as a common super-class.
+def GPR32all : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR, WSP)>;
+def GPR64all : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR, SP)>;
+
+// For tail calls, we can't use callee-saved registers, as they are restored
+// to the saved value before the tail call, which would clobber a call address.
+// This is for indirect tail calls to store the address of the destination.
+def tcGPR64 : RegisterClass<"ARM64", [i64], 64, (sub GPR64common, X19, X20, X21,
+ X22, X23, X24, X25, X26,
+ X27, X28)>;
+
+// GPR register classes for the post-increment amount of vector load/store
+// instructions; these have alternate printing when Rm=31, printing a constant
+// immediate value equal to the total number of bytes transferred instead.
+def GPR64pi1 : RegisterOperand<GPR64, "printPostIncOperand1">;
+def GPR64pi2 : RegisterOperand<GPR64, "printPostIncOperand2">;
+def GPR64pi3 : RegisterOperand<GPR64, "printPostIncOperand3">;
+def GPR64pi4 : RegisterOperand<GPR64, "printPostIncOperand4">;
+def GPR64pi6 : RegisterOperand<GPR64, "printPostIncOperand6">;
+def GPR64pi8 : RegisterOperand<GPR64, "printPostIncOperand8">;
+def GPR64pi12 : RegisterOperand<GPR64, "printPostIncOperand12">;
+def GPR64pi16 : RegisterOperand<GPR64, "printPostIncOperand16">;
+def GPR64pi24 : RegisterOperand<GPR64, "printPostIncOperand24">;
+def GPR64pi32 : RegisterOperand<GPR64, "printPostIncOperand32">;
+def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand48">;
+def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand64">;
+
+// Condition code regclass.
+def CCR : RegisterClass<"ARM64", [i32], 32, (add CPSR)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+
+ // CCR is not allocatable.
+ let isAllocatable = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Scalar Registers
+//===----------------------------------------------------------------------===//
+
+def B0 : ARM64Reg<0, "b0">, DwarfRegNum<[64]>;
+def B1 : ARM64Reg<1, "b1">, DwarfRegNum<[65]>;
+def B2 : ARM64Reg<2, "b2">, DwarfRegNum<[66]>;
+def B3 : ARM64Reg<3, "b3">, DwarfRegNum<[67]>;
+def B4 : ARM64Reg<4, "b4">, DwarfRegNum<[68]>;
+def B5 : ARM64Reg<5, "b5">, DwarfRegNum<[69]>;
+def B6 : ARM64Reg<6, "b6">, DwarfRegNum<[70]>;
+def B7 : ARM64Reg<7, "b7">, DwarfRegNum<[71]>;
+def B8 : ARM64Reg<8, "b8">, DwarfRegNum<[72]>;
+def B9 : ARM64Reg<9, "b9">, DwarfRegNum<[73]>;
+def B10 : ARM64Reg<10, "b10">, DwarfRegNum<[74]>;
+def B11 : ARM64Reg<11, "b11">, DwarfRegNum<[75]>;
+def B12 : ARM64Reg<12, "b12">, DwarfRegNum<[76]>;
+def B13 : ARM64Reg<13, "b13">, DwarfRegNum<[77]>;
+def B14 : ARM64Reg<14, "b14">, DwarfRegNum<[78]>;
+def B15 : ARM64Reg<15, "b15">, DwarfRegNum<[79]>;
+def B16 : ARM64Reg<16, "b16">, DwarfRegNum<[80]>;
+def B17 : ARM64Reg<17, "b17">, DwarfRegNum<[81]>;
+def B18 : ARM64Reg<18, "b18">, DwarfRegNum<[82]>;
+def B19 : ARM64Reg<19, "b19">, DwarfRegNum<[83]>;
+def B20 : ARM64Reg<20, "b20">, DwarfRegNum<[84]>;
+def B21 : ARM64Reg<21, "b21">, DwarfRegNum<[85]>;
+def B22 : ARM64Reg<22, "b22">, DwarfRegNum<[86]>;
+def B23 : ARM64Reg<23, "b23">, DwarfRegNum<[87]>;
+def B24 : ARM64Reg<24, "b24">, DwarfRegNum<[88]>;
+def B25 : ARM64Reg<25, "b25">, DwarfRegNum<[89]>;
+def B26 : ARM64Reg<26, "b26">, DwarfRegNum<[90]>;
+def B27 : ARM64Reg<27, "b27">, DwarfRegNum<[91]>;
+def B28 : ARM64Reg<28, "b28">, DwarfRegNum<[92]>;
+def B29 : ARM64Reg<29, "b29">, DwarfRegNum<[93]>;
+def B30 : ARM64Reg<30, "b30">, DwarfRegNum<[94]>;
+def B31 : ARM64Reg<31, "b31">, DwarfRegNum<[95]>;
+
+let SubRegIndices = [bsub] in {
+def H0 : ARM64Reg<0, "h0", [B0]>, DwarfRegAlias<B0>;
+def H1 : ARM64Reg<1, "h1", [B1]>, DwarfRegAlias<B1>;
+def H2 : ARM64Reg<2, "h2", [B2]>, DwarfRegAlias<B2>;
+def H3 : ARM64Reg<3, "h3", [B3]>, DwarfRegAlias<B3>;
+def H4 : ARM64Reg<4, "h4", [B4]>, DwarfRegAlias<B4>;
+def H5 : ARM64Reg<5, "h5", [B5]>, DwarfRegAlias<B5>;
+def H6 : ARM64Reg<6, "h6", [B6]>, DwarfRegAlias<B6>;
+def H7 : ARM64Reg<7, "h7", [B7]>, DwarfRegAlias<B7>;
+def H8 : ARM64Reg<8, "h8", [B8]>, DwarfRegAlias<B8>;
+def H9 : ARM64Reg<9, "h9", [B9]>, DwarfRegAlias<B9>;
+def H10 : ARM64Reg<10, "h10", [B10]>, DwarfRegAlias<B10>;
+def H11 : ARM64Reg<11, "h11", [B11]>, DwarfRegAlias<B11>;
+def H12 : ARM64Reg<12, "h12", [B12]>, DwarfRegAlias<B12>;
+def H13 : ARM64Reg<13, "h13", [B13]>, DwarfRegAlias<B13>;
+def H14 : ARM64Reg<14, "h14", [B14]>, DwarfRegAlias<B14>;
+def H15 : ARM64Reg<15, "h15", [B15]>, DwarfRegAlias<B15>;
+def H16 : ARM64Reg<16, "h16", [B16]>, DwarfRegAlias<B16>;
+def H17 : ARM64Reg<17, "h17", [B17]>, DwarfRegAlias<B17>;
+def H18 : ARM64Reg<18, "h18", [B18]>, DwarfRegAlias<B18>;
+def H19 : ARM64Reg<19, "h19", [B19]>, DwarfRegAlias<B19>;
+def H20 : ARM64Reg<20, "h20", [B20]>, DwarfRegAlias<B20>;
+def H21 : ARM64Reg<21, "h21", [B21]>, DwarfRegAlias<B21>;
+def H22 : ARM64Reg<22, "h22", [B22]>, DwarfRegAlias<B22>;
+def H23 : ARM64Reg<23, "h23", [B23]>, DwarfRegAlias<B23>;
+def H24 : ARM64Reg<24, "h24", [B24]>, DwarfRegAlias<B24>;
+def H25 : ARM64Reg<25, "h25", [B25]>, DwarfRegAlias<B25>;
+def H26 : ARM64Reg<26, "h26", [B26]>, DwarfRegAlias<B26>;
+def H27 : ARM64Reg<27, "h27", [B27]>, DwarfRegAlias<B27>;
+def H28 : ARM64Reg<28, "h28", [B28]>, DwarfRegAlias<B28>;
+def H29 : ARM64Reg<29, "h29", [B29]>, DwarfRegAlias<B29>;
+def H30 : ARM64Reg<30, "h30", [B30]>, DwarfRegAlias<B30>;
+def H31 : ARM64Reg<31, "h31", [B31]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [hsub] in {
+def S0 : ARM64Reg<0, "s0", [H0]>, DwarfRegAlias<B0>;
+def S1 : ARM64Reg<1, "s1", [H1]>, DwarfRegAlias<B1>;
+def S2 : ARM64Reg<2, "s2", [H2]>, DwarfRegAlias<B2>;
+def S3 : ARM64Reg<3, "s3", [H3]>, DwarfRegAlias<B3>;
+def S4 : ARM64Reg<4, "s4", [H4]>, DwarfRegAlias<B4>;
+def S5 : ARM64Reg<5, "s5", [H5]>, DwarfRegAlias<B5>;
+def S6 : ARM64Reg<6, "s6", [H6]>, DwarfRegAlias<B6>;
+def S7 : ARM64Reg<7, "s7", [H7]>, DwarfRegAlias<B7>;
+def S8 : ARM64Reg<8, "s8", [H8]>, DwarfRegAlias<B8>;
+def S9 : ARM64Reg<9, "s9", [H9]>, DwarfRegAlias<B9>;
+def S10 : ARM64Reg<10, "s10", [H10]>, DwarfRegAlias<B10>;
+def S11 : ARM64Reg<11, "s11", [H11]>, DwarfRegAlias<B11>;
+def S12 : ARM64Reg<12, "s12", [H12]>, DwarfRegAlias<B12>;
+def S13 : ARM64Reg<13, "s13", [H13]>, DwarfRegAlias<B13>;
+def S14 : ARM64Reg<14, "s14", [H14]>, DwarfRegAlias<B14>;
+def S15 : ARM64Reg<15, "s15", [H15]>, DwarfRegAlias<B15>;
+def S16 : ARM64Reg<16, "s16", [H16]>, DwarfRegAlias<B16>;
+def S17 : ARM64Reg<17, "s17", [H17]>, DwarfRegAlias<B17>;
+def S18 : ARM64Reg<18, "s18", [H18]>, DwarfRegAlias<B18>;
+def S19 : ARM64Reg<19, "s19", [H19]>, DwarfRegAlias<B19>;
+def S20 : ARM64Reg<20, "s20", [H20]>, DwarfRegAlias<B20>;
+def S21 : ARM64Reg<21, "s21", [H21]>, DwarfRegAlias<B21>;
+def S22 : ARM64Reg<22, "s22", [H22]>, DwarfRegAlias<B22>;
+def S23 : ARM64Reg<23, "s23", [H23]>, DwarfRegAlias<B23>;
+def S24 : ARM64Reg<24, "s24", [H24]>, DwarfRegAlias<B24>;
+def S25 : ARM64Reg<25, "s25", [H25]>, DwarfRegAlias<B25>;
+def S26 : ARM64Reg<26, "s26", [H26]>, DwarfRegAlias<B26>;
+def S27 : ARM64Reg<27, "s27", [H27]>, DwarfRegAlias<B27>;
+def S28 : ARM64Reg<28, "s28", [H28]>, DwarfRegAlias<B28>;
+def S29 : ARM64Reg<29, "s29", [H29]>, DwarfRegAlias<B29>;
+def S30 : ARM64Reg<30, "s30", [H30]>, DwarfRegAlias<B30>;
+def S31 : ARM64Reg<31, "s31", [H31]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in {
+def D0 : ARM64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias<B0>;
+def D1 : ARM64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias<B1>;
+def D2 : ARM64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias<B2>;
+def D3 : ARM64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias<B3>;
+def D4 : ARM64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias<B4>;
+def D5 : ARM64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias<B5>;
+def D6 : ARM64Reg<6, "d6", [S6], ["v6", ""]>, DwarfRegAlias<B6>;
+def D7 : ARM64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias<B7>;
+def D8 : ARM64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias<B8>;
+def D9 : ARM64Reg<9, "d9", [S9], ["v9", ""]>, DwarfRegAlias<B9>;
+def D10 : ARM64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias<B10>;
+def D11 : ARM64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias<B11>;
+def D12 : ARM64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias<B12>;
+def D13 : ARM64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias<B13>;
+def D14 : ARM64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias<B14>;
+def D15 : ARM64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias<B15>;
+def D16 : ARM64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias<B16>;
+def D17 : ARM64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias<B17>;
+def D18 : ARM64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias<B18>;
+def D19 : ARM64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias<B19>;
+def D20 : ARM64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias<B20>;
+def D21 : ARM64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias<B21>;
+def D22 : ARM64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias<B22>;
+def D23 : ARM64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias<B23>;
+def D24 : ARM64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias<B24>;
+def D25 : ARM64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias<B25>;
+def D26 : ARM64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias<B26>;
+def D27 : ARM64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias<B27>;
+def D28 : ARM64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias<B28>;
+def D29 : ARM64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias<B29>;
+def D30 : ARM64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias<B30>;
+def D31 : ARM64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in {
+def Q0 : ARM64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias<B0>;
+def Q1 : ARM64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias<B1>;
+def Q2 : ARM64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias<B2>;
+def Q3 : ARM64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias<B3>;
+def Q4 : ARM64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias<B4>;
+def Q5 : ARM64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias<B5>;
+def Q6 : ARM64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias<B6>;
+def Q7 : ARM64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias<B7>;
+def Q8 : ARM64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias<B8>;
+def Q9 : ARM64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias<B9>;
+def Q10 : ARM64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias<B10>;
+def Q11 : ARM64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias<B11>;
+def Q12 : ARM64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias<B12>;
+def Q13 : ARM64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias<B13>;
+def Q14 : ARM64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias<B14>;
+def Q15 : ARM64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias<B15>;
+def Q16 : ARM64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias<B16>;
+def Q17 : ARM64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias<B17>;
+def Q18 : ARM64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias<B18>;
+def Q19 : ARM64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias<B19>;
+def Q20 : ARM64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias<B20>;
+def Q21 : ARM64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias<B21>;
+def Q22 : ARM64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias<B22>;
+def Q23 : ARM64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias<B23>;
+def Q24 : ARM64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias<B24>;
+def Q25 : ARM64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias<B25>;
+def Q26 : ARM64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias<B26>;
+def Q27 : ARM64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias<B27>;
+def Q28 : ARM64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias<B28>;
+def Q29 : ARM64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias<B29>;
+def Q30 : ARM64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias<B30>;
+def Q31 : ARM64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
+}
+
+def FPR8 : RegisterClass<"ARM64", [untyped], 8, (sequence "B%u", 0, 31)> {
+ let Size = 8;
+}
+def FPR16 : RegisterClass<"ARM64", [untyped], 16, (sequence "H%u", 0, 31)> {
+ let Size = 16;
+}
+def FPR32 : RegisterClass<"ARM64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
+def FPR64 : RegisterClass<"ARM64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
+ v1i64],
+ 64, (sequence "D%u", 0, 31)>;
+// We don't (yet) have an f128 legal type, so don't use that here. We
+// normalize 128-bit vectors to v2f64 for arg passing and such, so use
+// that here.
+def FPR128 : RegisterClass<"ARM64",
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128],
+ 128, (sequence "Q%u", 0, 31)>;
+
+// The lower 16 vector registers. Some instructions can only take registers
+// in this range.
+def FPR128_lo : RegisterClass<"ARM64",
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (trunc FPR128, 16)>;
+
+// Pairs, triples, and quads of 64-bit vector registers.
+def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>;
+def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2],
+ [(rotl FPR64, 0), (rotl FPR64, 1),
+ (rotl FPR64, 2)]>;
+def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3],
+ [(rotl FPR64, 0), (rotl FPR64, 1),
+ (rotl FPR64, 2), (rotl FPR64, 3)]>;
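+// For illustration: RegisterTuples combined with the rotl operands above
+// expands to overlapping, wrapping sequences, so DSeqPairs provides the 32
+// pairs D0_D1, D1_D2, ..., D31_D0, and DSeqTriples/DSeqQuads extend the same
+// pattern to three and four consecutive D registers.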
+def DD : RegisterClass<"ARM64", [untyped], 64, (add DSeqPairs)> {
+ let Size = 128;
+}
+def DDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqTriples)> {
+ let Size = 192;
+}
+def DDDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqQuads)> {
+ let Size = 256;
+}
+
+// Pairs, triples, and quads of 128-bit vector registers.
+def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>;
+def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2],
+ [(rotl FPR128, 0), (rotl FPR128, 1),
+ (rotl FPR128, 2)]>;
+def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3],
+ [(rotl FPR128, 0), (rotl FPR128, 1),
+ (rotl FPR128, 2), (rotl FPR128, 3)]>;
+def QQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqPairs)> {
+ let Size = 256;
+}
+def QQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqTriples)> {
+ let Size = 384;
+}
+def QQQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqQuads)> {
+ let Size = 512;
+}
+
+
+// Vector operand versions of the FP registers. Alternate name printing and
+// assembler matching.
+def VectorRegAsmOperand : AsmOperandClass { let Name = "VectorReg"; }
+let ParserMatchClass = VectorRegAsmOperand in {
+def V64 : RegisterOperand<FPR64, "printVRegOperand">;
+def V128 : RegisterOperand<FPR128, "printVRegOperand">;
+def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand">;
+}
+
+class TypedVecListAsmOperand<int count, int regsize, int lanes, string kind>
+ : AsmOperandClass {
+ let Name = "TypedVectorList" # count # "_" # lanes # kind;
+
+ let PredicateMethod
+ = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>";
+ let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">";
+}
+
+class TypedVecListRegOperand<RegisterClass Reg, int lanes, string kind>
+ : RegisterOperand<Reg, "printTypedVectorList<" # lanes # ", '"
+ # kind # "'>">;
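+// For example, TypedVecListAsmOperand<2, 64, 8, "b"> yields an operand class
+// named "TypedVectorList2_8b" whose predicate is isTypedVectorList<2, 8, 'b'>
+// and whose render method is addVectorList64Operands<2>;
+// TypedVecListRegOperand pairs it with the printTypedVectorList<8, 'b'>
+// printer.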
+
+multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
+ // Lists with implicit element types (the type usually comes from the
+ // instruction itself). E.g. { v0, v1 }
+ def _64AsmOperand : AsmOperandClass {
+ let Name = NAME # "64";
+ let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
+ let RenderMethod = "addVectorList64Operands<" # count # ">";
+ }
+
+ def "64" : RegisterOperand<Reg64, "printImplicitlyTypedVectorList"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_64AsmOperand");
+ }
+
+ def _128AsmOperand : AsmOperandClass {
+ let Name = NAME # "128";
+ let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
+ let RenderMethod = "addVectorList128Operands<" # count # ">";
+ }
+
+ def "128" : RegisterOperand<Reg128, "printImplicitlyTypedVectorList"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_128AsmOperand");
+ }
+
+ // 64-bit register lists with explicit type.
+
+ // { v0.8b, v1.8b }
+ def _8bAsmOperand : TypedVecListAsmOperand<count, 64, 8, "b">;
+ def "8b" : TypedVecListRegOperand<Reg64, 8, "b"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8bAsmOperand");
+ }
+
+ // { v0.4h, v1.4h }
+ def _4hAsmOperand : TypedVecListAsmOperand<count, 64, 4, "h">;
+ def "4h" : TypedVecListRegOperand<Reg64, 4, "h"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4hAsmOperand");
+ }
+
+ // { v0.2s, v1.2s }
+ def _2sAsmOperand : TypedVecListAsmOperand<count, 64, 2, "s">;
+ def "2s" : TypedVecListRegOperand<Reg64, 2, "s"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2sAsmOperand");
+ }
+
+ // { v0.1d, v1.1d }
+ def _1dAsmOperand : TypedVecListAsmOperand<count, 64, 1, "d">;
+ def "1d" : TypedVecListRegOperand<Reg64, 1, "d"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_1dAsmOperand");
+ }
+
+ // 128-bit register lists with explicit type
+
+ // { v0.16b, v1.16b }
+ def _16bAsmOperand : TypedVecListAsmOperand<count, 128, 16, "b">;
+ def "16b" : TypedVecListRegOperand<Reg128, 16, "b"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_16bAsmOperand");
+ }
+
+ // { v0.8h, v1.8h }
+ def _8hAsmOperand : TypedVecListAsmOperand<count, 128, 8, "h">;
+ def "8h" : TypedVecListRegOperand<Reg128, 8, "h"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8hAsmOperand");
+ }
+
+ // { v0.4s, v1.4s }
+ def _4sAsmOperand : TypedVecListAsmOperand<count, 128, 4, "s">;
+ def "4s" : TypedVecListRegOperand<Reg128, 4, "s"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4sAsmOperand");
+ }
+
+ // { v0.2d, v1.2d }
+ def _2dAsmOperand : TypedVecListAsmOperand<count, 128, 2, "d">;
+ def "2d" : TypedVecListRegOperand<Reg128, 2, "d"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2dAsmOperand");
+ }
+
+ // { v0.b, v1.b }
+ def _bAsmOperand : TypedVecListAsmOperand<count, 128, 0, "b">;
+ def "b" : TypedVecListRegOperand<Reg128, 0, "b"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_bAsmOperand");
+ }
+
+ // { v0.h, v1.h }
+ def _hAsmOperand : TypedVecListAsmOperand<count, 128, 0, "h">;
+ def "h" : TypedVecListRegOperand<Reg128, 0, "h"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_hAsmOperand");
+ }
+
+ // { v0.s, v1.s }
+ def _sAsmOperand : TypedVecListAsmOperand<count, 128, 0, "s">;
+ def "s" : TypedVecListRegOperand<Reg128, 0, "s"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_sAsmOperand");
+ }
+
+ // { v0.d, v1.d }
+ def _dAsmOperand : TypedVecListAsmOperand<count, 128, 0, "d">;
+ def "d" : TypedVecListRegOperand<Reg128, 0, "d"> {
+ let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_dAsmOperand");
+ }
+
+
+}
+
+defm VecListOne : VectorList<1, FPR64, FPR128>;
+defm VecListTwo : VectorList<2, DD, QQ>;
+defm VecListThree : VectorList<3, DDD, QQQ>;
+defm VecListFour : VectorList<4, DDDD, QQQQ>;
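+// Each defm above expands the multiclass into a family of operands, e.g.
+// VecListTwo64, VecListTwo128, VecListTwo8b, ..., VecListTwo2d, plus the
+// per-lane forms VecListTwob/h/s/d, each tied to its own AsmOperandClass.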
+
+
+// Register operand versions of the scalar FP registers.
+def FPR16Op : RegisterOperand<FPR16, "printOperand">;
+def FPR32Op : RegisterOperand<FPR32, "printOperand">;
+def FPR64Op : RegisterOperand<FPR64, "printOperand">;
+def FPR128Op : RegisterOperand<FPR128, "printOperand">;
diff --git a/lib/Target/ARM64/ARM64SchedCyclone.td b/lib/Target/ARM64/ARM64SchedCyclone.td
new file mode 100644
index 0000000..65c68b3
--- /dev/null
+++ b/lib/Target/ARM64/ARM64SchedCyclone.td
@@ -0,0 +1,852 @@
+//=- ARM64SchedCyclone.td - ARM64 Cyclone Scheduling Defs ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for ARM64 Cyclone to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def CycloneModel : SchedMachineModel {
+ let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
+ let MicroOpBufferSize = 192; // Based on the reorder buffer.
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 16; // 14-19 cycles are typical.
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Cyclone.
+
+// 4 integer pipes
+def CyUnitI : ProcResource<4> {
+ let BufferSize = 48;
+}
+
+// 2 branch units: I[0..1]
+def CyUnitB : ProcResource<2> {
+ let Super = CyUnitI;
+ let BufferSize = 24;
+}
+
+// 1 indirect-branch unit: I[0]
+def CyUnitBR : ProcResource<1> {
+ let Super = CyUnitB;
+}
+
+// 2 shifter pipes: I[2..3]
+// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI
+def CyUnitIS : ProcResource<2> {
+ let Super = CyUnitI;
+ let BufferSize = 24;
+}
+
+// 1 mul pipe: I[0]
+def CyUnitIM : ProcResource<1> {
+ let Super = CyUnitBR;
+ let BufferSize = 32;
+}
+
+// 1 div pipe: I[1]
+def CyUnitID : ProcResource<1> {
+ let Super = CyUnitB;
+ let BufferSize = 16;
+}
+
+// 1 integer division unit. This is driven by the ID pipe, but only
+// consumes the pipe for one cycle at issue and another cycle at writeback.
+def CyUnitIntDiv : ProcResource<1>;
+
+// 2 ld/st pipes.
+def CyUnitLS : ProcResource<2> {
+ let BufferSize = 28;
+}
+
+// 3 fp/vector pipes.
+def CyUnitV : ProcResource<3> {
+ let BufferSize = 48;
+}
+// 2 fp/vector arithmetic and multiply pipes: V[0-1]
+def CyUnitVM : ProcResource<2> {
+ let Super = CyUnitV;
+ let BufferSize = 32;
+}
+// 1 fp/vector division/sqrt pipe: V[2]
+def CyUnitVD : ProcResource<1> {
+ let Super = CyUnitV;
+ let BufferSize = 16;
+}
+// 1 fp compare pipe: V[0]
+def CyUnitVC : ProcResource<1> {
+ let Super = CyUnitVM;
+ let BufferSize = 16;
+}
+
+// 2 fp division/square-root units. These are driven by the VD pipe,
+// but only consume the pipe for one cycle at issue and a cycle at writeback.
+def CyUnitFloatDiv : ProcResource<2>;
+
+//===----------------------------------------------------------------------===//
+// Define scheduler read/write resources and latency on Cyclone.
+// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
+
+let SchedModel = CycloneModel in {
+
+//---
+// 7.8.1. Moves
+//---
+
+// A single nop micro-op (uX).
+def WriteX : SchedWriteRes<[]> { let Latency = 0; }
+
+// Move zero is a register rename (to machine register zero).
+// The move is replaced by a single nop micro-op.
+// MOVZ Rd, #0
+// AND Rd, Rzr, #imm
+def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>;
+def WriteImmZ : SchedWriteVariant<[
+ SchedVar<WriteZPred, [WriteX]>,
+ SchedVar<NoSchedPred, [WriteImm]>]>;
+def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
+
+// Move GPR is a register rename and single nop micro-op.
+// ORR Xd, XZR, Xm
+// ADD Xd, Xn, #0
+def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>;
+def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>;
+def WriteMov : SchedWriteVariant<[
+ SchedVar<WriteIMovPred, [WriteX]>,
+ SchedVar<WriteVMovPred, [WriteX]>,
+ SchedVar<NoSchedPred, [WriteI]>]>;
+def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;
+
+// Move non-zero immediate is an integer ALU op.
+// MOVN,MOVZ,MOVK
+def : WriteRes<WriteImm, [CyUnitI]>;
+
+//---
+// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
+// Shifts and Bitfield Operations
+//---
+
+// ADR,ADRP
+// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
+// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
+// ADC(S),SBC(S)
+// Aliases: CMN, CMP, TST
+//
+// Conditional operations.
+// CCMNi,CCMPi,CCMNr,CCMPr,
+// CSEL,CSINC,CSINV,CSNEG
+//
+// Bit counting and reversal operations.
+// CLS,CLZ,RBIT,REV,REV16,REV32
+def : WriteRes<WriteI, [CyUnitI]>;
+
+// ADD with shifted register operand is a single micro-op that
+// consumes a shift pipeline for two cycles.
+// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
+// EXAMPLE: ADDrs Xn, Xm LSL #imm
+def : WriteRes<WriteISReg, [CyUnitIS]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// ADD with extended register operand is the same as shifted reg operand.
+// ADD(S)re,SUB(S)re
+// EXAMPLE: ADDXre Xn, Xm, UXTB #1
+def : WriteRes<WriteIEReg, [CyUnitIS]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// Variable shift and bitfield operations.
+// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
+def : WriteRes<WriteIS, [CyUnitIS]>;
+
+// EXTR Shifts a pair of registers and requires two micro-ops.
+// The second micro-op is delayed, as modeled by ReadExtrHi.
+// EXTR Xn, Xm, #imm
+def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// EXTR's first register read is delayed by one cycle, effectively
+// shortening its writer's latency.
+// EXTR Xn, Xm, #imm
+def : ReadAdvance<ReadExtrHi, 1>;
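+// For example, with a ReadAdvance of 1, a producer whose result normally has
+// latency 2 is effectively available to this operand after 1 cycle, because
+// the read itself happens one cycle after issue.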
+
+//---
+// 7.8.6. Multiplies
+//---
+
+// MUL/MNEG are aliases for MADD/MSUB.
+// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
+def : WriteRes<WriteIM32, [CyUnitIM]> {
+ let Latency = 4;
+}
+// MADDX,MSUBX,SMULH,UMULH
+def : WriteRes<WriteIM64, [CyUnitIM]> {
+ let Latency = 5;
+}
+
+//---
+// 7.8.7. Divide
+//---
+
+// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
+// The ID pipe is consumed for 2 cycles: issue and writeback.
+// SDIVW,UDIVW
+def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 10];
+}
+// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
+// The ID pipe is consumed for 2 cycles: issue and writeback.
+// SDIVX,UDIVX
+def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
+ let Latency = 13;
+ let ResourceCycles = [2, 13];
+}
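+// Note that the ResourceCycles entries above also keep CyUnitIntDiv busy for
+// the full 10 or 13 cycles, so back-to-back divides are throughput-limited by
+// the divider even though the ID pipe itself is only held for 2 cycles.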
+
+//---
+// 7.8.8,7.8.10. Load/Store, single element
+//---
+
+// Integer loads take 4 cycles and use one LS unit for one cycle.
+def : WriteRes<WriteLD, [CyUnitLS]> {
+ let Latency = 4;
+}
+
+// Store-load forwarding is 4 cycles.
+//
+// Note: The store-exclusive sequence incorporates this
+// latency. However, general heuristics should not model the
+// dependence between a store and subsequent may-alias load because
+// hardware speculation works.
+def : WriteRes<WriteST, [CyUnitLS]> {
+ let Latency = 4;
+}
+
+// Load from base address plus an optionally scaled register offset.
+// Rt latency is latency WriteIS + WriteLD.
+// EXAMPLE: LDR Xn, Xm [, lsl 3]
+def CyWriteLDIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
+ SchedVar<NoSchedPred, [WriteLD]>]>; // Load from register offset.
+def : SchedAlias<WriteLDIdx, CyWriteLDIdx>; // Map ARM64->Cyclone type.
+
+// EXAMPLE: STR Xn, Xm [, lsl 3]
+def CyWriteSTIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
+ SchedVar<NoSchedPred, [WriteST]>]>; // Store to register offset.
+def : SchedAlias<WriteSTIdx, CyWriteSTIdx>; // Map ARM64->Cyclone type.
+
+// Read the (unshifted) base register Xn in the second micro-op one cycle later.
+// EXAMPLE: LDR Xn, Xm [, lsl 3]
+def ReadBaseRS : SchedReadAdvance<1>;
+def CyReadAdrBase : SchedReadVariant<[
+ SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
+ SchedVar<NoSchedPred, [ReadDefault]>]>; // Read base reg with no shift.
+def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map ARM64->Cyclone type.
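+// As a rough example: a scaled-register load such as LDR Xn, [Xm, Xo, lsl #3]
+// is modeled as WriteIS followed by WriteLD, i.e. about 1 + 4 = 5 cycles to
+// Xn on a model where WriteIS is 1 cycle and WriteLD is 4, while an unscaled
+// register offset keeps the plain 4-cycle WriteLD latency.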
+
+//---
+// 7.8.9,7.8.11. Load/Store, paired
+//---
+
+// Address pre/post increment is a simple ALU op with one cycle latency.
+def : WriteRes<WriteAdr, [CyUnitI]>;
+
+// LDP high register write is fused with the load, but a nop micro-op remains.
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 4;
+}
+
+// STP is a vector op and store, except for Q-register pairs (STPQi), which
+// are just two stores.
+def : SchedAlias<WriteSTP, WriteVSTShuffle>;
+def : InstRW<[WriteST, WriteST], (instrs STPQi)>;
+
+//---
+// 7.8.13. Branches
+//---
+
+// Branches take a single micro-op.
+// The misprediction penalty is defined as a SchedMachineModel property.
+def : WriteRes<WriteBr, [CyUnitB]> {let Latency = 0;}
+def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;}
+
+//---
+// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
+//---
+
+// NOP,SEV,SEVL,WFE,WFI,YIELD
+def : WriteRes<WriteHint, []> {let Latency = 0;}
+// ISB
+def : InstRW<[WriteI], (instrs ISB)>;
+// CLREX,DMB,DSB
+def : WriteRes<WriteBarrier, [CyUnitLS]>;
+
+// System instructions get an invalid latency because the latency of
+// other operations across them is meaningless.
+def : WriteRes<WriteSys, []> {let Latency = -1;}
+
+//===----------------------------------------------------------------------===//
+// 7.9 Vector Unit Instructions
+
+// Simple vector operations take 2 cycles.
+def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
+
+// Define some longer latency vector op types for Cyclone.
+def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;}
+def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;}
+def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;}
+
+// Simple floating-point operations take 2 cycles.
+def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;}
+
+//---
+// 7.9.1 Vector Moves
+//---
+
+// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
+// generates expensive int-float conversion instead:
+// FMOVDi Dd, #0.0
+// FMOVv2f64ns Vd.2d, #0.0
+
+// FMOVSi,FMOVDi
+def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
+
+// MOVI,MVNI are WriteV
+// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV
+
+// Move FPR is a register rename and single nop micro-op.
+// ORR.16b Vd,Vn,Vn
+// COPY is handled above in the WriteMov Variant.
+def WriteVMov : SchedWriteVariant<[
+ SchedVar<WriteVMovPred, [WriteX]>,
+ SchedVar<NoSchedPred, [WriteV]>]>;
+def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
+
+// FMOVSr,FMOVDr are WriteF.
+
+// MOV V,V is a WriteV.
+
+// CPY D,V[x] is a WriteV
+
+// INS V[x],V[y] is a WriteV.
+
+// FMOVWSr,FMOVXDr,FMOVXDHighr
+def : SchedAlias<WriteFCopy, WriteVLD>;
+
+// FMOVSWr,FMOVDXr
+def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
+
+// INS V[x],R
+def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
+def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
+
+// SMOV,UMOV R,V[x]
+def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
+def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;
+
+// DUP V,R
+def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
+
+// DUP V,V[x] is a WriteV.
+
+//---
+// 7.9.2 Integer Arithmetic, Logical, and Comparisons
+//---
+
+// BIC,ORR V,#imm are WriteV
+
+def : InstRW<[CyWriteV3], (instregex "ABSv")>;
+
+// MVN,NEG,NOT are WriteV
+
+def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;
+
+// ADDP is a WriteV.
+def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
+def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;
+
+def : InstRW<[CyWriteV3],
+ (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;
+
+// ADD,SUB are WriteV
+
+// Forward declare.
+def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+
+// Add/Diff and accumulate uses the vector multiply unit.
+def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
+def CyReadVAccum : SchedReadAdvance<1,
+ [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;
+
+def : InstRW<[CyWriteVAccum, CyReadVAccum],
+ (instregex "SADALP","UADALP")>;
+
+def : InstRW<[CyWriteVAccum, CyReadVAccum],
+ (instregex "SABAv","UABAv","SABALv","UABALv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;
+
+def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;
+
+// WriteV includes:
+// AND,BIC,CMTST,EOR,ORN,ORR
+// ADDP
+// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
+// SADDL,SSUBL,UADDL,USUBL
+// SADDW,SSUBW,UADDW,USUBW
+
+def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv",
+ "CMLEv","CMLTv",
+ "CMHIv","CMHSv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv",
+ "SMAXPv","SMINPv","UMAXPv","UMINPv")>;
+
+def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv",
+ "SABDLv","UABDLv")>;
+
+//---
+// 7.9.3 Floating Point Arithmetic and Comparisons
+//---
+
+// FABS,FNEG are WriteF
+
+def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
+def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;
+
+def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
+ "FMINPv2i","FMINNMPv2i")>;
+
+def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;
+
+def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
+ FSUBSrr,FSUBv2f32,FSUBv4f32,
+ FADDPv2f32,FADDPv4f32,
+ FABD32,FABDv2f32,FABDv4f32)>;
+def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
+ FSUBDrr,FSUBv2f64,
+ FADDPv2f64,
+ FABD64,FABDv2f64)>;
+
+def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;
+
+def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
+ "FMAXS","FMAXD","FMAXv",
+ "FMINS","FMIND","FMINv",
+ "FMAXNMS","FMAXNMD","FMAXNMv",
+ "FMINNMS","FMINNMD","FMINNMv",
+ "FMAXPv2f","FMAXPv4f",
+ "FMINPv2f","FMINPv4f",
+ "FMAXNMPv2f","FMAXNMPv4f",
+ "FMINNMPv2f","FMINNMPv4f")>;
+
+// FCMP,FCMPE,FCCMP,FCCMPE
+def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;}
+
+// FCSEL is a WriteF.
+
+//---
+// 7.9.4 Shifts and Bitfield Operations
+//---
+
+// SHL is a WriteV
+
+def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
+def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;
+
+def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;
+
+// Shift and accumulate uses the vector multiply unit.
+def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
+def CyReadVShiftAcc : SchedReadAdvance<1,
+ [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
+def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
+ (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
+
+// SSHL,USHL are WriteV.
+
+def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;
+
+// SQSHL,SQSHLU,UQSHL are WriteV.
+
+def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
+
+// WriteV includes:
+// SHLL,SSHLL,USHLL
+// SLI,SRI
+// BIF,BIT,BSL
+// EXT
+// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
+// XTN2
+
+def : InstRW<[CyWriteV4],
+ (instregex "RSHRNv","SHRNv",
+ "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
+ "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
+
+//---
+// 7.9.5 Multiplication
+//---
+
+def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;}
+def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
+ "SQDMULLv","SQDMULHv","SQRDMULHv")>;
+
+// FMUL,FMULX,FNMUL default to WriteFMul.
+def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;}
+
+def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;}
+def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
+ FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;
+
+def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
+def : InstRW<[CyWriteVMul, CyReadVMulAcc],
+ (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
+ "SQDMLAL","SQDMLSL")>;
+
+def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;}
+def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;}
+def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
+def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;
+
+def : InstRW<[CyWriteSMul, CyReadSMul],
+ (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
+ FMLAv2f32,FMLAv4f32,
+ FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
+def : InstRW<[CyWriteDMul, CyReadDMul],
+ (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
+ FMLAv2f64,FMLAv2i64_indexed,
+ FMLSv2f64,FMLSv2i64_indexed)>;
+
+def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
+def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
+
+//---
+// 7.9.6 Divide and Square Root
+//---
+
+// FDIV,FSQRT
+// TODO: Add 64-bit variant with 19 cycle latency.
+// TODO: Specialize FSQRT for longer latency.
+def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
+ let Latency = 17;
+ let ResourceCycles = [2, 17];
+}
+
+def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;
+
+def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
+def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;
+
+def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
+def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
+def : InstRW<[WriteFRECPS], (instregex "FRECPSv")>;
+def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
+
+//---
+// 7.9.7 Integer-FP Conversions
+//---
+
+// FCVT lengthen f16/s32
+def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
+
+// FCVT,FCVTN,FCVTXN
+// SCVTF,UCVTF V,V
+// FRINT(AIMNPXZ) V,V
+def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}
+
+// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles.
+def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
+def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
+
+// FCVT Rd, S/D = V6+LD4: 10 cycles
+def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
+def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;
+
+// FCVTL is a WriteV
+
+//---
+// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
+//---
+
+def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
+def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
+ AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
+ SHA1SU0rrr)>;
+
+def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
+def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;
+
+def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
+def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
+ SHA256Hrrr,SHA256H2rrr)>;
+
+// TRN,UZP,ZUP are WriteV.
+
+// TBL,TBX are WriteV.
+
+//---
+// 7.9.11-7.9.14 Load/Store, single element and paired
+//---
+
+// Loading into the vector unit takes 5 cycles vs 4 for integer loads.
+def : WriteRes<WriteVLD, [CyUnitLS]> {
+ let Latency = 5;
+}
+
+// Store-load forwarding is 4 cycles.
+def : WriteRes<WriteVST, [CyUnitLS]> {
+ let Latency = 4;
+}
+
+// WriteVLDPair/VSTPair sequences are expanded by the target description.
+
+//---
+// 7.9.15 Load, element operations
+//---
+
+// Only the first WriteVLD and the WriteAdr for writeback match def operands.
+// Subsequent WriteVLDs consume resources. Since all loaded values have the
+// same latency, this is acceptable.
+
+// Vd is read 5 cycles after issuing the vector load.
+def : ReadAdvance<ReadVLD, 5>;
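+// Since WriteVLD itself has a 5-cycle latency, this in effect means a prior
+// write to Vd with a latency of up to 5 cycles does not delay a lane load
+// that merely preserves the other lanes.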
+
+def : InstRW<[WriteVLD],
+ (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr],
+ (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+// Register writes from the load's high half are fused micro-ops.
+def : InstRW<[WriteVLD],
+ (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr],
+ (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD],
+ (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+ (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLD, WriteVLD],
+ (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+ (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
+ (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
+ (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLD, WriteVLD],
+ (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+ (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
+ (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
+ (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD],
+ (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
+ (instregex "LD1i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD], (instrs LD1i64)>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
+
+def : InstRW<[WriteVLDShuffle],
+ (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr],
+ (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV],
+ (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
+ (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
+ (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
+ (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
+ (instregex "LD2i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
+ (instregex "LD2i(8|16|32)_POST")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
+ (instregex "LD2i64$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
+ (instregex "LD2i64_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV],
+ (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
+ (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+ (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+ (instregex "LD3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
+ (instregex "LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
+ (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
+ (instregex "LD3i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
+ (instregex "LD3i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
+ (instregex "LD3i64$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+ (instregex "LD3i64_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
+ (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
+ (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+ (instrs LD3Rv1d,LD3Rv2d)>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+                      (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+ (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+ (instregex "LD4Fourv(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
+ WriteVLDPairShuffle, WriteVLDPairShuffle],
+ (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
+ WriteVLDPairShuffle, WriteVLDPairShuffle],
+ (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
+ (instregex "LD4i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
+ (instregex "LD4i(8|16|32)_POST")>;
+
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
+ (instrs LD4i64)>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+ (instrs LD4i64_POST)>;
+
+def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
+ (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
+ (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+ (instrs LD4Rv1d,LD4Rv2d)>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+ (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
+
+//---
+// 7.9.16 Store, element operations
+//---
+
+// Only the WriteAdr for writeback matches a def operand.
+// The subsequent WriteVSTs only consume resources.
+
+def : InstRW<[WriteVST],
+ (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST],
+ (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],
+ (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle],
+ (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST],
+ (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST],
+ (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVST],
+ (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
+ (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST],
+ (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
+ (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
+ (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
+ (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle], (instregex "ST1i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVSTShuffle], (instrs ST1i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;
+
+def : InstRW<[WriteVSTShuffle],
+ (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle],
+ (instregex "ST2Twov(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle], (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
+def : InstRW<[WriteVSTShuffle], (instrs ST2i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
+ (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle], (instregex "ST3i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;
+
+def :InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64)>;
+def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;
+
+def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
+ (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
+ (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
+ WriteVSTPairShuffle, WriteVSTPairShuffle],
+ (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
+ WriteVSTPairShuffle, WriteVSTPairShuffle],
+ (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTPairShuffle], (instregex "ST4i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
+
+} // SchedModel = CycloneModel
diff --git a/lib/Target/ARM64/ARM64Schedule.td b/lib/Target/ARM64/ARM64Schedule.td
new file mode 100644
index 0000000..52f9262
--- /dev/null
+++ b/lib/Target/ARM64/ARM64Schedule.td
@@ -0,0 +1,92 @@
+//===-- ARM64Schedule.td - ARM64 Scheduling Definitions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Define TII for use in SchedVariant Predicates.
+// const MachineInstr *MI and const TargetSchedModel *SchedModel
+// are defined by default.
+def : PredicateProlog<[{
+ const ARM64InstrInfo *TII =
+ static_cast<const ARM64InstrInfo*>(SchedModel->getInstrInfo());
+ (void)TII;
+}]>;
+
+// ARM64 Scheduler Definitions
+
+def WriteImm : SchedWrite; // MOVN, MOVZ
+// TODO: Provide variants for MOV32/64imm Pseudos that dynamically
+// select the correct sequence of WriteImms.
+
+def WriteI : SchedWrite; // ALU
+def WriteISReg : SchedWrite; // ALU of Shifted-Reg
+def WriteIEReg : SchedWrite; // ALU of Extended-Reg
+def WriteExtr : SchedWrite; // EXTR shifts a reg pair
+def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair
+def WriteIS : SchedWrite; // Shift/Scale
+def WriteID32 : SchedWrite; // 32-bit Divide
+def WriteID64 : SchedWrite; // 64-bit Divide
+def WriteIM32 : SchedWrite; // 32-bit Multiply
+def WriteIM64 : SchedWrite; // 64-bit Multiply
+def WriteBr : SchedWrite; // Branch
+def WriteBrReg : SchedWrite; // Indirect Branch
+
+def WriteLD : SchedWrite; // Load from base addr plus immediate offset
+def WriteST : SchedWrite; // Store to base addr plus immediate offset
+def WriteSTP : SchedWrite; // Store a register pair.
+def WriteAdr : SchedWrite; // Address pre/post increment.
+
+def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled).
+def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
+def ReadAdrBase : SchedRead; // Read the base register of a reg-offset LD/ST.
+
+// ScaledIdxPred is true if a WriteLDIdx operand will be
+// scaled. Subtargets can use this to dynamically select resources and
+// latency for WriteLDIdx and ReadAdrBase.
+def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>;
+
+// Serialized two-level address load.
+// EXAMPLE: LOADGot
+def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>;
+
+// Serialized two-level address lookup.
+// EXAMPLE: MOVaddr...
+def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>;
+
+// The second register of a load-pair.
+// LDP,LDPSW,LDNP,LDXP,LDAXP
+def WriteLDHi : SchedWrite;
+
+// Store-exclusive is a store followed by a dependent load.
+def WriteSTX : WriteSequence<[WriteST, WriteLD]>;
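+// Because WriteSTX is a WriteSequence, its latency is simply the sum of the
+// subtarget's WriteST and WriteLD latencies (e.g. 4 + 4 = 8 cycles on a
+// model where both are 4 cycles).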
+
+def WriteSys : SchedWrite; // Long, variable latency system ops.
+def WriteBarrier : SchedWrite; // Memory barrier.
+def WriteHint : SchedWrite; // Hint instruction.
+
+def WriteF : SchedWrite; // General floating-point ops.
+def WriteFCmp : SchedWrite; // Floating-point compare.
+def WriteFCvt : SchedWrite; // Float conversion.
+def WriteFCopy : SchedWrite; // Float-int register copy.
+def WriteFImm : SchedWrite; // Floating-point immediate.
+def WriteFMul : SchedWrite; // Floating-point multiply.
+def WriteFDiv : SchedWrite; // Floating-point division.
+
+def WriteV : SchedWrite; // Vector ops.
+def WriteVLD : SchedWrite; // Vector loads.
+def WriteVST : SchedWrite; // Vector stores.
+
+// Read the unwritten lanes of the VLD's destination registers.
+def ReadVLD : SchedRead;
+
+// Sequential vector load and shuffle.
+def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>;
+def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>;
+
+// Store a shuffled vector.
+def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>;
+def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>;
diff --git a/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp b/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp
new file mode 100644
index 0000000..79d507f
--- /dev/null
+++ b/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp
@@ -0,0 +1,57 @@
+//===-- ARM64SelectionDAGInfo.cpp - ARM64 SelectionDAG Info ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM64SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm64-selectiondag-info"
+#include "ARM64TargetMachine.h"
+using namespace llvm;
+
+ARM64SelectionDAGInfo::ARM64SelectionDAGInfo(const TargetMachine &TM)
+ : TargetSelectionDAGInfo(TM),
+ Subtarget(&TM.getSubtarget<ARM64Subtarget>()) {}
+
+ARM64SelectionDAGInfo::~ARM64SelectionDAGInfo() {}
+
+SDValue ARM64SelectionDAGInfo::EmitTargetCodeForMemset(
+ SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const {
+ // Check to see if there is a specialized entry-point for memory zeroing.
+ ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+ ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
+ const char *bzeroEntry =
+ (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : 0;
+ // For small size (< 256), it is not beneficial to use bzero
+ // instead of memset.
+ if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
+ const ARM64TargetLowering &TLI = *static_cast<const ARM64TargetLowering *>(
+ DAG.getTarget().getTargetLowering());
+
+ EVT IntPtr = TLI.getPointerTy();
+ Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Dst;
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ Entry.Node = Size;
+ Args.push_back(Entry);
+ TargetLowering::CallLoweringInfo CLI(
+ Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false,
+ 0, CallingConv::C, /*isTailCall=*/false,
+ /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+ return CallResult.second;
+ }
+ return SDValue();
+}
diff --git a/lib/Target/ARM64/ARM64SelectionDAGInfo.h b/lib/Target/ARM64/ARM64SelectionDAGInfo.h
new file mode 100644
index 0000000..770775f
--- /dev/null
+++ b/lib/Target/ARM64/ARM64SelectionDAGInfo.h
@@ -0,0 +1,37 @@
+//===-- ARM64SelectionDAGInfo.h - ARM64 SelectionDAG Info -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM64 subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64SELECTIONDAGINFO_H
+#define ARM64SELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class ARM64SelectionDAGInfo : public TargetSelectionDAGInfo {
+ /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const ARM64Subtarget *Subtarget;
+
+public:
+ explicit ARM64SelectionDAGInfo(const TargetMachine &TM);
+ ~ARM64SelectionDAGInfo();
+
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
+ SDValue Dst, SDValue Src, SDValue Size,
+ unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const override;
+};
+}
+
+#endif
diff --git a/lib/Target/ARM64/ARM64StorePairSuppress.cpp b/lib/Target/ARM64/ARM64StorePairSuppress.cpp
new file mode 100644
index 0000000..6521d13
--- /dev/null
+++ b/lib/Target/ARM64/ARM64StorePairSuppress.cpp
@@ -0,0 +1,167 @@
+//===---- ARM64StorePairSuppress.cpp --- Suppress store pair formation ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies floating point stores that should not be combined into
+// store pairs. Later we may do the same for floating point loads.
+// ===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm64-stp-suppress"
+#include "ARM64InstrInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class ARM64StorePairSuppress : public MachineFunctionPass {
+ const ARM64InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ MachineFunction *MF;
+ TargetSchedModel SchedModel;
+ MachineTraceMetrics *Traces;
+ MachineTraceMetrics::Ensemble *MinInstr;
+
+public:
+ static char ID;
+ ARM64StorePairSuppress() : MachineFunctionPass(ID) {}
+
+ virtual const char *getPassName() const override {
+ return "ARM64 Store Pair Suppression";
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+private:
+ bool shouldAddSTPToBlock(const MachineBasicBlock *BB);
+
+ bool isNarrowFPStore(const MachineInstr &MI);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineTraceMetrics>();
+ AU.addPreserved<MachineTraceMetrics>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+char ARM64StorePairSuppress::ID = 0;
+} // anonymous
+
+FunctionPass *llvm::createARM64StorePairSuppressPass() {
+ return new ARM64StorePairSuppress();
+}
+
+/// Return true if an STP can be added to this block without increasing the
+/// critical resource height. STP is good to form in Ld/St limited blocks and
+/// bad to form in floating-point limited blocks. This is true independent of the
+/// critical path. If the critical path is longer than the resource height, the
+/// extra vector ops can limit physreg renaming. Otherwise, it could simply
+/// oversaturate the vector units.
+bool ARM64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) {
+ if (!MinInstr)
+ MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+ MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB);
+ unsigned ResLength = BBTrace.getResourceLength();
+
+ // Get the machine model's scheduling class for STPDi.
+ // Bypass TargetSchedule's SchedClass resolution since we only have an opcode.
+ unsigned SCIdx = TII->get(ARM64::STPDi).getSchedClass();
+ const MCSchedClassDesc *SCDesc =
+ SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
+
+ // If a subtarget does not define resources for STPDi, bail here.
+ if (SCDesc->isValid() && !SCDesc->isVariant()) {
+ unsigned ResLenWithSTP = BBTrace.getResourceLength(
+ ArrayRef<const MachineBasicBlock *>(), SCDesc);
+ if (ResLenWithSTP > ResLength) {
+ DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber()
+ << " resources " << ResLength << " -> " << ResLenWithSTP
+ << "\n");
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Return true if this is a floating-point store smaller than the V reg. On
+/// Cyclone, these require a vector shuffle before storing a pair.
+/// Ideally we would call getMatchingPairOpcode() and have the machine model
+/// tell us if it's profitable with no cpu knowledge here.
+///
+/// FIXME: We plan to develop a decent Target abstraction for simple loads and
+/// stores. Until then use a nasty switch similar to ARM64LoadStoreOptimizer.
+bool ARM64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case ARM64::STRSui:
+ case ARM64::STRDui:
+ case ARM64::STURSi:
+ case ARM64::STURDi:
+ return true;
+ }
+}
+
+bool ARM64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ TII = static_cast<const ARM64InstrInfo *>(MF->getTarget().getInstrInfo());
+ TRI = MF->getTarget().getRegisterInfo();
+ MRI = &MF->getRegInfo();
+ const TargetSubtargetInfo &ST =
+ MF->getTarget().getSubtarget<TargetSubtargetInfo>();
+ SchedModel.init(*ST.getSchedModel(), &ST, TII);
+
+ Traces = &getAnalysis<MachineTraceMetrics>();
+ MinInstr = 0;
+
+ DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n');
+
+ if (!SchedModel.hasInstrSchedModel()) {
+ DEBUG(dbgs() << " Skipping pass: no machine model present.\n");
+ return false;
+ }
+
+ // Check for a sequence of stores to the same base address. We don't need to
+ // precisely determine whether a store pair can be formed. But we do want to
+ // filter out most situations where we can't form store pairs to avoid
+ // computing trace metrics in those cases.
+ for (auto &MBB: *MF) {
+ bool SuppressSTP = false;
+ unsigned PrevBaseReg = 0;
+ for (auto &MI: MBB) {
+ if (!isNarrowFPStore(MI))
+ continue;
+ unsigned BaseReg;
+ unsigned Offset;
+ if (TII->getLdStBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) {
+ if (PrevBaseReg == BaseReg) {
+ // If this block can take STPs, skip ahead to the next block.
+ if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent()))
+ break;
+ // Otherwise, continue unpairing the stores in this block.
+ DEBUG(dbgs() << "Unpairing store " << MI << "\n");
+ SuppressSTP = true;
+ TII->suppressLdStPair(&MI);
+ }
+ PrevBaseReg = BaseReg;
+ } else
+ PrevBaseReg = 0;
+ }
+ }
+ // This pass just sets some internal MachineMemOperand flags. It can't really
+ // invalidate anything.
+ return false;
+}
diff --git a/lib/Target/ARM64/ARM64Subtarget.cpp b/lib/Target/ARM64/ARM64Subtarget.cpp
new file mode 100644
index 0000000..14b5444
--- /dev/null
+++ b/lib/Target/ARM64/ARM64Subtarget.cpp
@@ -0,0 +1,100 @@
+//===-- ARM64Subtarget.cpp - ARM64 Subtarget Information --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM64 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM64InstrInfo.h"
+#include "ARM64Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
+#include "ARM64GenSubtargetInfo.inc"
+
+using namespace llvm;
+
+ARM64Subtarget::ARM64Subtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS)
+ : ARM64GenSubtargetInfo(TT, CPU, FS), HasZeroCycleRegMove(false),
+ HasZeroCycleZeroing(false), CPUString(CPU), TargetTriple(TT) {
+ // Determine default and user-specified characteristics
+
+ if (CPUString.empty())
+ // We default to Cyclone for now.
+ CPUString = "cyclone";
+
+ ParseSubtargetFeatures(CPUString, FS);
+}
+
+/// ClassifyGlobalReference - Find the target operand flags that describe
+/// how a global value should be referenced for the current subtarget.
+unsigned char
+ARM64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
+ const TargetMachine &TM) const {
+
+ // Determine whether this is a reference to a definition or a declaration.
+ // Materializable GVs (in JIT lazy compilation mode) do not require an extra
+ // load from stub.
+ bool isDecl = GV->hasAvailableExternallyLinkage();
+ if (GV->isDeclaration() && !GV->isMaterializable())
+ isDecl = true;
+
+ // MachO large model always goes via a GOT, simply to get a single 8-byte
+ // absolute relocation on all global addresses.
+ if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
+ return ARM64II::MO_GOT;
+
+ // The small code model's direct accesses use ADRP, which cannot necessarily
+ // produce the value 0 (if the code is above 4GB). Therefore they must use the
+ // GOT.
+ if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl)
+ return ARM64II::MO_GOT;
+
+ // If symbol visibility is hidden, the extra load is not needed if
+ // the symbol is definitely defined in the current translation unit.
+
+ // The handling of non-hidden symbols in PIC mode is rather target-dependent:
+ // + On MachO, if the symbol is defined in this module the GOT can be
+ // skipped.
+ // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually
+ // defined could end up in unexpected places. Use a GOT.
+ if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) {
+ if (isTargetMachO())
+ return (isDecl || GV->isWeakForLinker()) ? ARM64II::MO_GOT
+ : ARM64II::MO_NO_FLAG;
+ else
+ return ARM64II::MO_GOT;
+ }
+
+ return ARM64II::MO_NO_FLAG;
+}
+
+/// This function returns the name of a function which has an interface
+/// like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered preferable over
+/// memset with zero passed as the second argument. Otherwise it
+/// returns null.
+const char *ARM64Subtarget::getBZeroEntry() const {
+ // At the moment, always prefer bzero.
+ return "bzero";
+}
+
+void ARM64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+ MachineInstr *begin, MachineInstr *end,
+ unsigned NumRegionInstrs) const {
+ // An LNT run (at least on Cyclone) showed reasonably significant gains for
+ // bi-directional scheduling, 253.perlbmk in particular.
+ Policy.OnlyTopDown = false;
+ Policy.OnlyBottomUp = false;
+}
diff --git a/lib/Target/ARM64/ARM64Subtarget.h b/lib/Target/ARM64/ARM64Subtarget.h
new file mode 100644
index 0000000..1cbd79e
--- /dev/null
+++ b/lib/Target/ARM64/ARM64Subtarget.h
@@ -0,0 +1,87 @@
+//=====---- ARM64Subtarget.h - Define Subtarget for the ARM64 -*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM64 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64SUBTARGET_H
+#define ARM64SUBTARGET_H
+
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "ARM64RegisterInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "ARM64GenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+
+class ARM64Subtarget : public ARM64GenSubtargetInfo {
+protected:
+ // HasZeroCycleRegMove - Has zero-cycle register mov instructions.
+ bool HasZeroCycleRegMove;
+
+ // HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
+ bool HasZeroCycleZeroing;
+
+ /// CPUString - String name of used CPU.
+ std::string CPUString;
+
+ /// TargetTriple - What processor and OS we're targeting.
+ Triple TargetTriple;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified triple.
+ ARM64Subtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS);
+
+ bool enableMachineScheduler() const override { return true; }
+
+ bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
+
+ bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+
+ bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+
+ bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+ bool isCyclone() const { return CPUString == "cyclone"; }
+
+ /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+ /// that still makes it profitable to inline the call.
+ unsigned getMaxInlineSizeThreshold() const { return 64; }
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ /// ClassifyGlobalReference - Find the target operand flags that describe
+ /// how a global value should be referenced for the current subtarget.
+ unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+ const TargetMachine &TM) const;
+
+ /// This function returns the name of a function which has an interface
+ /// like the non-standard bzero function, if such a function exists on
+ /// the current subtarget and it is considered preferable over
+ /// memset with zero passed as the second argument. Otherwise it
+ /// returns null.
+ const char *getBZeroEntry() const;
+
+ void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin,
+ MachineInstr *end, unsigned NumRegionInstrs) const;
+};
+} // End llvm namespace
+
+#endif // ARM64SUBTARGET_H
diff --git a/lib/Target/ARM64/ARM64TargetMachine.cpp b/lib/Target/ARM64/ARM64TargetMachine.cpp
new file mode 100644
index 0000000..101dc25
--- /dev/null
+++ b/lib/Target/ARM64/ARM64TargetMachine.cpp
@@ -0,0 +1,157 @@
+//===-- ARM64TargetMachine.cpp - Define TargetMachine for ARM64 -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM64.h"
+#include "ARM64TargetMachine.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+static cl::opt<bool> EnableCCMP("arm64-ccmp",
+ cl::desc("Enable the CCMP formation pass"),
+ cl::init(true));
+
+static cl::opt<bool> EnableStPairSuppress("arm64-stp-suppress", cl::Hidden,
+ cl::desc("Suppress STP for ARM64"),
+ cl::init(true));
+
+static cl::opt<bool>
+EnablePromoteConstant("arm64-promote-const", cl::Hidden,
+ cl::desc("Enable the promote constant pass"),
+ cl::init(true));
+
+static cl::opt<bool>
+EnableCollectLOH("arm64-collect-loh", cl::Hidden,
+ cl::desc("Enable the pass that emits the linker"
+ " optimization hints (LOH)"),
+ cl::init(true));
+
+extern "C" void LLVMInitializeARM64Target() {
+ // Register the target.
+ RegisterTargetMachine<ARM64TargetMachine> X(TheARM64Target);
+}
+
+/// TargetMachine ctor - Create an ARM64 architecture model.
+///
+ARM64TargetMachine::ARM64TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS),
+ DL(Subtarget.isTargetMachO() ? "e-m:o-i64:64-i128:128-n32:64-S128"
+ : "e-m:e-i64:64-i128:128-n32:64-S128"),
+ InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget),
+ TSInfo(*this) {
+ initAsmInfo();
+}
+
+namespace {
+/// ARM64 Code Generator Pass Configuration Options.
+class ARM64PassConfig : public TargetPassConfig {
+public:
+ ARM64PassConfig(ARM64TargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ ARM64TargetMachine &getARM64TargetMachine() const {
+ return getTM<ARM64TargetMachine>();
+ }
+
+ virtual bool addPreISel();
+ virtual bool addInstSelector();
+ virtual bool addILPOpts();
+ virtual bool addPreRegAlloc();
+ virtual bool addPostRegAlloc();
+ virtual bool addPreSched2();
+ virtual bool addPreEmitPass();
+};
+} // namespace
+
+void ARM64TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+ // First add the target-independent BasicTTI pass, then our ARM64 pass. This
+ // allows the ARM64 pass to delegate to the target-independent layer when
+ // appropriate.
+ PM.add(createBasicTargetTransformInfoPass(this));
+ PM.add(createARM64TargetTransformInfoPass(this));
+}
+
+TargetPassConfig *ARM64TargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new ARM64PassConfig(this, PM);
+}
+
+// Pass Pipeline Configuration
+bool ARM64PassConfig::addPreISel() {
+ // Run the promote-constant pass before global merge so that the promoted
+ // constants get a chance to be merged.
+ if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant)
+ addPass(createARM64PromoteConstantPass());
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createGlobalMergePass(TM));
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createARM64AddressTypePromotionPass());
+ return false;
+}
+
+bool ARM64PassConfig::addInstSelector() {
+ addPass(createARM64ISelDag(getARM64TargetMachine(), getOptLevel()));
+
+ // For ELF, clean up any local-dynamic TLS accesses (i.e., combine as many
+ // references to _TLS_MODULE_BASE_ as possible).
+ if (TM->getSubtarget<ARM64Subtarget>().isTargetELF() &&
+ getOptLevel() != CodeGenOpt::None)
+ addPass(createARM64CleanupLocalDynamicTLSPass());
+
+ return false;
+}
+
+bool ARM64PassConfig::addILPOpts() {
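+ // ILP optimizations: conditional-compare formation, early if-conversion, and
+ // store-pair suppression.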
+ if (EnableCCMP)
+ addPass(createARM64ConditionalCompares());
+ addPass(&EarlyIfConverterID);
+ if (EnableStPairSuppress)
+ addPass(createARM64StorePairSuppressPass());
+ return true;
+}
+
+bool ARM64PassConfig::addPreRegAlloc() {
+ // Use AdvSIMD scalar instructions whenever profitable.
+ addPass(createARM64AdvSIMDScalar());
+ return true;
+}
+
+bool ARM64PassConfig::addPostRegAlloc() {
+ // Change dead register definitions to refer to the zero register.
+ addPass(createARM64DeadRegisterDefinitions());
+ return true;
+}
+
+bool ARM64PassConfig::addPreSched2() {
+ // Expand some pseudo instructions to allow proper scheduling.
+ addPass(createARM64ExpandPseudoPass());
+ // Use load/store pair instructions when possible.
+ addPass(createARM64LoadStoreOptimizationPass());
+ return true;
+}
+
+bool ARM64PassConfig::addPreEmitPass() {
+ // Relax conditional branch instructions if they're otherwise out of
+ // range of their destination.
+ addPass(createARM64BranchRelaxation());
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH)
+ addPass(createARM64CollectLOHPass());
+ return true;
+}
diff --git a/lib/Target/ARM64/ARM64TargetMachine.h b/lib/Target/ARM64/ARM64TargetMachine.h
new file mode 100644
index 0000000..8274550
--- /dev/null
+++ b/lib/Target/ARM64/ARM64TargetMachine.h
@@ -0,0 +1,69 @@
+//===-- ARM64TargetMachine.h - Define TargetMachine for ARM64 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM64 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64TARGETMACHINE_H
+#define ARM64TARGETMACHINE_H
+
+#include "ARM64InstrInfo.h"
+#include "ARM64ISelLowering.h"
+#include "ARM64Subtarget.h"
+#include "ARM64FrameLowering.h"
+#include "ARM64SelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class ARM64TargetMachine : public LLVMTargetMachine {
+protected:
+ ARM64Subtarget Subtarget;
+
+private:
+ const DataLayout DL;
+ ARM64InstrInfo InstrInfo;
+ ARM64TargetLowering TLInfo;
+ ARM64FrameLowering FrameLowering;
+ ARM64SelectionDAGInfo TSInfo;
+
+public:
+ ARM64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL);
+
+ const ARM64Subtarget *getSubtargetImpl() const override { return &Subtarget; }
+ const ARM64TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const DataLayout *getDataLayout() const override { return &DL; }
+ const ARM64FrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const ARM64InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const ARM64RegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const ARM64SelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ // Pass Pipeline Configuration
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ /// \brief Register ARM64 analysis passes with a pass manager.
+ void addAnalysisPasses(PassManagerBase &PM) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM64/ARM64TargetObjectFile.cpp b/lib/Target/ARM64/ARM64TargetObjectFile.cpp
new file mode 100644
index 0000000..cde01e5
--- /dev/null
+++ b/lib/Target/ARM64/ARM64TargetObjectFile.cpp
@@ -0,0 +1,52 @@
+//===-- ARM64TargetObjectFile.cpp - ARM64 Object Info ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM64TargetObjectFile.h"
+#include "ARM64TargetMachine.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Dwarf.h"
+using namespace llvm;
+using namespace dwarf;
+
+void ARM64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
+const MCExpr *ARM64_MachoTargetObjectFile::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+ const TargetMachine &TM, MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const {
+ // On Darwin, we can reference dwarf symbols with foo@GOT-., which
+ // is an indirect pc-relative reference. The default implementation
+ // won't reference using the GOT, so we need this target-specific
+ // version.
+ if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
+ const MCSymbol *Sym = TM.getSymbol(GV, Mang);
+ const MCExpr *Res =
+ MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
+ MCSymbol *PCSym = getContext().CreateTempSymbol();
+ Streamer.EmitLabel(PCSym);
+ const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext());
+ return MCBinaryExpr::CreateSub(Res, PC, getContext());
+ }
+
+ return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+ GV, Encoding, Mang, TM, MMI, Streamer);
+}
+
+MCSymbol *ARM64_MachoTargetObjectFile::getCFIPersonalitySymbol(
+ const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
+ MachineModuleInfo *MMI) const {
+ return TM.getSymbol(GV, Mang);
+}
diff --git a/lib/Target/ARM64/ARM64TargetObjectFile.h b/lib/Target/ARM64/ARM64TargetObjectFile.h
new file mode 100644
index 0000000..62446f9
--- /dev/null
+++ b/lib/Target/ARM64/ARM64TargetObjectFile.h
@@ -0,0 +1,40 @@
+//===-- ARM64TargetObjectFile.h - ARM64 Object Info ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM64_TARGETOBJECTFILE_H
+#define LLVM_TARGET_ARM64_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+class ARM64TargetMachine;
+
+/// This implementation is used for AArch64 ELF targets (Linux in particular).
+class ARM64_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+/// ARM64_MachoTargetObjectFile - This TLOF implementation is used for Darwin.
+class ARM64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+public:
+ const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+ unsigned Encoding, Mangler &Mang,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+
+ MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM64/ARM64TargetTransformInfo.cpp b/lib/Target/ARM64/ARM64TargetTransformInfo.cpp
new file mode 100644
index 0000000..9b598d7
--- /dev/null
+++ b/lib/Target/ARM64/ARM64TargetTransformInfo.cpp
@@ -0,0 +1,326 @@
+//===-- ARM64TargetTransformInfo.cpp - ARM64 specific TTI pass ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// ARM64 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm64tti"
+#include "ARM64.h"
+#include "ARM64TargetMachine.h"
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeARM64TTIPass(PassRegistry &);
+}
+
+namespace {
+
+class ARM64TTI final : public ImmutablePass, public TargetTransformInfo {
+ const ARM64TargetMachine *TM;
+ const ARM64Subtarget *ST;
+ const ARM64TargetLowering *TLI;
+
+ /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+ /// are set if the result needs to be inserted and/or extracted from vectors.
+ unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+ ARM64TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+ llvm_unreachable("This pass cannot be directly constructed");
+ }
+
+ ARM64TTI(const ARM64TargetMachine *TM)
+ : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+ TLI(TM->getTargetLowering()) {
+ initializeARM64TTIPass(*PassRegistry::getPassRegistry());
+ }
+
+ void initializePass() override { pushTTIStack(this); }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ TargetTransformInfo::getAnalysisUsage(AU);
+ }
+
+ /// Pass identification.
+ static char ID;
+
+ /// Provide necessary pointer adjustments for the two base classes.
+ void *getAdjustedAnalysisPointer(const void *ID) override {
+ if (ID == &TargetTransformInfo::ID)
+ return (TargetTransformInfo *)this;
+ return this;
+ }
+
+ /// \name Scalar TTI Implementations
+ /// @{
+
+ unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+ PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+
+ /// @}
+
+ /// \name Vector TTI Implementations
+ /// @{
+
+ unsigned getNumberOfRegisters(bool Vector) const override {
+ if (Vector)
+ return 32;
+
+ return 31;
+ }
+
+ unsigned getRegisterBitWidth(bool Vector) const override {
+ if (Vector)
+ return 128;
+
+ return 64;
+ }
+
+ unsigned getMaximumUnrollFactor() const override { return 2; }
+
+ unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
+ override;
+
+ unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
+ override;
+
+ unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ OperandValueKind Opd1Info = OK_AnyValue,
+ OperandValueKind Opd2Info = OK_AnyValue) const
+ override;
+
+ unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
+
+ unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
+ override;
+
+ unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) const override;
+ /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(ARM64TTI, TargetTransformInfo, "arm64tti",
+ "ARM64 Target Transform Info", true, true, false)
+char ARM64TTI::ID = 0;
+
+ImmutablePass *
+llvm::createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM) {
+ return new ARM64TTI(TM);
+}
+
+unsigned ARM64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return ~0U;
+
+ int64_t Val = Imm.getSExtValue();
+ if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, BitSize))
+ return 1;
+
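+ // Negative values are costed via their bitwise complement (roughly modelling
+ // a MOVN-based materialization); 32-bit immediates are masked to 32 bits.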
+ if ((int64_t)Val < 0)
+ Val = ~Val;
+ if (BitSize == 32)
+ Val &= (1LL << 32) - 1;
+
+ unsigned LZ = countLeadingZeros((uint64_t)Val);
+ unsigned Shift = (63 - LZ) / 16;
+ // MOVZ is free, so return 1 for one or fewer MOVKs; otherwise return the
+ // estimated MOVK count.
+ return (Shift == 0) ? 1 : Shift;
+}
+
+ARM64TTI::PopcntSupportKind ARM64TTI::getPopcntSupport(unsigned TyWidth) const {
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+ if (TyWidth == 32 || TyWidth == 64)
+ return PSK_FastHardware;
+ // TODO: ARM64TargetLowering::LowerCTPOP() supports 128bit popcount.
+ return PSK_Software;
+}
+
+unsigned ARM64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ EVT SrcTy = TLI->getValueType(Src);
+ EVT DstTy = TLI->getValueType(Dst);
+
+ if (!SrcTy.isSimple() || !DstTy.isSimple())
+ return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+
+ static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
+ // LowerVectorINT_TO_FP:
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ // LowerVectorFP_TO_INT
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
+ };
+
+ int Idx = ConvertCostTableLookup<MVT>(
+ ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT());
+ if (Idx != -1)
+ return ConversionTbl[Idx].Cost;
+
+ return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned ARM64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const {
+ assert(Val->isVectorTy() && "This must be a vector type");
+
+ if (Index != -1U) {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+
+ // This type is legalized to a scalar type.
+ if (!LT.second.isVector())
+ return 0;
+
+ // The type may be split. Normalize the index to the new type.
+ unsigned Width = LT.second.getVectorNumElements();
+ Index = Index % Width;
+
+ // The element at index zero is already inside the vector.
+ if (Index == 0)
+ return 0;
+ }
+
+ // All other insert/extracts cost this much.
+ return 2;
+}
+
+unsigned ARM64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ OperandValueKind Opd1Info,
+ OperandValueKind Opd2Info) const {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+ switch (ISD) {
+ default:
+ return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
+ Opd2Info);
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::XOR:
+ case ISD::OR:
+ case ISD::AND:
+ // These nodes are marked as 'custom' for combining purposes only.
+ // We know that they are legal. See LowerAdd in ISelLowering.
+ return 1 * LT.first;
+ }
+}
+
+unsigned ARM64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+ // Address computations in vectorized code with non-consecutive addresses will
+ // likely result in more instructions compared to scalar code where the
+ // computation can more often be merged into the index mode. The resulting
+ // extra micro-ops can significantly decrease throughput.
+ unsigned NumVectorInstToHideOverhead = 10;
+
+ if (Ty->isVectorTy() && IsComplex)
+ return NumVectorInstToHideOverhead;
+
+ // In many cases the address computation is not merged into the instruction
+ // addressing mode.
+ return 1;
+}
+
+unsigned ARM64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const {
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ // We don't lower vector selects wider than the register width very well.
+ if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
+ // We would need this many instructions to hide the scalarization happening.
+ unsigned AmortizationCost = 20;
+ static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ VectorSelectTbl[] = {
+ { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
+ { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
+ { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
+ { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
+ { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
+ { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
+ };
+
+ EVT SelCondTy = TLI->getValueType(CondTy);
+ EVT SelValTy = TLI->getValueType(ValTy);
+ if (SelCondTy.isSimple() && SelValTy.isSimple()) {
+ int Idx =
+ ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
+ SelValTy.getSimpleVT());
+ if (Idx != -1)
+ return VectorSelectTbl[Idx].Cost;
+ }
+ }
+ return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned ARM64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment,
+ unsigned AddressSpace) const {
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
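+ // LT.first is the number of legal-typed operations this access splits into;
+ // it is the default cost returned below when no special case applies.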
+
+ if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
+ Src->getVectorElementType()->isIntegerTy(64)) {
+ // Unaligned stores are extremely inefficient. We don't split
+ // unaligned v2i64 stores because of the negative impact that has been shown
+ // in practice on inlined memcpy code.
+ // We make v2i64 stores expensive so that we will only vectorize if there
+ // are 6 other instructions getting vectorized.
+ unsigned AmortizationCost = 6;
+
+ return LT.first * 2 * AmortizationCost;
+ }
+
+ if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
+ Src->getVectorNumElements() < 8) {
+ // We scalarize the loads/stores because there is no v.4b register and we
+ // have to promote the elements to v.4h.
+ unsigned NumVecElts = Src->getVectorNumElements();
+ unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+ // We generate 2 instructions per vector element.
+ return NumVectorizableInstsToAmortize * NumVecElts * 2;
+ }
+
+ return LT.first;
+}
diff --git a/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp b/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp
new file mode 100644
index 0000000..38a61d8
--- /dev/null
+++ b/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp
@@ -0,0 +1,4832 @@
+//===-- ARM64AsmParser.cpp - Parse ARM64 assembly to MCInst instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "MCTargetDesc/ARM64BaseInfo.h"
+#include "MCTargetDesc/ARM64MCExpr.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include <cstdio>
+using namespace llvm;
+
+namespace {
+
+class ARM64Operand;
+
+class ARM64AsmParser : public MCTargetAsmParser {
+public:
+ typedef SmallVectorImpl<MCParsedAsmOperand *> OperandVector;
+
+private:
+ StringRef Mnemonic; ///< Instruction mnemonic.
+ MCSubtargetInfo &STI;
+ MCAsmParser &Parser;
+
+ MCAsmParser &getParser() const { return Parser; }
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+ SMLoc getLoc() const { return Parser.getTok().getLoc(); }
+
+ bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
+ unsigned parseCondCodeString(StringRef Cond);
+ bool parseCondCode(OperandVector &Operands, bool invertCondCode);
+ int tryParseRegister();
+ int tryMatchVectorRegister(StringRef &Kind);
+ bool parseOptionalShift(OperandVector &Operands);
+ bool parseOptionalExtend(OperandVector &Operands);
+ bool parseRegister(OperandVector &Operands);
+ bool parseMemory(OperandVector &Operands);
+ bool parseSymbolicImmVal(const MCExpr *&ImmVal);
+ bool parseVectorList(OperandVector &Operands);
+ bool parseOperand(OperandVector &Operands, bool isCondCode,
+ bool invertCondCode);
+
+ void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+ bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+ bool showMatchError(SMLoc Loc, unsigned ErrCode);
+
+ bool parseDirectiveWord(unsigned Size, SMLoc L);
+ bool parseDirectiveTLSDescCall(SMLoc L);
+
+ bool parseDirectiveLOH(StringRef LOH, SMLoc L);
+
+ bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc);
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ unsigned &ErrorInfo, bool MatchingInlineAsm);
+/// @name Auto-generated Match Functions
+/// {
+
+#define GET_ASSEMBLER_HEADER
+#include "ARM64GenAsmMatcher.inc"
+
+ /// }
+
+ OperandMatchResultTy tryParseNoIndexMemory(OperandVector &Operands);
+ OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
+ OperandMatchResultTy tryParseSystemRegister(OperandVector &Operands);
+ OperandMatchResultTy tryParseCPSRField(OperandVector &Operands);
+ OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
+ OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
+ OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
+ OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
+ OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
+ bool tryParseVectorRegister(OperandVector &Operands);
+
+public:
+ enum ARM64MatchResultTy {
+ Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "ARM64GenAsmMatcher.inc"
+ };
+ ARM64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+ const MCInstrInfo &MII)
+ : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+ MCAsmParserExtension::Initialize(_Parser);
+ }
+
+ virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands);
+ virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+ virtual bool ParseDirective(AsmToken DirectiveID);
+ unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, unsigned Kind);
+
+ static bool classifySymbolRef(const MCExpr *Expr,
+ ARM64MCExpr::VariantKind &ELFRefKind,
+ MCSymbolRefExpr::VariantKind &DarwinRefKind,
+ const MCConstantExpr *&Addend);
+};
+} // end anonymous namespace
+
+namespace {
+
+/// ARM64Operand - Instances of this class represent a parsed ARM64 machine
+/// instruction.
+class ARM64Operand : public MCParsedAsmOperand {
+public:
+ enum MemIdxKindTy {
+ ImmediateOffset, // pre-indexed, no writeback
+ RegisterOffset // register offset, with optional extend
+ };
+
+private:
+ enum KindTy {
+ k_Immediate,
+ k_Memory,
+ k_Register,
+ k_VectorList,
+ k_VectorIndex,
+ k_Token,
+ k_SysCR,
+ k_Prefetch,
+ k_Shifter,
+ k_Extend,
+ k_FPImm,
+ k_Barrier,
+ k_SystemRegister,
+ k_CPSRField
+ } Kind;
+
+ SMLoc StartLoc, EndLoc, OffsetLoc;
+
+ struct TokOp {
+ const char *Data;
+ unsigned Length;
+ bool IsSuffix; // Is the operand actually a suffix on the mnemonic.
+ };
+
+ struct RegOp {
+ unsigned RegNum;
+ bool isVector;
+ };
+
+ struct VectorListOp {
+ unsigned RegNum;
+ unsigned Count;
+ unsigned NumElements;
+ unsigned ElementKind;
+ };
+
+ struct VectorIndexOp {
+ unsigned Val;
+ };
+
+ struct ImmOp {
+ const MCExpr *Val;
+ };
+
+ struct FPImmOp {
+ unsigned Val; // Encoded 8-bit representation.
+ };
+
+ struct BarrierOp {
+ unsigned Val; // Not the enum since not all values have names.
+ };
+
+ struct SystemRegisterOp {
+ // 16-bit immediate, usually from the ARM64SYS::SystemRegister enum,
+ // but not limited to those values.
+ uint16_t Val;
+ };
+
+ struct CPSRFieldOp {
+ ARM64SYS::CPSRField Field;
+ };
+
+ struct SysCRImmOp {
+ unsigned Val;
+ };
+
+ struct PrefetchOp {
+ unsigned Val;
+ };
+
+ struct ShifterOp {
+ unsigned Val;
+ };
+
+ struct ExtendOp {
+ unsigned Val;
+ };
+
+ // This is for all forms of ARM64 address expressions
+ struct MemOp {
+ unsigned BaseRegNum, OffsetRegNum;
+ ARM64_AM::ExtendType ExtType;
+ unsigned ShiftVal;
+ bool ExplicitShift;
+ const MCExpr *OffsetImm;
+ MemIdxKindTy Mode;
+ };
+
+ union {
+ struct TokOp Tok;
+ struct RegOp Reg;
+ struct VectorListOp VectorList;
+ struct VectorIndexOp VectorIndex;
+ struct ImmOp Imm;
+ struct FPImmOp FPImm;
+ struct BarrierOp Barrier;
+ struct SystemRegisterOp SystemRegister;
+ struct CPSRFieldOp CPSRField;
+ struct SysCRImmOp SysCRImm;
+ struct PrefetchOp Prefetch;
+ struct ShifterOp Shifter;
+ struct ExtendOp Extend;
+ struct MemOp Mem;
+ };
+
+ // Keep the MCContext around as the MCExprs may need to be manipulated during
+ // the add<>Operands() calls.
+ MCContext &Ctx;
+
+ ARM64Operand(KindTy K, MCContext &_Ctx)
+ : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {}
+
+public:
+ ARM64Operand(const ARM64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) {
+ Kind = o.Kind;
+ StartLoc = o.StartLoc;
+ EndLoc = o.EndLoc;
+ switch (Kind) {
+ case k_Token:
+ Tok = o.Tok;
+ break;
+ case k_Immediate:
+ Imm = o.Imm;
+ break;
+ case k_FPImm:
+ FPImm = o.FPImm;
+ break;
+ case k_Barrier:
+ Barrier = o.Barrier;
+ break;
+ case k_SystemRegister:
+ SystemRegister = o.SystemRegister;
+ break;
+ case k_CPSRField:
+ CPSRField = o.CPSRField;
+ break;
+ case k_Register:
+ Reg = o.Reg;
+ break;
+ case k_VectorList:
+ VectorList = o.VectorList;
+ break;
+ case k_VectorIndex:
+ VectorIndex = o.VectorIndex;
+ break;
+ case k_SysCR:
+ SysCRImm = o.SysCRImm;
+ break;
+ case k_Prefetch:
+ Prefetch = o.Prefetch;
+ break;
+ case k_Memory:
+ Mem = o.Mem;
+ break;
+ case k_Shifter:
+ Shifter = o.Shifter;
+ break;
+ case k_Extend:
+ Extend = o.Extend;
+ break;
+ }
+ }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const { return StartLoc; }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const { return EndLoc; }
+ /// getOffsetLoc - Get the location of the offset of this memory operand.
+ SMLoc getOffsetLoc() const { return OffsetLoc; }
+
+ StringRef getToken() const {
+ assert(Kind == k_Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ bool isTokenSuffix() const {
+ assert(Kind == k_Token && "Invalid access!");
+ return Tok.IsSuffix;
+ }
+
+ const MCExpr *getImm() const {
+ assert(Kind == k_Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+ unsigned getFPImm() const {
+ assert(Kind == k_FPImm && "Invalid access!");
+ return FPImm.Val;
+ }
+
+ unsigned getBarrier() const {
+ assert(Kind == k_Barrier && "Invalid access!");
+ return Barrier.Val;
+ }
+
+ uint16_t getSystemRegister() const {
+ assert(Kind == k_SystemRegister && "Invalid access!");
+ return SystemRegister.Val;
+ }
+
+ ARM64SYS::CPSRField getCPSRField() const {
+ assert(Kind == k_CPSRField && "Invalid access!");
+ return CPSRField.Field;
+ }
+
+ unsigned getReg() const {
+ assert(Kind == k_Register && "Invalid access!");
+ return Reg.RegNum;
+ }
+
+ unsigned getVectorListStart() const {
+ assert(Kind == k_VectorList && "Invalid access!");
+ return VectorList.RegNum;
+ }
+
+ unsigned getVectorListCount() const {
+ assert(Kind == k_VectorList && "Invalid access!");
+ return VectorList.Count;
+ }
+
+ unsigned getVectorIndex() const {
+ assert(Kind == k_VectorIndex && "Invalid access!");
+ return VectorIndex.Val;
+ }
+
+ unsigned getSysCR() const {
+ assert(Kind == k_SysCR && "Invalid access!");
+ return SysCRImm.Val;
+ }
+
+ unsigned getPrefetch() const {
+ assert(Kind == k_Prefetch && "Invalid access!");
+ return Prefetch.Val;
+ }
+
+ unsigned getShifter() const {
+ assert(Kind == k_Shifter && "Invalid access!");
+ return Shifter.Val;
+ }
+
+ unsigned getExtend() const {
+ assert(Kind == k_Extend && "Invalid access!");
+ return Extend.Val;
+ }
+
+ bool isImm() const { return Kind == k_Immediate; }
+ bool isSImm9() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= -256 && Val < 256);
+ }
+ bool isSImm7s4() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= -256 && Val <= 252 && (Val & 3) == 0);
+ }
+ bool isSImm7s8() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= -512 && Val <= 504 && (Val & 7) == 0);
+ }
+ bool isSImm7s16() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0);
+ }
+ bool isImm0_7() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 8);
+ }
+ bool isImm1_8() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val > 0 && Val < 9);
+ }
+ bool isImm0_15() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 16);
+ }
+ bool isImm1_16() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val > 0 && Val < 17);
+ }
+ bool isImm0_31() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 32);
+ }
+ bool isImm1_31() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 1 && Val < 32);
+ }
+ bool isImm1_32() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 1 && Val < 33);
+ }
+ bool isImm0_63() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 64);
+ }
+ bool isImm1_63() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 1 && Val < 64);
+ }
+ bool isImm1_64() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 1 && Val < 65);
+ }
+ bool isImm0_127() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 128);
+ }
+ bool isImm0_255() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 256);
+ }
+ bool isImm0_65535() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 65536);
+ }
+ bool isLogicalImm32() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ return ARM64_AM::isLogicalImmediate(MCE->getValue(), 32);
+ }
+ bool isLogicalImm64() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ return ARM64_AM::isLogicalImmediate(MCE->getValue(), 64);
+ }
+ bool isSIMDImmType10() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ return ARM64_AM::isAdvSIMDModImmType10(MCE->getValue());
+ }
+ bool isBranchTarget26() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return true;
+ int64_t Val = MCE->getValue();
+ if (Val & 0x3)
+ return false;
+ return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2));
+ }
+ bool isBranchTarget19() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return true;
+ int64_t Val = MCE->getValue();
+ if (Val & 0x3)
+ return false;
+ return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2));
+ }
+ bool isBranchTarget14() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return true;
+ int64_t Val = MCE->getValue();
+ if (Val & 0x3)
+ return false;
+ return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2));
+ }
+
+ bool isMovWSymbol(ArrayRef<ARM64MCExpr::VariantKind> AllowedModifiers) const {
+ if (!isImm())
+ return false;
+
+ ARM64MCExpr::VariantKind ELFRefKind;
+ MCSymbolRefExpr::VariantKind DarwinRefKind;
+ const MCConstantExpr *Addend;
+ if (!ARM64AsmParser::classifySymbolRef(getImm(), ELFRefKind, DarwinRefKind,
+ Addend)) {
+ return false;
+ }
+ if (DarwinRefKind != MCSymbolRefExpr::VK_None)
+ return false;
+
+ for (unsigned i = 0; i != AllowedModifiers.size(); ++i) {
+ if (ELFRefKind == AllowedModifiers[i])
+ return Addend == 0;
+ }
+
+ return false;
+ }
+
+ bool isMovZSymbolG3() const {
+ static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G3 };
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovZSymbolG2() const {
+ static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G2,
+ ARM64MCExpr::VK_TPREL_G2,
+ ARM64MCExpr::VK_DTPREL_G2 };
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovZSymbolG1() const {
+ static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G1,
+ ARM64MCExpr::VK_GOTTPREL_G1,
+ ARM64MCExpr::VK_TPREL_G1,
+ ARM64MCExpr::VK_DTPREL_G1, };
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovZSymbolG0() const {
+ static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G0,
+ ARM64MCExpr::VK_TPREL_G0,
+ ARM64MCExpr::VK_DTPREL_G0 };
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovKSymbolG2() const {
+ static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G2_NC };
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovKSymbolG1() const {
+ static ARM64MCExpr::VariantKind Variants[] = {
+ ARM64MCExpr::VK_ABS_G1_NC, ARM64MCExpr::VK_TPREL_G1_NC,
+ ARM64MCExpr::VK_DTPREL_G1_NC
+ };
+ return isMovWSymbol(Variants);
+ }
+
+ bool isMovKSymbolG0() const {
+ static ARM64MCExpr::VariantKind Variants[] = {
+ ARM64MCExpr::VK_ABS_G0_NC, ARM64MCExpr::VK_GOTTPREL_G0_NC,
+ ARM64MCExpr::VK_TPREL_G0_NC, ARM64MCExpr::VK_DTPREL_G0_NC
+ };
+ return isMovWSymbol(Variants);
+ }
+
+ bool isFPImm() const { return Kind == k_FPImm; }
+ bool isBarrier() const { return Kind == k_Barrier; }
+ bool isSystemRegister() const {
+ if (Kind == k_SystemRegister)
+ return true;
+ // SPSel is legal for both the system register and the CPSR-field
+ // variants of MSR, so special case that. Fugly.
+ return (Kind == k_CPSRField && getCPSRField() == ARM64SYS::cpsr_SPSel);
+ }
+ bool isSystemCPSRField() const { return Kind == k_CPSRField; }
+ bool isReg() const { return Kind == k_Register && !Reg.isVector; }
+ bool isVectorReg() const { return Kind == k_Register && Reg.isVector; }
+
+ /// Is this a vector list with the type implicit (presumably attached to the
+ /// instruction itself)?
+ template <unsigned NumRegs> bool isImplicitlyTypedVectorList() const {
+ return Kind == k_VectorList && VectorList.Count == NumRegs &&
+ !VectorList.ElementKind;
+ }
+
+ template <unsigned NumRegs, unsigned NumElements, char ElementKind>
+ bool isTypedVectorList() const {
+ if (Kind != k_VectorList)
+ return false;
+ if (VectorList.Count != NumRegs)
+ return false;
+ if (VectorList.ElementKind != ElementKind)
+ return false;
+ return VectorList.NumElements == NumElements;
+ }
+
+ bool isVectorIndexB() const {
+ return Kind == k_VectorIndex && VectorIndex.Val < 16;
+ }
+ bool isVectorIndexH() const {
+ return Kind == k_VectorIndex && VectorIndex.Val < 8;
+ }
+ bool isVectorIndexS() const {
+ return Kind == k_VectorIndex && VectorIndex.Val < 4;
+ }
+ bool isVectorIndexD() const {
+ return Kind == k_VectorIndex && VectorIndex.Val < 2;
+ }
+ bool isToken() const { return Kind == k_Token; }
+ bool isTokenEqual(StringRef Str) const {
+ return Kind == k_Token && getToken() == Str;
+ }
+ bool isMem() const { return Kind == k_Memory; }
+ bool isSysCR() const { return Kind == k_SysCR; }
+ bool isPrefetch() const { return Kind == k_Prefetch; }
+ bool isShifter() const { return Kind == k_Shifter; }
+ bool isExtend() const {
+ // lsl is an alias for UXTX but will be parsed as a k_Shifter operand.
+ if (isShifter()) {
+ ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val);
+ return ST == ARM64_AM::LSL;
+ }
+ return Kind == k_Extend;
+ }
+ bool isExtend64() const {
+ if (Kind != k_Extend)
+ return false;
+ // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class).
+ ARM64_AM::ExtendType ET = ARM64_AM::getArithExtendType(Extend.Val);
+ return ET != ARM64_AM::UXTX && ET != ARM64_AM::SXTX;
+ }
+ bool isExtendLSL64() const {
+ // lsl is an alias for UXTX but will be parsed as a k_Shifter operand.
+ if (isShifter()) {
+ ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val);
+ return ST == ARM64_AM::LSL;
+ }
+ if (Kind != k_Extend)
+ return false;
+ ARM64_AM::ExtendType ET = ARM64_AM::getArithExtendType(Extend.Val);
+ return ET == ARM64_AM::UXTX || ET == ARM64_AM::SXTX;
+ }
+
+ bool isArithmeticShifter() const {
+ if (!isShifter())
+ return false;
+
+ // An arithmetic shifter is LSL, LSR, or ASR.
+ ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val);
+ return ST == ARM64_AM::LSL || ST == ARM64_AM::LSR || ST == ARM64_AM::ASR;
+ }
+
+ bool isMovImm32Shifter() const {
+ if (!isShifter())
+ return false;
+
+ // A 32-bit MOVi shifter is LSL of 0 or 16.
+ ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val);
+ if (ST != ARM64_AM::LSL)
+ return false;
+ uint64_t Val = ARM64_AM::getShiftValue(Shifter.Val);
+ return (Val == 0 || Val == 16);
+ }
+
+ bool isMovImm64Shifter() const {
+ if (!isShifter())
+ return false;
+
+ // A 64-bit MOVi shifter is LSL of 0, 16, 32, or 48.
+ ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val);
+ if (ST != ARM64_AM::LSL)
+ return false;
+ uint64_t Val = ARM64_AM::getShiftValue(Shifter.Val);
+ return (Val == 0 || Val == 16 || Val == 32 || Val == 48);
+ }
+
+ bool isAddSubShifter() const {
+ if (!isShifter())
+ return false;
+
+ // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
+ unsigned Val = Shifter.Val;
+ return ARM64_AM::getShiftType(Val) == ARM64_AM::LSL &&
+ (ARM64_AM::getShiftValue(Val) == 0 ||
+ ARM64_AM::getShiftValue(Val) == 12);
+ }
+
+ bool isLogicalVecShifter() const {
+ if (!isShifter())
+ return false;
+
+ // A logical vector shifter is a left shift by 0, 8, 16, or 24.
+ unsigned Val = Shifter.Val;
+ unsigned Shift = ARM64_AM::getShiftValue(Val);
+ return ARM64_AM::getShiftType(Val) == ARM64_AM::LSL &&
+ (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24);
+ }
+
+ bool isLogicalVecHalfWordShifter() const {
+ if (!isLogicalVecShifter())
+ return false;
+
+ // A logical vector halfword shifter is a left shift by 0 or 8.
+ unsigned Val = Shifter.Val;
+ unsigned Shift = ARM64_AM::getShiftValue(Val);
+ return ARM64_AM::getShiftType(Val) == ARM64_AM::LSL &&
+ (Shift == 0 || Shift == 8);
+ }
+
+ bool isMoveVecShifter() const {
+ if (!isShifter())
+ return false;
+
+ // A move vector shifter is an MSL shift of 8 or 16.
+ unsigned Val = Shifter.Val;
+ unsigned Shift = ARM64_AM::getShiftValue(Val);
+ return ARM64_AM::getShiftType(Val) == ARM64_AM::MSL &&
+ (Shift == 8 || Shift == 16);
+ }
+
+ bool isMemoryRegisterOffset8() const {
+ return isMem() && Mem.Mode == RegisterOffset && Mem.ShiftVal == 0;
+ }
+
+ bool isMemoryRegisterOffset16() const {
+ return isMem() && Mem.Mode == RegisterOffset &&
+ (Mem.ShiftVal == 0 || Mem.ShiftVal == 1);
+ }
+
+ bool isMemoryRegisterOffset32() const {
+ return isMem() && Mem.Mode == RegisterOffset &&
+ (Mem.ShiftVal == 0 || Mem.ShiftVal == 2);
+ }
+
+ bool isMemoryRegisterOffset64() const {
+ return isMem() && Mem.Mode == RegisterOffset &&
+ (Mem.ShiftVal == 0 || Mem.ShiftVal == 3);
+ }
+
+ bool isMemoryRegisterOffset128() const {
+ return isMem() && Mem.Mode == RegisterOffset &&
+ (Mem.ShiftVal == 0 || Mem.ShiftVal == 4);
+ }
+
+ bool isMemoryUnscaled() const {
+ if (!isMem())
+ return false;
+ if (Mem.Mode != ImmediateOffset)
+ return false;
+ if (!Mem.OffsetImm)
+ return true;
+ // Make sure the immediate value is valid.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
+ if (!CE)
+ return false;
+ // The offset must fit in a signed 9-bit unscaled immediate.
+ int64_t Value = CE->getValue();
+ return (Value >= -256 && Value < 256);
+ }
+ // Fallback unscaled operands are for aliases of LDR/STR that fall back
+ // to LDUR/STUR when the offset is not legal for the former but is for
+ // the latter. As such, in addition to checking for being a legal unscaled
+ // address, also check that it is not a legal scaled address. This avoids
+ // ambiguity in the matcher.
+ bool isMemoryUnscaledFB8() const {
+ return isMemoryUnscaled() && !isMemoryIndexed8();
+ }
+ bool isMemoryUnscaledFB16() const {
+ return isMemoryUnscaled() && !isMemoryIndexed16();
+ }
+ bool isMemoryUnscaledFB32() const {
+ return isMemoryUnscaled() && !isMemoryIndexed32();
+ }
+ bool isMemoryUnscaledFB64() const {
+ return isMemoryUnscaled() && !isMemoryIndexed64();
+ }
+ bool isMemoryUnscaledFB128() const {
+ return isMemoryUnscaled() && !isMemoryIndexed128();
+ }
+ bool isMemoryIndexed(unsigned Scale) const {
+ if (!isMem())
+ return false;
+ if (Mem.Mode != ImmediateOffset)
+ return false;
+ if (!Mem.OffsetImm)
+ return true;
+ // Make sure the immediate value is valid.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
+
+ if (CE) {
+ // The offset must be a positive multiple of the scale and in range of
+ // encoding with a 12-bit immediate.
+ int64_t Value = CE->getValue();
+ return (Value >= 0 && (Value % Scale) == 0 && Value <= (4095 * Scale));
+ }
+
+ // If it's not a constant, check for some expressions we know.
+ const MCExpr *Expr = Mem.OffsetImm;
+ ARM64MCExpr::VariantKind ELFRefKind;
+ MCSymbolRefExpr::VariantKind DarwinRefKind;
+ const MCConstantExpr *Addend;
+ if (!ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind,
+ Addend)) {
+ // If we don't understand the expression, assume the best and
+ // let the fixup and relocation code deal with it.
+ return true;
+ }
+
+ if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
+ ELFRefKind == ARM64MCExpr::VK_LO12 ||
+ ELFRefKind == ARM64MCExpr::VK_GOT_LO12 ||
+ ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 ||
+ ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC ||
+ ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 ||
+ ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC ||
+ ELFRefKind == ARM64MCExpr::VK_GOTTPREL_LO12_NC ||
+ ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12) {
+ // Note that we don't range-check the addend. It's adjusted modulo page
+ // size when converted, so there is no "out of range" condition when using
+ // @pageoff.
+ int64_t Value = Addend ? Addend->getValue() : 0;
+ return Value >= 0 && (Value % Scale) == 0;
+ } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF ||
+ DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) {
+ // @gotpageoff/@tlvppageoff can only be used directly, not with an addend.
+ return Addend == 0;
+ }
+
+ return false;
+ }
+ bool isMemoryIndexed128() const { return isMemoryIndexed(16); }
+ bool isMemoryIndexed64() const { return isMemoryIndexed(8); }
+ bool isMemoryIndexed32() const { return isMemoryIndexed(4); }
+ bool isMemoryIndexed16() const { return isMemoryIndexed(2); }
+ bool isMemoryIndexed8() const { return isMemoryIndexed(1); }
+ bool isMemoryNoIndex() const {
+ if (!isMem())
+ return false;
+ if (Mem.Mode != ImmediateOffset)
+ return false;
+ if (!Mem.OffsetImm)
+ return true;
+
+ // Make sure the immediate value is valid. Only zero is allowed.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
+ if (!CE || CE->getValue() != 0)
+ return false;
+ return true;
+ }
+ bool isMemorySIMDNoIndex() const {
+ if (!isMem())
+ return false;
+ if (Mem.Mode != ImmediateOffset)
+ return false;
+ return Mem.OffsetImm == 0;
+ }
+ bool isMemoryIndexedSImm9() const {
+ if (!isMem() || Mem.Mode != ImmediateOffset)
+ return false;
+ if (!Mem.OffsetImm)
+ return true;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
+ assert(CE && "Non-constant pre-indexed offset!");
+ int64_t Value = CE->getValue();
+ return Value >= -256 && Value <= 255;
+ }
+ bool isMemoryIndexed32SImm7() const {
+ if (!isMem() || Mem.Mode != ImmediateOffset)
+ return false;
+ if (!Mem.OffsetImm)
+ return true;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
+ assert(CE && "Non-constant pre-indexed offset!");
+ int64_t Value = CE->getValue();
+ return ((Value % 4) == 0) && Value >= -256 && Value <= 252;
+ }
+ bool isMemoryIndexed64SImm7() const {
+ if (!isMem() || Mem.Mode != ImmediateOffset)
+ return false;
+ if (!Mem.OffsetImm)
+ return true;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
+ assert(CE && "Non-constant pre-indexed offset!");
+ int64_t Value = CE->getValue();
+ return ((Value % 8) == 0) && Value >= -512 && Value <= 504;
+ }
+ bool isMemoryIndexed128SImm7() const {
+ if (!isMem() || Mem.Mode != ImmediateOffset)
+ return false;
+ if (!Mem.OffsetImm)
+ return true;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
+ assert(CE && "Non-constant pre-indexed offset!");
+ int64_t Value = CE->getValue();
+ return ((Value % 16) == 0) && Value >= -1024 && Value <= 1008;
+ }
+
+ bool isAdrpLabel() const {
+ // Validation was handled during parsing, so we just sanity check that
+ // something didn't go haywire.
+ return isImm();
+ }
+
+ bool isAdrLabel() const {
+ // Validation was handled during parsing, so we just sanity check that
+ // something didn't go haywire.
+ return isImm();
+ }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+ // Add as immediates when possible. A null MCExpr is treated as 0.
+ if (Expr == 0)
+ Inst.addOperand(MCOperand::CreateImm(0));
+ else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(Expr));
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+
+ void addVectorRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+
+ template <unsigned NumRegs>
+ void addVectorList64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
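+ // Vector lists are parsed using Q-register numbering (v0 == Q0), so
+ // rebase the start register onto the corresponding D-register tuple.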
+ static unsigned FirstRegs[] = { ARM64::D0, ARM64::D0_D1,
+ ARM64::D0_D1_D2, ARM64::D0_D1_D2_D3 };
+ unsigned FirstReg = FirstRegs[NumRegs - 1];
+
+ Inst.addOperand(
+ MCOperand::CreateReg(FirstReg + getVectorListStart() - ARM64::Q0));
+ }
+
+ template <unsigned NumRegs>
+ void addVectorList128Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ static unsigned FirstRegs[] = { ARM64::Q0, ARM64::Q0_Q1,
+ ARM64::Q0_Q1_Q2, ARM64::Q0_Q1_Q2_Q3 };
+ unsigned FirstReg = FirstRegs[NumRegs - 1];
+
+ Inst.addOperand(
+ MCOperand::CreateReg(FirstReg + getVectorListStart() - ARM64::Q0));
+ }
+
+ void addVectorIndexBOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+ }
+
+ void addVectorIndexHOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+ }
+
+ void addVectorIndexSOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+ }
+
+ void addVectorIndexDOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // If this is a pageoff symrefexpr with an addend, adjust the addend
+ // to be only the page-offset portion. Otherwise, just add the expr
+ // as-is.
+ addExpr(Inst, getImm());
+ }
+
+ void addAdrpLabelOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void addSImm9Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addSImm7s4Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4));
+ }
+
+ void addSImm7s8Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8));
+ }
+
+ void addSImm7s16Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16));
+ }
+
+ void addImm0_7Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm1_8Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm0_15Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm1_16Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm0_31Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm1_31Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm1_32Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm0_63Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm1_63Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm1_64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm0_127Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm0_255Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+ }
+
+ void addLogicalImm32Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid logical immediate operand!");
+ uint64_t encoding = ARM64_AM::encodeLogicalImmediate(MCE->getValue(), 32);
+ Inst.addOperand(MCOperand::CreateImm(encoding));
+ }
+
+ void addLogicalImm64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid logical immediate operand!");
+ uint64_t encoding = ARM64_AM::encodeLogicalImmediate(MCE->getValue(), 64);
+ Inst.addOperand(MCOperand::CreateImm(encoding));
+ }
+
+ void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ assert(MCE && "Invalid immediate operand!");
+ uint64_t encoding = ARM64_AM::encodeAdvSIMDModImmType10(MCE->getValue());
+ Inst.addOperand(MCOperand::CreateImm(encoding));
+ }
+
+ void addBranchTarget26Operands(MCInst &Inst, unsigned N) const {
+ // Branch operands don't encode the low bits, so shift them off
+ // here. If it's a label, however, just put it on directly as there's
+ // not enough information now to do anything.
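+ // (A64 instructions are always 4 bytes, so branch offsets are a multiple
+ // of 4 and the bottom two bits are implicitly zero.)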
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE) {
+ addExpr(Inst, getImm());
+ return;
+ }
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
+ }
+
+ void addBranchTarget19Operands(MCInst &Inst, unsigned N) const {
+ // Branch operands don't encode the low bits, so shift them off
+ // here. If it's a label, however, just put it on directly as there's
+ // not enough information now to do anything.
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE) {
+ addExpr(Inst, getImm());
+ return;
+ }
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
+ }
+
+ void addBranchTarget14Operands(MCInst &Inst, unsigned N) const {
+ // Branch operands don't encode the low bits, so shift them off
+ // here. If it's a label, however, just put it on directly as there's
+ // not enough information now to do anything.
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE) {
+ addExpr(Inst, getImm());
+ return;
+ }
+ assert(MCE && "Invalid constant immediate operand!");
+ Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
+ }
+
+ void addFPImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getFPImm()));
+ }
+
+ void addBarrierOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getBarrier()));
+ }
+
+ void addSystemRegisterOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (Kind == k_SystemRegister)
+ Inst.addOperand(MCOperand::CreateImm(getSystemRegister()));
+ else {
+ assert(Kind == k_CPSRField && getCPSRField() == ARM64SYS::cpsr_SPSel);
+ Inst.addOperand(MCOperand::CreateImm(ARM64SYS::SPSel));
+ }
+ }
+
+ void addSystemCPSRFieldOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getCPSRField()));
+ }
+
+ void addSysCROperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getSysCR()));
+ }
+
+ void addPrefetchOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getPrefetch()));
+ }
+
+ void addShifterOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getShifter()));
+ }
+
+ void addArithmeticShifterOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getShifter()));
+ }
+
+ void addMovImm32ShifterOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getShifter()));
+ }
+
+ void addMovImm64ShifterOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getShifter()));
+ }
+
+ void addAddSubShifterOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getShifter()));
+ }
+
+ void addLogicalVecShifterOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getShifter()));
+ }
+
+ void addLogicalVecHalfWordShifterOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getShifter()));
+ }
+
+ void addMoveVecShifterOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getShifter()));
+ }
+
+ void addExtendOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // lsl is an alias for UXTX but will be parsed as a k_Shifter operand.
+ if (isShifter()) {
+ assert(ARM64_AM::getShiftType(getShifter()) == ARM64_AM::LSL);
+ unsigned imm = getArithExtendImm(ARM64_AM::UXTX,
+ ARM64_AM::getShiftValue(getShifter()));
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ } else
+ Inst.addOperand(MCOperand::CreateImm(getExtend()));
+ }
+
+ void addExtend64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getExtend()));
+ }
+
+ void addExtendLSL64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // lsl is an alias for UXTX but will be parsed as a k_Shifter operand.
+ if (isShifter()) {
+ assert(ARM64_AM::getShiftType(getShifter()) == ARM64_AM::LSL);
+ unsigned imm = getArithExtendImm(ARM64_AM::UXTX,
+ ARM64_AM::getShiftValue(getShifter()));
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ } else
+ Inst.addOperand(MCOperand::CreateImm(getExtend()));
+ }
+
+ void addMemoryRegisterOffsetOperands(MCInst &Inst, unsigned N, bool DoShift) {
+ assert(N == 3 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateReg(Mem.OffsetRegNum));
+ unsigned ExtendImm = ARM64_AM::getMemExtendImm(Mem.ExtType, DoShift);
+ Inst.addOperand(MCOperand::CreateImm(ExtendImm));
+ }
+
+ void addMemoryRegisterOffset8Operands(MCInst &Inst, unsigned N) {
+ addMemoryRegisterOffsetOperands(Inst, N, Mem.ExplicitShift);
+ }
+
+ void addMemoryRegisterOffset16Operands(MCInst &Inst, unsigned N) {
+ addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 1);
+ }
+
+ void addMemoryRegisterOffset32Operands(MCInst &Inst, unsigned N) {
+ addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 2);
+ }
+
+ void addMemoryRegisterOffset64Operands(MCInst &Inst, unsigned N) {
+ addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 3);
+ }
+
+ void addMemoryRegisterOffset128Operands(MCInst &Inst, unsigned N) {
+ addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 4);
+ }
+
+ void addMemoryIndexedOperands(MCInst &Inst, unsigned N,
+ unsigned Scale) const {
+ // Add the base register operand.
+ Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum));
+
+ if (!Mem.OffsetImm) {
+ // There isn't an offset.
+ Inst.addOperand(MCOperand::CreateImm(0));
+ return;
+ }
+
+ // Add the offset operand.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm)) {
+ assert(CE->getValue() % Scale == 0 &&
+ "Offset operand must be multiple of the scale!");
+
+ // The MCInst offset operand doesn't include the low bits (like the
+ // instruction encoding).
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue() / Scale));
+ }
+
+ // If this is a pageoff symrefexpr with an addend, the linker will
+ // do the scaling of the addend.
+ //
+ // Otherwise we don't know what this is, so just add the scaling divide to
+ // the expression and let the MC fixup evaluation code deal with it.
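+ // For example, a 64-bit load (Scale == 8) whose offset is the symbolic
+ // expression "end - start" gets the operand "(end - start) / 8".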
+ const MCExpr *Expr = Mem.OffsetImm;
+ ARM64MCExpr::VariantKind ELFRefKind;
+ MCSymbolRefExpr::VariantKind DarwinRefKind;
+ const MCConstantExpr *Addend;
+ if (Scale > 1 &&
+ (!ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind,
+ Addend) ||
+ (Addend != 0 && DarwinRefKind != MCSymbolRefExpr::VK_PAGEOFF))) {
+ Expr = MCBinaryExpr::CreateDiv(Expr, MCConstantExpr::Create(Scale, Ctx),
+ Ctx);
+ }
+
+ Inst.addOperand(MCOperand::CreateExpr(Expr));
+ }
+
+ void addMemoryUnscaledOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && isMemoryUnscaled() && "Invalid number of operands!");
+ // Add the base register operand.
+ Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum));
+
+ // Add the offset operand.
+ if (!Mem.OffsetImm)
+ Inst.addOperand(MCOperand::CreateImm(0));
+ else {
+ // Only constant offsets supported.
+ const MCConstantExpr *CE = cast<MCConstantExpr>(Mem.OffsetImm);
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ }
+ }
+
+ void addMemoryIndexed128Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && isMemoryIndexed128() && "Invalid number of operands!");
+ addMemoryIndexedOperands(Inst, N, 16);
+ }
+
+ void addMemoryIndexed64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && isMemoryIndexed64() && "Invalid number of operands!");
+ addMemoryIndexedOperands(Inst, N, 8);
+ }
+
+ void addMemoryIndexed32Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && isMemoryIndexed32() && "Invalid number of operands!");
+ addMemoryIndexedOperands(Inst, N, 4);
+ }
+
+ void addMemoryIndexed16Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && isMemoryIndexed16() && "Invalid number of operands!");
+ addMemoryIndexedOperands(Inst, N, 2);
+ }
+
+ void addMemoryIndexed8Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && isMemoryIndexed8() && "Invalid number of operands!");
+ addMemoryIndexedOperands(Inst, N, 1);
+ }
+
+ void addMemoryNoIndexOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && isMemoryNoIndex() && "Invalid number of operands!");
+ // Add the base register operand (the offset is always zero, so ignore it).
+ Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum));
+ }
+
+ void addMemorySIMDNoIndexOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && isMemorySIMDNoIndex() && "Invalid number of operands!");
+ // Add the base register operand (the offset is always zero, so ignore it).
+ Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum));
+ }
+
+ void addMemoryWritebackIndexedOperands(MCInst &Inst, unsigned N,
+ unsigned Scale) const {
+ assert(N == 2 && "Invalid number of operands!");
+
+ // Add the base register operand.
+ Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum));
+
+ // Add the offset operand.
+ int64_t Offset = 0;
+ if (Mem.OffsetImm) {
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
+ assert(CE && "Non-constant indexed offset operand!");
+ Offset = CE->getValue();
+ }
+
+ if (Scale != 1) {
+ assert(Offset % Scale == 0 &&
+ "Offset operand must be a multiple of the scale!");
+ Offset /= Scale;
+ }
+
+ Inst.addOperand(MCOperand::CreateImm(Offset));
+ }
+
+ void addMemoryIndexedSImm9Operands(MCInst &Inst, unsigned N) const {
+ addMemoryWritebackIndexedOperands(Inst, N, 1);
+ }
+
+ void addMemoryIndexed32SImm7Operands(MCInst &Inst, unsigned N) const {
+ addMemoryWritebackIndexedOperands(Inst, N, 4);
+ }
+
+ void addMemoryIndexed64SImm7Operands(MCInst &Inst, unsigned N) const {
+ addMemoryWritebackIndexedOperands(Inst, N, 8);
+ }
+
+ void addMemoryIndexed128SImm7Operands(MCInst &Inst, unsigned N) const {
+ addMemoryWritebackIndexedOperands(Inst, N, 16);
+ }
+
+ virtual void print(raw_ostream &OS) const;
+
+ static ARM64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S,
+ MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_Token, Ctx);
+ Op->Tok.Data = Str.data();
+ Op->Tok.Length = Str.size();
+ Op->Tok.IsSuffix = IsSuffix;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static ARM64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S,
+ SMLoc E, MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_Register, Ctx);
+ Op->Reg.RegNum = RegNum;
+ Op->Reg.isVector = isVector;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static ARM64Operand *CreateVectorList(unsigned RegNum, unsigned Count,
+ unsigned NumElements, char ElementKind,
+ SMLoc S, SMLoc E, MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_VectorList, Ctx);
+ Op->VectorList.RegNum = RegNum;
+ Op->VectorList.Count = Count;
+ Op->VectorList.NumElements = NumElements;
+ Op->VectorList.ElementKind = ElementKind;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static ARM64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E,
+ MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_VectorIndex, Ctx);
+ Op->VectorIndex.Val = Idx;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static ARM64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E,
+ MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_Immediate, Ctx);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static ARM64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_FPImm, Ctx);
+ Op->FPImm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static ARM64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_Barrier, Ctx);
+ Op->Barrier.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static ARM64Operand *CreateSystemRegister(uint16_t Val, SMLoc S,
+ MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_SystemRegister, Ctx);
+ Op->SystemRegister.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static ARM64Operand *CreateCPSRField(ARM64SYS::CPSRField Field, SMLoc S,
+ MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_CPSRField, Ctx);
+ Op->CPSRField.Field = Field;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static ARM64Operand *CreateMem(unsigned BaseRegNum, const MCExpr *Off,
+ SMLoc S, SMLoc E, SMLoc OffsetLoc,
+ MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_Memory, Ctx);
+ Op->Mem.BaseRegNum = BaseRegNum;
+ Op->Mem.OffsetRegNum = 0;
+ Op->Mem.OffsetImm = Off;
+ Op->Mem.ExtType = ARM64_AM::UXTX;
+ Op->Mem.ShiftVal = 0;
+ Op->Mem.ExplicitShift = false;
+ Op->Mem.Mode = ImmediateOffset;
+ Op->OffsetLoc = OffsetLoc;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static ARM64Operand *CreateRegOffsetMem(unsigned BaseReg, unsigned OffsetReg,
+ ARM64_AM::ExtendType ExtType,
+ unsigned ShiftVal, bool ExplicitShift,
+ SMLoc S, SMLoc E, MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_Memory, Ctx);
+ Op->Mem.BaseRegNum = BaseReg;
+ Op->Mem.OffsetRegNum = OffsetReg;
+ Op->Mem.OffsetImm = 0;
+ Op->Mem.ExtType = ExtType;
+ Op->Mem.ShiftVal = ShiftVal;
+ Op->Mem.ExplicitShift = ExplicitShift;
+ Op->Mem.Mode = RegisterOffset;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static ARM64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E,
+ MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_SysCR, Ctx);
+ Op->SysCRImm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static ARM64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_Prefetch, Ctx);
+ Op->Prefetch.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static ARM64Operand *CreateShifter(ARM64_AM::ShiftType ShOp, unsigned Val,
+ SMLoc S, SMLoc E, MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_Shifter, Ctx);
+ Op->Shifter.Val = ARM64_AM::getShifterImm(ShOp, Val);
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static ARM64Operand *CreateExtend(ARM64_AM::ExtendType ExtOp, unsigned Val,
+ SMLoc S, SMLoc E, MCContext &Ctx) {
+ ARM64Operand *Op = new ARM64Operand(k_Extend, Ctx);
+ Op->Extend.Val = ARM64_AM::getArithExtendImm(ExtOp, Val);
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+};
+
+} // end anonymous namespace.
+
+void ARM64Operand::print(raw_ostream &OS) const {
+ switch (Kind) {
+ case k_FPImm:
+ OS << "<fpimm " << getFPImm() << "(" << ARM64_AM::getFPImmFloat(getFPImm())
+ << ") >";
+ break;
+ case k_Barrier: {
+ const char *Name =
+ ARM64SYS::getBarrierOptName((ARM64SYS::BarrierOption)getBarrier());
+ OS << "<barrier ";
+ if (Name)
+ OS << Name;
+ else
+ OS << getBarrier();
+ OS << ">";
+ break;
+ }
+ case k_SystemRegister: {
+ const char *Name = ARM64SYS::getSystemRegisterName(
+ (ARM64SYS::SystemRegister)getSystemRegister());
+ OS << "<systemreg ";
+ if (Name)
+ OS << Name;
+ else
+ OS << "#" << getSystemRegister();
+ OS << ">";
+ break;
+ }
+ case k_CPSRField: {
+ const char *Name = ARM64SYS::getCPSRFieldName(getCPSRField());
+ OS << "<cpsrfield " << Name << ">";
+ break;
+ }
+ case k_Immediate:
+ getImm()->print(OS);
+ break;
+ case k_Memory:
+ OS << "<memory>";
+ break;
+ case k_Register:
+ OS << "<register " << getReg() << ">";
+ break;
+ case k_VectorList: {
+ OS << "<vectorlist ";
+ unsigned Reg = getVectorListStart();
+ for (unsigned i = 0, e = getVectorListCount(); i != e; ++i)
+ OS << Reg + i << " ";
+ OS << ">";
+ break;
+ }
+ case k_VectorIndex:
+ OS << "<vectorindex " << getVectorIndex() << ">";
+ break;
+ case k_Token:
+ OS << "'" << getToken() << "'";
+ break;
+ case k_SysCR:
+ OS << "c" << getSysCR();
+ break;
+ case k_Prefetch:
+ OS << "<prfop ";
+ if (ARM64_AM::isNamedPrefetchOp(getPrefetch()))
+ OS << ARM64_AM::getPrefetchOpName((ARM64_AM::PrefetchOp)getPrefetch());
+ else
+ OS << "#" << getPrefetch();
+ OS << ">";
+ break;
+ case k_Shifter: {
+ unsigned Val = getShifter();
+ OS << "<" << ARM64_AM::getShiftName(ARM64_AM::getShiftType(Val)) << " #"
+ << ARM64_AM::getShiftValue(Val) << ">";
+ break;
+ }
+ case k_Extend: {
+ unsigned Val = getExtend();
+ OS << "<" << ARM64_AM::getExtendName(ARM64_AM::getArithExtendType(Val))
+ << " #" << ARM64_AM::getArithShiftValue(Val) << ">";
+ break;
+ }
+ }
+}
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+static unsigned matchVectorRegName(StringRef Name) {
+ return StringSwitch<unsigned>(Name)
+ .Case("v0", ARM64::Q0)
+ .Case("v1", ARM64::Q1)
+ .Case("v2", ARM64::Q2)
+ .Case("v3", ARM64::Q3)
+ .Case("v4", ARM64::Q4)
+ .Case("v5", ARM64::Q5)
+ .Case("v6", ARM64::Q6)
+ .Case("v7", ARM64::Q7)
+ .Case("v8", ARM64::Q8)
+ .Case("v9", ARM64::Q9)
+ .Case("v10", ARM64::Q10)
+ .Case("v11", ARM64::Q11)
+ .Case("v12", ARM64::Q12)
+ .Case("v13", ARM64::Q13)
+ .Case("v14", ARM64::Q14)
+ .Case("v15", ARM64::Q15)
+ .Case("v16", ARM64::Q16)
+ .Case("v17", ARM64::Q17)
+ .Case("v18", ARM64::Q18)
+ .Case("v19", ARM64::Q19)
+ .Case("v20", ARM64::Q20)
+ .Case("v21", ARM64::Q21)
+ .Case("v22", ARM64::Q22)
+ .Case("v23", ARM64::Q23)
+ .Case("v24", ARM64::Q24)
+ .Case("v25", ARM64::Q25)
+ .Case("v26", ARM64::Q26)
+ .Case("v27", ARM64::Q27)
+ .Case("v28", ARM64::Q28)
+ .Case("v29", ARM64::Q29)
+ .Case("v30", ARM64::Q30)
+ .Case("v31", ARM64::Q31)
+ .Default(0);
+}
+
+static bool isValidVectorKind(StringRef Name) {
+ return StringSwitch<bool>(Name.lower())
+ .Case(".8b", true)
+ .Case(".16b", true)
+ .Case(".4h", true)
+ .Case(".8h", true)
+ .Case(".2s", true)
+ .Case(".4s", true)
+ .Case(".1d", true)
+ .Case(".2d", true)
+ .Case(".1q", true)
+ // Accept the width neutral ones, too, for verbose syntax. If those
+ // aren't used in the right places, the token operand won't match so
+ // all will work out.
+ .Case(".b", true)
+ .Case(".h", true)
+ .Case(".s", true)
+ .Case(".d", true)
+ .Default(false);
+}
+
+static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
+ char &ElementKind) {
+ assert(isValidVectorKind(Name));
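+ // e.g. ".16b" yields NumElements == 16 and ElementKind == 'b'; a
+ // width-neutral kind such as ".b" leaves NumElements as 0.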
+
+ ElementKind = Name.lower()[Name.size() - 1];
+ NumElements = 0;
+
+ if (Name.size() == 2)
+ return;
+
+ // Parse the lane count
+ Name = Name.drop_front();
+ while (isdigit(Name.front())) {
+ NumElements = 10 * NumElements + (Name.front() - '0');
+ Name = Name.drop_front();
+ }
+}
+
+bool ARM64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ StartLoc = getLoc();
+ RegNo = tryParseRegister();
+ EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ return (RegNo == (unsigned)-1);
+}
+
+/// tryParseRegister - Try to parse a register name. The token must be an
+/// Identifier when called; if it is a register name the token is eaten and
+/// the register number is returned, otherwise -1 is returned.
+int ARM64AsmParser::tryParseRegister() {
+ const AsmToken &Tok = Parser.getTok();
+ assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+ std::string lowerCase = Tok.getString().lower();
+ unsigned RegNum = MatchRegisterName(lowerCase);
+ // Also handle a few aliases of registers.
+ if (RegNum == 0)
+ RegNum = StringSwitch<unsigned>(lowerCase)
+ .Case("x29", ARM64::FP)
+ .Case("x30", ARM64::LR)
+ .Case("x31", ARM64::XZR)
+ .Case("w31", ARM64::WZR)
+ .Default(0);
+
+ if (RegNum == 0)
+ return -1;
+
+ Parser.Lex(); // Eat identifier token.
+ return RegNum;
+}
+
+/// tryMatchVectorRegister - Try to parse a vector register name with optional
+/// kind specifier. If it is a register specifier, eat the token and return it.
+int ARM64AsmParser::tryMatchVectorRegister(StringRef &Kind) {
+ if (Parser.getTok().isNot(AsmToken::Identifier)) {
+ TokError("vector register expected");
+ return -1;
+ }
+
+ StringRef Name = Parser.getTok().getString();
+ // If there is a kind specifier, it's separated from the register name by
+ // a '.'.
+ size_t Start = 0, Next = Name.find('.');
+ StringRef Head = Name.slice(Start, Next);
+ unsigned RegNum = matchVectorRegName(Head);
+ if (RegNum) {
+ if (Next != StringRef::npos) {
+ Kind = Name.slice(Next, StringRef::npos);
+ if (!isValidVectorKind(Kind)) {
+ TokError("invalid vector kind qualifier");
+ return -1;
+ }
+ }
+ Parser.Lex(); // Eat the register token.
+ return RegNum;
+ }
+ return -1;
+}
+
+static int MatchSysCRName(StringRef Name) {
+ // Use the same layout as the tablegen'erated register name matcher. Ugly,
+ // but efficient.
+ switch (Name.size()) {
+ default:
+ break;
+ case 2:
+ if (Name[0] != 'c' && Name[0] != 'C')
+ return -1;
+ switch (Name[1]) {
+ default:
+ return -1;
+ case '0':
+ return 0;
+ case '1':
+ return 1;
+ case '2':
+ return 2;
+ case '3':
+ return 3;
+ case '4':
+ return 4;
+ case '5':
+ return 5;
+ case '6':
+ return 6;
+ case '7':
+ return 7;
+ case '8':
+ return 8;
+ case '9':
+ return 9;
+ }
+ break;
+ case 3:
+ if ((Name[0] != 'c' && Name[0] != 'C') || Name[1] != '1')
+ return -1;
+ switch (Name[2]) {
+ default:
+ return -1;
+ case '0':
+ return 10;
+ case '1':
+ return 11;
+ case '2':
+ return 12;
+ case '3':
+ return 13;
+ case '4':
+ return 14;
+ case '5':
+ return 15;
+ }
+ break;
+ }
+
+ // Anything else is not a system-instruction CR operand name.
+ return -1;
+}
+
+/// tryParseSysCROperand - Try to parse a system instruction CR operand name.
+ARM64AsmParser::OperandMatchResultTy
+ARM64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.isNot(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
+
+ int Num = MatchSysCRName(Tok.getString());
+ if (Num == -1)
+ return MatchOperand_NoMatch;
+
+ Parser.Lex(); // Eat identifier token.
+ Operands.push_back(ARM64Operand::CreateSysCR(Num, S, getLoc(), getContext()));
+ return MatchOperand_Success;
+}
+
+/// tryParsePrefetch - Try to parse a prefetch operand.
+ARM64AsmParser::OperandMatchResultTy
+ARM64AsmParser::tryParsePrefetch(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ const AsmToken &Tok = Parser.getTok();
+ // Either an identifier for named values or a 5-bit immediate.
+ if (Tok.is(AsmToken::Hash)) {
+ Parser.Lex(); // Eat hash token.
+ const MCExpr *ImmVal;
+ if (getParser().parseExpression(ImmVal))
+ return MatchOperand_ParseFail;
+
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ TokError("immediate value expected for prefetch operand");
+ return MatchOperand_ParseFail;
+ }
+ unsigned prfop = MCE->getValue();
+ if (prfop > 31) {
+ TokError("prefetch operand out of range, [0,31] expected");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(ARM64Operand::CreatePrefetch(prfop, S, getContext()));
+ return MatchOperand_Success;
+ }
+
+ if (Tok.isNot(AsmToken::Identifier)) {
+ TokError("pre-fetch hint expected");
+ return MatchOperand_ParseFail;
+ }
+
+ unsigned prfop = StringSwitch<unsigned>(Tok.getString())
+ .Case("pldl1keep", ARM64_AM::PLDL1KEEP)
+ .Case("pldl1strm", ARM64_AM::PLDL1STRM)
+ .Case("pldl2keep", ARM64_AM::PLDL2KEEP)
+ .Case("pldl2strm", ARM64_AM::PLDL2STRM)
+ .Case("pldl3keep", ARM64_AM::PLDL3KEEP)
+ .Case("pldl3strm", ARM64_AM::PLDL3STRM)
+ .Case("pstl1keep", ARM64_AM::PSTL1KEEP)
+ .Case("pstl1strm", ARM64_AM::PSTL1STRM)
+ .Case("pstl2keep", ARM64_AM::PSTL2KEEP)
+ .Case("pstl2strm", ARM64_AM::PSTL2STRM)
+ .Case("pstl3keep", ARM64_AM::PSTL3KEEP)
+ .Case("pstl3strm", ARM64_AM::PSTL3STRM)
+ .Default(0xff);
+ if (prfop == 0xff) {
+ TokError("pre-fetch hint expected");
+ return MatchOperand_ParseFail;
+ }
+
+ Parser.Lex(); // Eat identifier token.
+ Operands.push_back(ARM64Operand::CreatePrefetch(prfop, S, getContext()));
+ return MatchOperand_Success;
+}
+
+/// tryParseAdrpLabel - Parse and validate a source label for the ADRP
+/// instruction.
+ARM64AsmParser::OperandMatchResultTy
+ARM64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ const MCExpr *Expr;
+ if (parseSymbolicImmVal(Expr))
+ return MatchOperand_ParseFail;
+
+ ARM64MCExpr::VariantKind ELFRefKind;
+ MCSymbolRefExpr::VariantKind DarwinRefKind;
+ const MCConstantExpr *Addend;
+ if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
+ Error(S, "modified label reference + constant expected");
+ return MatchOperand_ParseFail;
+ }
+
+ if (DarwinRefKind == MCSymbolRefExpr::VK_None &&
+ ELFRefKind == ARM64MCExpr::VK_INVALID) {
+ // No modifier was specified at all; this is the syntax for an ELF basic
+ // ADRP relocation (unfortunately).
+ Expr = ARM64MCExpr::Create(Expr, ARM64MCExpr::VK_ABS_PAGE, getContext());
+ } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE ||
+ DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) &&
+ Addend != 0) {
+ Error(S, "gotpage label reference not allowed an addend");
+ return MatchOperand_ParseFail;
+ } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE &&
+ DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE &&
+ DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE &&
+ ELFRefKind != ARM64MCExpr::VK_GOT_PAGE &&
+ ELFRefKind != ARM64MCExpr::VK_GOTTPREL_PAGE &&
+ ELFRefKind != ARM64MCExpr::VK_TLSDESC_PAGE) {
+ // The operand must be an @page or @gotpage qualified symbolref.
+ Error(S, "page or gotpage label reference expected");
+ return MatchOperand_ParseFail;
+ }
+
+ // We have a label reference possibly with addend. The addend is a raw value
+ // here. The linker will adjust it to only reference the page.
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext()));
+
+ return MatchOperand_Success;
+}
+
+/// tryParseAdrLabel - Parse and validate a source label for the ADR
+/// instruction.
+ARM64AsmParser::OperandMatchResultTy
+ARM64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ const MCExpr *Expr;
+ if (getParser().parseExpression(Expr))
+ return MatchOperand_ParseFail;
+
+ // The operand must be an un-qualified assembler local symbolref.
+ // FIXME: wrong for ELF.
+ if (const MCSymbolRefExpr *SRE = dyn_cast<const MCSymbolRefExpr>(Expr)) {
+ // FIXME: Should reference the MachineAsmInfo to get the private prefix.
+ bool isTemporary = SRE->getSymbol().getName().startswith("L");
+ if (!isTemporary || SRE->getKind() != MCSymbolRefExpr::VK_None) {
+ Error(S, "unqualified, assembler-local label name expected");
+ return MatchOperand_ParseFail;
+ }
+ }
+
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext()));
+
+ return MatchOperand_Success;
+}
+
+/// tryParseFPImm - A floating point immediate expression operand.
+ARM64AsmParser::OperandMatchResultTy
+ARM64AsmParser::tryParseFPImm(OperandVector &Operands) {
+ SMLoc S = getLoc();
+
+ if (Parser.getTok().isNot(AsmToken::Hash))
+ return MatchOperand_NoMatch;
+ Parser.Lex(); // Eat the '#'.
+
+ // Handle negation, as that still comes through as a separate token.
+ bool isNegative = false;
+ if (Parser.getTok().is(AsmToken::Minus)) {
+ isNegative = true;
+ Parser.Lex();
+ }
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.is(AsmToken::Real)) {
+ APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+ uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+ // If we had a '-' in front, toggle the sign bit.
+ IntVal ^= (uint64_t)isNegative << 63;
+ int Val = ARM64_AM::getFP64Imm(APInt(64, IntVal));
+ Parser.Lex(); // Eat the token.
+ // Check for out of range values. As an exception, we let Zero through,
+ // as we handle that special case in post-processing before matching in
+ // order to use the zero register for it.
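+ // (For example, #2.0 is encodable in the 8-bit immediate form; #3.1415
+ // is not.)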
+ if (Val == -1 && !RealVal.isZero()) {
+ TokError("floating point value out of range");
+ return MatchOperand_ParseFail;
+ }
+ Operands.push_back(ARM64Operand::CreateFPImm(Val, S, getContext()));
+ return MatchOperand_Success;
+ }
+ if (Tok.is(AsmToken::Integer)) {
+ int64_t Val;
+ if (!isNegative && Tok.getString().startswith("0x")) {
+ Val = Tok.getIntVal();
+ if (Val > 255 || Val < 0) {
+ TokError("encoded floating point value out of range");
+ return MatchOperand_ParseFail;
+ }
+ } else {
+ APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+ uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+ // If we had a '-' in front, toggle the sign bit.
+ IntVal ^= (uint64_t)isNegative << 63;
+ Val = ARM64_AM::getFP64Imm(APInt(64, IntVal));
+ }
+ Parser.Lex(); // Eat the token.
+ Operands.push_back(ARM64Operand::CreateFPImm(Val, S, getContext()));
+ return MatchOperand_Success;
+ }
+
+ TokError("invalid floating point immediate");
+ return MatchOperand_ParseFail;
+}
+
+/// parseCondCodeString - Parse a Condition Code string.
+unsigned ARM64AsmParser::parseCondCodeString(StringRef Cond) {
+ unsigned CC = StringSwitch<unsigned>(Cond)
+ .Case("eq", ARM64CC::EQ)
+ .Case("ne", ARM64CC::NE)
+ .Case("cs", ARM64CC::CS)
+ .Case("hs", ARM64CC::CS)
+ .Case("cc", ARM64CC::CC)
+ .Case("lo", ARM64CC::CC)
+ .Case("mi", ARM64CC::MI)
+ .Case("pl", ARM64CC::PL)
+ .Case("vs", ARM64CC::VS)
+ .Case("vc", ARM64CC::VC)
+ .Case("hi", ARM64CC::HI)
+ .Case("ls", ARM64CC::LS)
+ .Case("ge", ARM64CC::GE)
+ .Case("lt", ARM64CC::LT)
+ .Case("gt", ARM64CC::GT)
+ .Case("le", ARM64CC::LE)
+ .Case("al", ARM64CC::AL)
+ // Upper case works too. Not mixed case, though.
+ .Case("EQ", ARM64CC::EQ)
+ .Case("NE", ARM64CC::NE)
+ .Case("CS", ARM64CC::CS)
+ .Case("HS", ARM64CC::CS)
+ .Case("CC", ARM64CC::CC)
+ .Case("LO", ARM64CC::CC)
+ .Case("MI", ARM64CC::MI)
+ .Case("PL", ARM64CC::PL)
+ .Case("VS", ARM64CC::VS)
+ .Case("VC", ARM64CC::VC)
+ .Case("HI", ARM64CC::HI)
+ .Case("LS", ARM64CC::LS)
+ .Case("GE", ARM64CC::GE)
+ .Case("LT", ARM64CC::LT)
+ .Case("GT", ARM64CC::GT)
+ .Case("LE", ARM64CC::LE)
+ .Case("AL", ARM64CC::AL)
+ .Default(~0U);
+ return CC;
+}
+
+/// parseCondCode - Parse a Condition Code operand.
+bool ARM64AsmParser::parseCondCode(OperandVector &Operands,
+ bool invertCondCode) {
+ SMLoc S = getLoc();
+ const AsmToken &Tok = Parser.getTok();
+ assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+ StringRef Cond = Tok.getString();
+ unsigned CC = parseCondCodeString(Cond);
+ if (CC == ~0U)
+ return TokError("invalid condition code");
+ Parser.Lex(); // Eat identifier token.
+
+ if (invertCondCode)
+ CC = ARM64CC::getInvertedCondCode(ARM64CC::CondCode(CC));
+
+ const MCExpr *CCExpr = MCConstantExpr::Create(CC, getContext());
+ Operands.push_back(
+ ARM64Operand::CreateImm(CCExpr, S, getLoc(), getContext()));
+ return false;
+}
+
+/// ParseOptionalShift - Some operands take an optional shift argument. Parse
+/// them if present.
+bool ARM64AsmParser::parseOptionalShift(OperandVector &Operands) {
+ const AsmToken &Tok = Parser.getTok();
+ ARM64_AM::ShiftType ShOp = StringSwitch<ARM64_AM::ShiftType>(Tok.getString())
+ .Case("lsl", ARM64_AM::LSL)
+ .Case("lsr", ARM64_AM::LSR)
+ .Case("asr", ARM64_AM::ASR)
+ .Case("ror", ARM64_AM::ROR)
+ .Case("msl", ARM64_AM::MSL)
+ .Case("LSL", ARM64_AM::LSL)
+ .Case("LSR", ARM64_AM::LSR)
+ .Case("ASR", ARM64_AM::ASR)
+ .Case("ROR", ARM64_AM::ROR)
+ .Case("MSL", ARM64_AM::MSL)
+ .Default(ARM64_AM::InvalidShift);
+ if (ShOp == ARM64_AM::InvalidShift)
+ return true;
+
+ SMLoc S = Tok.getLoc();
+ Parser.Lex();
+
+ // We expect a number here.
+ if (getLexer().isNot(AsmToken::Hash))
+ return TokError("immediate value expected for shifter operand");
+ Parser.Lex(); // Eat the '#'.
+
+ SMLoc ExprLoc = getLoc();
+ const MCExpr *ImmVal;
+ if (getParser().parseExpression(ImmVal))
+ return true;
+
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE)
+ return TokError("immediate value expected for shifter operand");
+
+ if ((MCE->getValue() & 0x3f) != MCE->getValue())
+ return Error(ExprLoc, "immediate value too large for shifter operand");
+
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(
+ ARM64Operand::CreateShifter(ShOp, MCE->getValue(), S, E, getContext()));
+ return false;
+}
+
+/// parseOptionalExtend - Some operands take an optional extend argument. Parse
+/// them if present.
+bool ARM64AsmParser::parseOptionalExtend(OperandVector &Operands) {
+ const AsmToken &Tok = Parser.getTok();
+ ARM64_AM::ExtendType ExtOp =
+ StringSwitch<ARM64_AM::ExtendType>(Tok.getString())
+ .Case("uxtb", ARM64_AM::UXTB)
+ .Case("uxth", ARM64_AM::UXTH)
+ .Case("uxtw", ARM64_AM::UXTW)
+ .Case("uxtx", ARM64_AM::UXTX)
+ .Case("lsl", ARM64_AM::UXTX) // Alias for UXTX
+ .Case("sxtb", ARM64_AM::SXTB)
+ .Case("sxth", ARM64_AM::SXTH)
+ .Case("sxtw", ARM64_AM::SXTW)
+ .Case("sxtx", ARM64_AM::SXTX)
+ .Case("UXTB", ARM64_AM::UXTB)
+ .Case("UXTH", ARM64_AM::UXTH)
+ .Case("UXTW", ARM64_AM::UXTW)
+ .Case("UXTX", ARM64_AM::UXTX)
+ .Case("LSL", ARM64_AM::UXTX) // Alias for UXTX
+ .Case("SXTB", ARM64_AM::SXTB)
+ .Case("SXTH", ARM64_AM::SXTH)
+ .Case("SXTW", ARM64_AM::SXTW)
+ .Case("SXTX", ARM64_AM::SXTX)
+ .Default(ARM64_AM::InvalidExtend);
+ if (ExtOp == ARM64_AM::InvalidExtend)
+ return true;
+
+ SMLoc S = Tok.getLoc();
+ Parser.Lex();
+
+ if (getLexer().is(AsmToken::EndOfStatement) ||
+ getLexer().is(AsmToken::Comma)) {
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(
+ ARM64Operand::CreateExtend(ExtOp, 0, S, E, getContext()));
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::Hash)) {
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(
+ ARM64Operand::CreateExtend(ExtOp, 0, S, E, getContext()));
+ return false;
+ }
+
+ Parser.Lex(); // Eat the '#'.
+
+ const MCExpr *ImmVal;
+ if (getParser().parseExpression(ImmVal))
+ return true;
+
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE)
+ return TokError("immediate value expected for extend operand");
+
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(
+ ARM64Operand::CreateExtend(ExtOp, MCE->getValue(), S, E, getContext()));
+ return false;
+}
+
+/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
+/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
+bool ARM64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
+ OperandVector &Operands) {
+ if (Name.find('.') != StringRef::npos)
+ return TokError("invalid operand");
+
+ Mnemonic = Name;
+ Operands.push_back(
+ ARM64Operand::CreateToken("sys", false, NameLoc, getContext()));
+
+ const AsmToken &Tok = Parser.getTok();
+ StringRef Op = Tok.getString();
+ SMLoc S = Tok.getLoc();
+
+ const MCExpr *Expr = 0;
+
+#define SYS_ALIAS(op1, Cn, Cm, op2) \
+ do { \
+ Expr = MCConstantExpr::Create(op1, getContext()); \
+ Operands.push_back( \
+ ARM64Operand::CreateImm(Expr, S, getLoc(), getContext())); \
+ Operands.push_back( \
+ ARM64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \
+ Operands.push_back( \
+ ARM64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \
+ Expr = MCConstantExpr::Create(op2, getContext()); \
+ Operands.push_back( \
+ ARM64Operand::CreateImm(Expr, S, getLoc(), getContext())); \
+ } while (0)
+
+ if (Mnemonic == "ic") {
+ if (!Op.compare_lower("ialluis")) {
+ // SYS #0, C7, C1, #0
+ SYS_ALIAS(0, 7, 1, 0);
+ } else if (!Op.compare_lower("iallu")) {
+ // SYS #0, C7, C5, #0
+ SYS_ALIAS(0, 7, 5, 0);
+ } else if (!Op.compare_lower("ivau")) {
+ // SYS #3, C7, C5, #1
+ SYS_ALIAS(3, 7, 5, 1);
+ } else {
+ return TokError("invalid operand for IC instruction");
+ }
+ } else if (Mnemonic == "dc") {
+ if (!Op.compare_lower("zva")) {
+ // SYS #3, C7, C4, #1
+ SYS_ALIAS(3, 7, 4, 1);
+ } else if (!Op.compare_lower("ivac")) {
+ // SYS #0, C7, C6, #1
+ SYS_ALIAS(0, 7, 6, 1);
+ } else if (!Op.compare_lower("isw")) {
+ // SYS #0, C7, C6, #2
+ SYS_ALIAS(0, 7, 6, 2);
+ } else if (!Op.compare_lower("cvac")) {
+ // SYS #3, C7, C10, #1
+ SYS_ALIAS(3, 7, 10, 1);
+ } else if (!Op.compare_lower("csw")) {
+ // SYS #0, C7, C10, #2
+ SYS_ALIAS(0, 7, 10, 2);
+ } else if (!Op.compare_lower("cvau")) {
+ // SYS #3, C7, C11, #1
+ SYS_ALIAS(3, 7, 11, 1);
+ } else if (!Op.compare_lower("civac")) {
+ // SYS #3, C7, C14, #1
+ SYS_ALIAS(3, 7, 14, 1);
+ } else if (!Op.compare_lower("cisw")) {
+ // SYS #0, C7, C14, #2
+ SYS_ALIAS(0, 7, 14, 2);
+ } else {
+ return TokError("invalid operand for DC instruction");
+ }
+ } else if (Mnemonic == "at") {
+ if (!Op.compare_lower("s1e1r")) {
+ // SYS #0, C7, C8, #0
+ SYS_ALIAS(0, 7, 8, 0);
+ } else if (!Op.compare_lower("s1e2r")) {
+ // SYS #4, C7, C8, #0
+ SYS_ALIAS(4, 7, 8, 0);
+ } else if (!Op.compare_lower("s1e3r")) {
+ // SYS #6, C7, C8, #0
+ SYS_ALIAS(6, 7, 8, 0);
+ } else if (!Op.compare_lower("s1e1w")) {
+ // SYS #0, C7, C8, #1
+ SYS_ALIAS(0, 7, 8, 1);
+ } else if (!Op.compare_lower("s1e2w")) {
+ // SYS #4, C7, C8, #1
+ SYS_ALIAS(4, 7, 8, 1);
+ } else if (!Op.compare_lower("s1e3w")) {
+ // SYS #6, C7, C8, #1
+ SYS_ALIAS(6, 7, 8, 1);
+ } else if (!Op.compare_lower("s1e0r")) {
+ // SYS #0, C7, C8, #2
+ SYS_ALIAS(0, 7, 8, 2);
+ } else if (!Op.compare_lower("s1e0w")) {
+ // SYS #0, C7, C8, #3
+ SYS_ALIAS(0, 7, 8, 3);
+ } else if (!Op.compare_lower("s12e1r")) {
+ // SYS #4, C7, C8, #4
+ SYS_ALIAS(4, 7, 8, 4);
+ } else if (!Op.compare_lower("s12e1w")) {
+ // SYS #4, C7, C8, #5
+ SYS_ALIAS(4, 7, 8, 5);
+ } else if (!Op.compare_lower("s12e0r")) {
+ // SYS #4, C7, C8, #6
+ SYS_ALIAS(4, 7, 8, 6);
+ } else if (!Op.compare_lower("s12e0w")) {
+ // SYS #4, C7, C8, #7
+ SYS_ALIAS(4, 7, 8, 7);
+ } else {
+ return TokError("invalid operand for AT instruction");
+ }
+ } else if (Mnemonic == "tlbi") {
+ if (!Op.compare_lower("vmalle1is")) {
+ // SYS #0, C8, C3, #0
+ SYS_ALIAS(0, 8, 3, 0);
+ } else if (!Op.compare_lower("alle2is")) {
+ // SYS #4, C8, C3, #0
+ SYS_ALIAS(4, 8, 3, 0);
+ } else if (!Op.compare_lower("alle3is")) {
+ // SYS #6, C8, C3, #0
+ SYS_ALIAS(6, 8, 3, 0);
+ } else if (!Op.compare_lower("vae1is")) {
+ // SYS #0, C8, C3, #1
+ SYS_ALIAS(0, 8, 3, 1);
+ } else if (!Op.compare_lower("vae2is")) {
+ // SYS #4, C8, C3, #1
+ SYS_ALIAS(4, 8, 3, 1);
+ } else if (!Op.compare_lower("vae3is")) {
+ // SYS #6, C8, C3, #1
+ SYS_ALIAS(6, 8, 3, 1);
+ } else if (!Op.compare_lower("aside1is")) {
+ // SYS #0, C8, C3, #2
+ SYS_ALIAS(0, 8, 3, 2);
+ } else if (!Op.compare_lower("vaae1is")) {
+ // SYS #0, C8, C3, #3
+ SYS_ALIAS(0, 8, 3, 3);
+ } else if (!Op.compare_lower("alle1is")) {
+ // SYS #4, C8, C3, #4
+ SYS_ALIAS(4, 8, 3, 4);
+ } else if (!Op.compare_lower("vale1is")) {
+ // SYS #0, C8, C3, #5
+ SYS_ALIAS(0, 8, 3, 5);
+ } else if (!Op.compare_lower("vaale1is")) {
+ // SYS #0, C8, C3, #7
+ SYS_ALIAS(0, 8, 3, 7);
+ } else if (!Op.compare_lower("vmalle1")) {
+ // SYS #0, C8, C7, #0
+ SYS_ALIAS(0, 8, 7, 0);
+ } else if (!Op.compare_lower("alle2")) {
+ // SYS #4, C8, C7, #0
+ SYS_ALIAS(4, 8, 7, 0);
+ } else if (!Op.compare_lower("vale2is")) {
+ // SYS #4, C8, C3, #5
+ SYS_ALIAS(4, 8, 3, 5);
+ } else if (!Op.compare_lower("vale3is")) {
+ // SYS #6, C8, C3, #5
+ SYS_ALIAS(6, 8, 3, 5);
+ } else if (!Op.compare_lower("alle3")) {
+ // SYS #6, C8, C7, #0
+ SYS_ALIAS(6, 8, 7, 0);
+ } else if (!Op.compare_lower("vae1")) {
+ // SYS #0, C8, C7, #1
+ SYS_ALIAS(0, 8, 7, 1);
+ } else if (!Op.compare_lower("vae2")) {
+ // SYS #4, C8, C7, #1
+ SYS_ALIAS(4, 8, 7, 1);
+ } else if (!Op.compare_lower("vae3")) {
+ // SYS #6, C8, C7, #1
+ SYS_ALIAS(6, 8, 7, 1);
+ } else if (!Op.compare_lower("aside1")) {
+ // SYS #0, C8, C7, #2
+ SYS_ALIAS(0, 8, 7, 2);
+ } else if (!Op.compare_lower("vaae1")) {
+ // SYS #0, C8, C7, #3
+ SYS_ALIAS(0, 8, 7, 3);
+ } else if (!Op.compare_lower("alle1")) {
+ // SYS #4, C8, C7, #4
+ SYS_ALIAS(4, 8, 7, 4);
+ } else if (!Op.compare_lower("vale1")) {
+ // SYS #0, C8, C7, #5
+ SYS_ALIAS(0, 8, 7, 5);
+ } else if (!Op.compare_lower("vale2")) {
+ // SYS #4, C8, C7, #5
+ SYS_ALIAS(4, 8, 7, 5);
+ } else if (!Op.compare_lower("vale3")) {
+ // SYS #6, C8, C7, #5
+ SYS_ALIAS(6, 8, 7, 5);
+ } else if (!Op.compare_lower("vaale1")) {
+ // SYS #0, C8, C7, #7
+ SYS_ALIAS(0, 8, 7, 7);
+ } else if (!Op.compare_lower("ipas2e1")) {
+ // SYS #4, C8, C4, #1
+ SYS_ALIAS(4, 8, 4, 1);
+ } else if (!Op.compare_lower("ipas2le1")) {
+ // SYS #4, C8, C4, #5
+ SYS_ALIAS(4, 8, 4, 5);
+ } else if (!Op.compare_lower("vmalls12e1")) {
+ // SYS #4, C8, C7, #6
+ SYS_ALIAS(4, 8, 7, 6);
+ } else if (!Op.compare_lower("vmalls12e1is")) {
+ // SYS #4, C8, C3, #6
+ SYS_ALIAS(4, 8, 3, 6);
+ } else {
+ return TokError("invalid operand for TLBI instruction");
+ }
+ }
+
+#undef SYS_ALIAS
+
+ Parser.Lex(); // Eat operand.
+
+ // Check for the optional register operand.
+ if (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat comma.
+
+ if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands))
+ return TokError("expected register operand");
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ Parser.eatToEndOfStatement();
+ return TokError("unexpected token in argument list");
+ }
+
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+}
+
+ARM64AsmParser::OperandMatchResultTy
+ARM64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
+ const AsmToken &Tok = Parser.getTok();
+
+ // Can be either a #imm style literal or an option name
+ if (Tok.is(AsmToken::Hash)) {
+ // Immediate operand.
+ Parser.Lex(); // Eat the '#'
+ const MCExpr *ImmVal;
+ SMLoc ExprLoc = getLoc();
+ if (getParser().parseExpression(ImmVal))
+ return MatchOperand_ParseFail;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ Error(ExprLoc, "immediate value expected for barrier operand");
+ return MatchOperand_ParseFail;
+ }
+ if (MCE->getValue() < 0 || MCE->getValue() > 15) {
+ Error(ExprLoc, "barrier operand out of range");
+ return MatchOperand_ParseFail;
+ }
+ Operands.push_back(
+ ARM64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext()));
+ return MatchOperand_Success;
+ }
+
+ if (Tok.isNot(AsmToken::Identifier)) {
+ TokError("invalid operand for instruction");
+ return MatchOperand_ParseFail;
+ }
+
+ unsigned Opt = StringSwitch<unsigned>(Tok.getString())
+ .Case("oshld", ARM64SYS::OSHLD)
+ .Case("oshst", ARM64SYS::OSHST)
+ .Case("osh", ARM64SYS::OSH)
+ .Case("nshld", ARM64SYS::NSHLD)
+ .Case("nshst", ARM64SYS::NSHST)
+ .Case("nsh", ARM64SYS::NSH)
+ .Case("ishld", ARM64SYS::ISHLD)
+ .Case("ishst", ARM64SYS::ISHST)
+ .Case("ish", ARM64SYS::ISH)
+ .Case("ld", ARM64SYS::LD)
+ .Case("st", ARM64SYS::ST)
+ .Case("sy", ARM64SYS::SY)
+ .Default(ARM64SYS::InvalidBarrier);
+ if (Opt == ARM64SYS::InvalidBarrier) {
+ TokError("invalid barrier option name");
+ return MatchOperand_ParseFail;
+ }
+
+ // The only valid named option for ISB is 'sy'
+ if (Mnemonic == "isb" && Opt != ARM64SYS::SY) {
+ TokError("'sy' or #imm operand expected");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(ARM64Operand::CreateBarrier(Opt, getLoc(), getContext()));
+ Parser.Lex(); // Consume the option
+
+ return MatchOperand_Success;
+}
+
+ARM64AsmParser::OperandMatchResultTy
+ARM64AsmParser::tryParseSystemRegister(OperandVector &Operands) {
+ const AsmToken &Tok = Parser.getTok();
+
+ // It can be specified as a symbolic name.
+ if (Tok.isNot(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
+
+ auto ID = Tok.getString().lower();
+ ARM64SYS::SystemRegister Reg =
+ StringSwitch<ARM64SYS::SystemRegister>(ID)
+ .Case("spsr_el1", ARM64SYS::SPSR_svc)
+ .Case("spsr_svc", ARM64SYS::SPSR_svc)
+ .Case("elr_el1", ARM64SYS::ELR_EL1)
+ .Case("sp_el0", ARM64SYS::SP_EL0)
+ .Case("spsel", ARM64SYS::SPSel)
+ .Case("daif", ARM64SYS::DAIF)
+ .Case("currentel", ARM64SYS::CurrentEL)
+ .Case("nzcv", ARM64SYS::NZCV)
+ .Case("fpcr", ARM64SYS::FPCR)
+ .Case("fpsr", ARM64SYS::FPSR)
+ .Case("dspsr", ARM64SYS::DSPSR)
+ .Case("dlr", ARM64SYS::DLR)
+ .Case("spsr_el2", ARM64SYS::SPSR_hyp)
+ .Case("spsr_hyp", ARM64SYS::SPSR_hyp)
+ .Case("elr_el2", ARM64SYS::ELR_EL2)
+ .Case("sp_el1", ARM64SYS::SP_EL1)
+ .Case("spsr_irq", ARM64SYS::SPSR_irq)
+ .Case("spsr_abt", ARM64SYS::SPSR_abt)
+ .Case("spsr_und", ARM64SYS::SPSR_und)
+ .Case("spsr_fiq", ARM64SYS::SPSR_fiq)
+ .Case("spsr_el3", ARM64SYS::SPSR_EL3)
+ .Case("elr_el3", ARM64SYS::ELR_EL3)
+ .Case("sp_el2", ARM64SYS::SP_EL2)
+ .Case("midr_el1", ARM64SYS::MIDR_EL1)
+ .Case("ctr_el0", ARM64SYS::CTR_EL0)
+ .Case("mpidr_el1", ARM64SYS::MPIDR_EL1)
+ .Case("ecoidr_el1", ARM64SYS::ECOIDR_EL1)
+ .Case("dczid_el0", ARM64SYS::DCZID_EL0)
+ .Case("mvfr0_el1", ARM64SYS::MVFR0_EL1)
+ .Case("mvfr1_el1", ARM64SYS::MVFR1_EL1)
+ .Case("id_aa64pfr0_el1", ARM64SYS::ID_AA64PFR0_EL1)
+ .Case("id_aa64pfr1_el1", ARM64SYS::ID_AA64PFR1_EL1)
+ .Case("id_aa64dfr0_el1", ARM64SYS::ID_AA64DFR0_EL1)
+ .Case("id_aa64dfr1_el1", ARM64SYS::ID_AA64DFR1_EL1)
+ .Case("id_aa64isar0_el1", ARM64SYS::ID_AA64ISAR0_EL1)
+ .Case("id_aa64isar1_el1", ARM64SYS::ID_AA64ISAR1_EL1)
+ .Case("id_aa64mmfr0_el1", ARM64SYS::ID_AA64MMFR0_EL1)
+ .Case("id_aa64mmfr1_el1", ARM64SYS::ID_AA64MMFR1_EL1)
+ .Case("ccsidr_el1", ARM64SYS::CCSIDR_EL1)
+ .Case("clidr_el1", ARM64SYS::CLIDR_EL1)
+ .Case("aidr_el1", ARM64SYS::AIDR_EL1)
+ .Case("csselr_el1", ARM64SYS::CSSELR_EL1)
+ .Case("vpidr_el2", ARM64SYS::VPIDR_EL2)
+ .Case("vmpidr_el2", ARM64SYS::VMPIDR_EL2)
+ .Case("sctlr_el1", ARM64SYS::SCTLR_EL1)
+ .Case("sctlr_el2", ARM64SYS::SCTLR_EL2)
+ .Case("sctlr_el3", ARM64SYS::SCTLR_EL3)
+ .Case("actlr_el1", ARM64SYS::ACTLR_EL1)
+ .Case("actlr_el2", ARM64SYS::ACTLR_EL2)
+ .Case("actlr_el3", ARM64SYS::ACTLR_EL3)
+ .Case("cpacr_el1", ARM64SYS::CPACR_EL1)
+ .Case("cptr_el2", ARM64SYS::CPTR_EL2)
+ .Case("cptr_el3", ARM64SYS::CPTR_EL3)
+ .Case("scr_el3", ARM64SYS::SCR_EL3)
+ .Case("hcr_el2", ARM64SYS::HCR_EL2)
+ .Case("mdcr_el2", ARM64SYS::MDCR_EL2)
+ .Case("mdcr_el3", ARM64SYS::MDCR_EL3)
+ .Case("hstr_el2", ARM64SYS::HSTR_EL2)
+ .Case("hacr_el2", ARM64SYS::HACR_EL2)
+ .Case("ttbr0_el1", ARM64SYS::TTBR0_EL1)
+ .Case("ttbr1_el1", ARM64SYS::TTBR1_EL1)
+ .Case("ttbr0_el2", ARM64SYS::TTBR0_EL2)
+ .Case("ttbr0_el3", ARM64SYS::TTBR0_EL3)
+ .Case("vttbr_el2", ARM64SYS::VTTBR_EL2)
+ .Case("tcr_el1", ARM64SYS::TCR_EL1)
+ .Case("tcr_el2", ARM64SYS::TCR_EL2)
+ .Case("tcr_el3", ARM64SYS::TCR_EL3)
+ .Case("vtcr_el2", ARM64SYS::VTCR_EL2)
+ .Case("adfsr_el1", ARM64SYS::ADFSR_EL1)
+ .Case("aifsr_el1", ARM64SYS::AIFSR_EL1)
+ .Case("adfsr_el2", ARM64SYS::ADFSR_EL2)
+ .Case("aifsr_el2", ARM64SYS::AIFSR_EL2)
+ .Case("adfsr_el3", ARM64SYS::ADFSR_EL3)
+ .Case("aifsr_el3", ARM64SYS::AIFSR_EL3)
+ .Case("esr_el1", ARM64SYS::ESR_EL1)
+ .Case("esr_el2", ARM64SYS::ESR_EL2)
+ .Case("esr_el3", ARM64SYS::ESR_EL3)
+ .Case("far_el1", ARM64SYS::FAR_EL1)
+ .Case("far_el2", ARM64SYS::FAR_EL2)
+ .Case("far_el3", ARM64SYS::FAR_EL3)
+ .Case("hpfar_el2", ARM64SYS::HPFAR_EL2)
+ .Case("par_el1", ARM64SYS::PAR_EL1)
+ .Case("mair_el1", ARM64SYS::MAIR_EL1)
+ .Case("mair_el2", ARM64SYS::MAIR_EL2)
+ .Case("mair_el3", ARM64SYS::MAIR_EL3)
+ .Case("amair_el1", ARM64SYS::AMAIR_EL1)
+ .Case("amair_el2", ARM64SYS::AMAIR_EL2)
+ .Case("amair_el3", ARM64SYS::AMAIR_EL3)
+ .Case("vbar_el1", ARM64SYS::VBAR_EL1)
+ .Case("vbar_el2", ARM64SYS::VBAR_EL2)
+ .Case("vbar_el3", ARM64SYS::VBAR_EL3)
+ .Case("rvbar_el1", ARM64SYS::RVBAR_EL1)
+ .Case("rvbar_el2", ARM64SYS::RVBAR_EL2)
+ .Case("rvbar_el3", ARM64SYS::RVBAR_EL3)
+ .Case("isr_el1", ARM64SYS::ISR_EL1)
+ .Case("contextidr_el1", ARM64SYS::CONTEXTIDR_EL1)
+ .Case("tpidr_el0", ARM64SYS::TPIDR_EL0)
+ .Case("tpidrro_el0", ARM64SYS::TPIDRRO_EL0)
+ .Case("tpidr_el1", ARM64SYS::TPIDR_EL1)
+ .Case("tpidr_el2", ARM64SYS::TPIDR_EL2)
+ .Case("tpidr_el3", ARM64SYS::TPIDR_EL3)
+ .Case("teecr32_el1", ARM64SYS::TEECR32_EL1)
+ .Case("cntfrq_el0", ARM64SYS::CNTFRQ_EL0)
+ .Case("cntpct_el0", ARM64SYS::CNTPCT_EL0)
+ .Case("cntvct_el0", ARM64SYS::CNTVCT_EL0)
+ .Case("cntvoff_el2", ARM64SYS::CNTVOFF_EL2)
+ .Case("cntkctl_el1", ARM64SYS::CNTKCTL_EL1)
+ .Case("cnthctl_el2", ARM64SYS::CNTHCTL_EL2)
+ .Case("cntp_tval_el0", ARM64SYS::CNTP_TVAL_EL0)
+ .Case("cntp_ctl_el0", ARM64SYS::CNTP_CTL_EL0)
+ .Case("cntp_cval_el0", ARM64SYS::CNTP_CVAL_EL0)
+ .Case("cntv_tval_el0", ARM64SYS::CNTV_TVAL_EL0)
+ .Case("cntv_ctl_el0", ARM64SYS::CNTV_CTL_EL0)
+ .Case("cntv_cval_el0", ARM64SYS::CNTV_CVAL_EL0)
+ .Case("cnthp_tval_el2", ARM64SYS::CNTHP_TVAL_EL2)
+ .Case("cnthp_ctl_el2", ARM64SYS::CNTHP_CTL_EL2)
+ .Case("cnthp_cval_el2", ARM64SYS::CNTHP_CVAL_EL2)
+ .Case("cntps_tval_el1", ARM64SYS::CNTPS_TVAL_EL1)
+ .Case("cntps_ctl_el1", ARM64SYS::CNTPS_CTL_EL1)
+ .Case("cntps_cval_el1", ARM64SYS::CNTPS_CVAL_EL1)
+ .Case("dacr32_el2", ARM64SYS::DACR32_EL2)
+ .Case("ifsr32_el2", ARM64SYS::IFSR32_EL2)
+ .Case("teehbr32_el1", ARM64SYS::TEEHBR32_EL1)
+ .Case("sder32_el3", ARM64SYS::SDER32_EL3)
+ .Case("fpexc32_el2", ARM64SYS::FPEXC32_EL2)
+ .Case("current_el", ARM64SYS::CurrentEL)
+ .Case("pmevcntr0_el0", ARM64SYS::PMEVCNTR0_EL0)
+ .Case("pmevcntr1_el0", ARM64SYS::PMEVCNTR1_EL0)
+ .Case("pmevcntr2_el0", ARM64SYS::PMEVCNTR2_EL0)
+ .Case("pmevcntr3_el0", ARM64SYS::PMEVCNTR3_EL0)
+ .Case("pmevcntr4_el0", ARM64SYS::PMEVCNTR4_EL0)
+ .Case("pmevcntr5_el0", ARM64SYS::PMEVCNTR5_EL0)
+ .Case("pmevcntr6_el0", ARM64SYS::PMEVCNTR6_EL0)
+ .Case("pmevcntr7_el0", ARM64SYS::PMEVCNTR7_EL0)
+ .Case("pmevcntr8_el0", ARM64SYS::PMEVCNTR8_EL0)
+ .Case("pmevcntr9_el0", ARM64SYS::PMEVCNTR9_EL0)
+ .Case("pmevcntr10_el0", ARM64SYS::PMEVCNTR10_EL0)
+ .Case("pmevcntr11_el0", ARM64SYS::PMEVCNTR11_EL0)
+ .Case("pmevcntr12_el0", ARM64SYS::PMEVCNTR12_EL0)
+ .Case("pmevcntr13_el0", ARM64SYS::PMEVCNTR13_EL0)
+ .Case("pmevcntr14_el0", ARM64SYS::PMEVCNTR14_EL0)
+ .Case("pmevcntr15_el0", ARM64SYS::PMEVCNTR15_EL0)
+ .Case("pmevcntr16_el0", ARM64SYS::PMEVCNTR16_EL0)
+ .Case("pmevcntr17_el0", ARM64SYS::PMEVCNTR17_EL0)
+ .Case("pmevcntr18_el0", ARM64SYS::PMEVCNTR18_EL0)
+ .Case("pmevcntr19_el0", ARM64SYS::PMEVCNTR19_EL0)
+ .Case("pmevcntr20_el0", ARM64SYS::PMEVCNTR20_EL0)
+ .Case("pmevcntr21_el0", ARM64SYS::PMEVCNTR21_EL0)
+ .Case("pmevcntr22_el0", ARM64SYS::PMEVCNTR22_EL0)
+ .Case("pmevcntr23_el0", ARM64SYS::PMEVCNTR23_EL0)
+ .Case("pmevcntr24_el0", ARM64SYS::PMEVCNTR24_EL0)
+ .Case("pmevcntr25_el0", ARM64SYS::PMEVCNTR25_EL0)
+ .Case("pmevcntr26_el0", ARM64SYS::PMEVCNTR26_EL0)
+ .Case("pmevcntr27_el0", ARM64SYS::PMEVCNTR27_EL0)
+ .Case("pmevcntr28_el0", ARM64SYS::PMEVCNTR28_EL0)
+ .Case("pmevcntr29_el0", ARM64SYS::PMEVCNTR29_EL0)
+ .Case("pmevcntr30_el0", ARM64SYS::PMEVCNTR30_EL0)
+ .Case("pmevtyper0_el0", ARM64SYS::PMEVTYPER0_EL0)
+ .Case("pmevtyper1_el0", ARM64SYS::PMEVTYPER1_EL0)
+ .Case("pmevtyper2_el0", ARM64SYS::PMEVTYPER2_EL0)
+ .Case("pmevtyper3_el0", ARM64SYS::PMEVTYPER3_EL0)
+ .Case("pmevtyper4_el0", ARM64SYS::PMEVTYPER4_EL0)
+ .Case("pmevtyper5_el0", ARM64SYS::PMEVTYPER5_EL0)
+ .Case("pmevtyper6_el0", ARM64SYS::PMEVTYPER6_EL0)
+ .Case("pmevtyper7_el0", ARM64SYS::PMEVTYPER7_EL0)
+ .Case("pmevtyper8_el0", ARM64SYS::PMEVTYPER8_EL0)
+ .Case("pmevtyper9_el0", ARM64SYS::PMEVTYPER9_EL0)
+ .Case("pmevtyper10_el0", ARM64SYS::PMEVTYPER10_EL0)
+ .Case("pmevtyper11_el0", ARM64SYS::PMEVTYPER11_EL0)
+ .Case("pmevtyper12_el0", ARM64SYS::PMEVTYPER12_EL0)
+ .Case("pmevtyper13_el0", ARM64SYS::PMEVTYPER13_EL0)
+ .Case("pmevtyper14_el0", ARM64SYS::PMEVTYPER14_EL0)
+ .Case("pmevtyper15_el0", ARM64SYS::PMEVTYPER15_EL0)
+ .Case("pmevtyper16_el0", ARM64SYS::PMEVTYPER16_EL0)
+ .Case("pmevtyper17_el0", ARM64SYS::PMEVTYPER17_EL0)
+ .Case("pmevtyper18_el0", ARM64SYS::PMEVTYPER18_EL0)
+ .Case("pmevtyper19_el0", ARM64SYS::PMEVTYPER19_EL0)
+ .Case("pmevtyper20_el0", ARM64SYS::PMEVTYPER20_EL0)
+ .Case("pmevtyper21_el0", ARM64SYS::PMEVTYPER21_EL0)
+ .Case("pmevtyper22_el0", ARM64SYS::PMEVTYPER22_EL0)
+ .Case("pmevtyper23_el0", ARM64SYS::PMEVTYPER23_EL0)
+ .Case("pmevtyper24_el0", ARM64SYS::PMEVTYPER24_EL0)
+ .Case("pmevtyper25_el0", ARM64SYS::PMEVTYPER25_EL0)
+ .Case("pmevtyper26_el0", ARM64SYS::PMEVTYPER26_EL0)
+ .Case("pmevtyper27_el0", ARM64SYS::PMEVTYPER27_EL0)
+ .Case("pmevtyper28_el0", ARM64SYS::PMEVTYPER28_EL0)
+ .Case("pmevtyper29_el0", ARM64SYS::PMEVTYPER29_EL0)
+ .Case("pmevtyper30_el0", ARM64SYS::PMEVTYPER30_EL0)
+ .Case("pmccfiltr_el0", ARM64SYS::PMCCFILTR_EL0)
+ .Case("rmr_el3", ARM64SYS::RMR_EL3)
+ .Case("rmr_el2", ARM64SYS::RMR_EL2)
+ .Case("rmr_el1", ARM64SYS::RMR_EL1)
+ .Case("cpm_ioacc_ctl_el3", ARM64SYS::CPM_IOACC_CTL_EL3)
+ .Case("mdccsr_el0", ARM64SYS::MDCCSR_EL0)
+ .Case("mdccint_el1", ARM64SYS::MDCCINT_EL1)
+ .Case("dbgdtr_el0", ARM64SYS::DBGDTR_EL0)
+ .Case("dbgdtrrx_el0", ARM64SYS::DBGDTRRX_EL0)
+ .Case("dbgdtrtx_el0", ARM64SYS::DBGDTRTX_EL0)
+ .Case("dbgvcr32_el2", ARM64SYS::DBGVCR32_EL2)
+ .Case("osdtrrx_el1", ARM64SYS::OSDTRRX_EL1)
+ .Case("mdscr_el1", ARM64SYS::MDSCR_EL1)
+ .Case("osdtrtx_el1", ARM64SYS::OSDTRTX_EL1)
+ .Case("oseccr_el11", ARM64SYS::OSECCR_EL11)
+ .Case("dbgbvr0_el1", ARM64SYS::DBGBVR0_EL1)
+ .Case("dbgbvr1_el1", ARM64SYS::DBGBVR1_EL1)
+ .Case("dbgbvr2_el1", ARM64SYS::DBGBVR2_EL1)
+ .Case("dbgbvr3_el1", ARM64SYS::DBGBVR3_EL1)
+ .Case("dbgbvr4_el1", ARM64SYS::DBGBVR4_EL1)
+ .Case("dbgbvr5_el1", ARM64SYS::DBGBVR5_EL1)
+ .Case("dbgbvr6_el1", ARM64SYS::DBGBVR6_EL1)
+ .Case("dbgbvr7_el1", ARM64SYS::DBGBVR7_EL1)
+ .Case("dbgbvr8_el1", ARM64SYS::DBGBVR8_EL1)
+ .Case("dbgbvr9_el1", ARM64SYS::DBGBVR9_EL1)
+ .Case("dbgbvr10_el1", ARM64SYS::DBGBVR10_EL1)
+ .Case("dbgbvr11_el1", ARM64SYS::DBGBVR11_EL1)
+ .Case("dbgbvr12_el1", ARM64SYS::DBGBVR12_EL1)
+ .Case("dbgbvr13_el1", ARM64SYS::DBGBVR13_EL1)
+ .Case("dbgbvr14_el1", ARM64SYS::DBGBVR14_EL1)
+ .Case("dbgbvr15_el1", ARM64SYS::DBGBVR15_EL1)
+ .Case("dbgbcr0_el1", ARM64SYS::DBGBCR0_EL1)
+ .Case("dbgbcr1_el1", ARM64SYS::DBGBCR1_EL1)
+ .Case("dbgbcr2_el1", ARM64SYS::DBGBCR2_EL1)
+ .Case("dbgbcr3_el1", ARM64SYS::DBGBCR3_EL1)
+ .Case("dbgbcr4_el1", ARM64SYS::DBGBCR4_EL1)
+ .Case("dbgbcr5_el1", ARM64SYS::DBGBCR5_EL1)
+ .Case("dbgbcr6_el1", ARM64SYS::DBGBCR6_EL1)
+ .Case("dbgbcr7_el1", ARM64SYS::DBGBCR7_EL1)
+ .Case("dbgbcr8_el1", ARM64SYS::DBGBCR8_EL1)
+ .Case("dbgbcr9_el1", ARM64SYS::DBGBCR9_EL1)
+ .Case("dbgbcr10_el1", ARM64SYS::DBGBCR10_EL1)
+ .Case("dbgbcr11_el1", ARM64SYS::DBGBCR11_EL1)
+ .Case("dbgbcr12_el1", ARM64SYS::DBGBCR12_EL1)
+ .Case("dbgbcr13_el1", ARM64SYS::DBGBCR13_EL1)
+ .Case("dbgbcr14_el1", ARM64SYS::DBGBCR14_EL1)
+ .Case("dbgbcr15_el1", ARM64SYS::DBGBCR15_EL1)
+ .Case("dbgwvr0_el1", ARM64SYS::DBGWVR0_EL1)
+ .Case("dbgwvr1_el1", ARM64SYS::DBGWVR1_EL1)
+ .Case("dbgwvr2_el1", ARM64SYS::DBGWVR2_EL1)
+ .Case("dbgwvr3_el1", ARM64SYS::DBGWVR3_EL1)
+ .Case("dbgwvr4_el1", ARM64SYS::DBGWVR4_EL1)
+ .Case("dbgwvr5_el1", ARM64SYS::DBGWVR5_EL1)
+ .Case("dbgwvr6_el1", ARM64SYS::DBGWVR6_EL1)
+ .Case("dbgwvr7_el1", ARM64SYS::DBGWVR7_EL1)
+ .Case("dbgwvr8_el1", ARM64SYS::DBGWVR8_EL1)
+ .Case("dbgwvr9_el1", ARM64SYS::DBGWVR9_EL1)
+ .Case("dbgwvr10_el1", ARM64SYS::DBGWVR10_EL1)
+ .Case("dbgwvr11_el1", ARM64SYS::DBGWVR11_EL1)
+ .Case("dbgwvr12_el1", ARM64SYS::DBGWVR12_EL1)
+ .Case("dbgwvr13_el1", ARM64SYS::DBGWVR13_EL1)
+ .Case("dbgwvr14_el1", ARM64SYS::DBGWVR14_EL1)
+ .Case("dbgwvr15_el1", ARM64SYS::DBGWVR15_EL1)
+ .Case("dbgwcr0_el1", ARM64SYS::DBGWCR0_EL1)
+ .Case("dbgwcr1_el1", ARM64SYS::DBGWCR1_EL1)
+ .Case("dbgwcr2_el1", ARM64SYS::DBGWCR2_EL1)
+ .Case("dbgwcr3_el1", ARM64SYS::DBGWCR3_EL1)
+ .Case("dbgwcr4_el1", ARM64SYS::DBGWCR4_EL1)
+ .Case("dbgwcr5_el1", ARM64SYS::DBGWCR5_EL1)
+ .Case("dbgwcr6_el1", ARM64SYS::DBGWCR6_EL1)
+ .Case("dbgwcr7_el1", ARM64SYS::DBGWCR7_EL1)
+ .Case("dbgwcr8_el1", ARM64SYS::DBGWCR8_EL1)
+ .Case("dbgwcr9_el1", ARM64SYS::DBGWCR9_EL1)
+ .Case("dbgwcr10_el1", ARM64SYS::DBGWCR10_EL1)
+ .Case("dbgwcr11_el1", ARM64SYS::DBGWCR11_EL1)
+ .Case("dbgwcr12_el1", ARM64SYS::DBGWCR12_EL1)
+ .Case("dbgwcr13_el1", ARM64SYS::DBGWCR13_EL1)
+ .Case("dbgwcr14_el1", ARM64SYS::DBGWCR14_EL1)
+ .Case("dbgwcr15_el1", ARM64SYS::DBGWCR15_EL1)
+ .Case("mdrar_el1", ARM64SYS::MDRAR_EL1)
+ .Case("oslar_el1", ARM64SYS::OSLAR_EL1)
+ .Case("oslsr_el1", ARM64SYS::OSLSR_EL1)
+ .Case("osdlr_el1", ARM64SYS::OSDLR_EL1)
+ .Case("dbgprcr_el1", ARM64SYS::DBGPRCR_EL1)
+ .Case("dbgclaimset_el1", ARM64SYS::DBGCLAIMSET_EL1)
+ .Case("dbgclaimclr_el1", ARM64SYS::DBGCLAIMCLR_EL1)
+ .Case("dbgauthstatus_el1", ARM64SYS::DBGAUTHSTATUS_EL1)
+ .Case("dbgdevid2", ARM64SYS::DBGDEVID2)
+ .Case("dbgdevid1", ARM64SYS::DBGDEVID1)
+ .Case("dbgdevid0", ARM64SYS::DBGDEVID0)
+ .Case("id_pfr0_el1", ARM64SYS::ID_PFR0_EL1)
+ .Case("id_pfr1_el1", ARM64SYS::ID_PFR1_EL1)
+ .Case("id_dfr0_el1", ARM64SYS::ID_DFR0_EL1)
+ .Case("id_afr0_el1", ARM64SYS::ID_AFR0_EL1)
+ .Case("id_isar0_el1", ARM64SYS::ID_ISAR0_EL1)
+ .Case("id_isar1_el1", ARM64SYS::ID_ISAR1_EL1)
+ .Case("id_isar2_el1", ARM64SYS::ID_ISAR2_EL1)
+ .Case("id_isar3_el1", ARM64SYS::ID_ISAR3_EL1)
+ .Case("id_isar4_el1", ARM64SYS::ID_ISAR4_EL1)
+ .Case("id_isar5_el1", ARM64SYS::ID_ISAR5_EL1)
+ .Case("afsr1_el1", ARM64SYS::AFSR1_EL1)
+ .Case("afsr0_el1", ARM64SYS::AFSR0_EL1)
+ .Case("revidr_el1", ARM64SYS::REVIDR_EL1)
+ .Default(ARM64SYS::InvalidSystemReg);
+ if (Reg != ARM64SYS::InvalidSystemReg) {
+ // We matched a reg name, so create the operand.
+ Operands.push_back(
+ ARM64Operand::CreateSystemRegister(Reg, getLoc(), getContext()));
+ Parser.Lex(); // Consume the register name.
+ return MatchOperand_Success;
+ }
+
+ // Or we may have an identifier that encodes the sub-operands.
+ // For example, s3_2_c15_c0_0.
+ unsigned op0, op1, CRn, CRm, op2;
+ std::string Desc = ID;
+ if (std::sscanf(Desc.c_str(), "s%u_%u_c%u_c%u_%u", &op0, &op1, &CRn, &CRm,
+ &op2) != 5)
+ return MatchOperand_NoMatch;
+ if ((op0 != 2 && op0 != 3) || op1 > 7 || CRn > 15 || CRm > 15 || op2 > 7)
+ return MatchOperand_NoMatch;
+
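+  // Pack op0:op1:CRn:CRm:op2 into a single immediate: op0 in bits [15:14],
+  // op1 in bits [13:11], CRn in bits [10:7], CRm in bits [6:3], op2 in [2:0].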
+ unsigned Val = op0 << 14 | op1 << 11 | CRn << 7 | CRm << 3 | op2;
+ Operands.push_back(
+ ARM64Operand::CreateSystemRegister(Val, getLoc(), getContext()));
+ Parser.Lex(); // Consume the register name.
+
+ return MatchOperand_Success;
+}
+
+ARM64AsmParser::OperandMatchResultTy
+ARM64AsmParser::tryParseCPSRField(OperandVector &Operands) {
+ const AsmToken &Tok = Parser.getTok();
+
+ if (Tok.isNot(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
+
+ ARM64SYS::CPSRField Field =
+ StringSwitch<ARM64SYS::CPSRField>(Tok.getString().lower())
+ .Case("spsel", ARM64SYS::cpsr_SPSel)
+ .Case("daifset", ARM64SYS::cpsr_DAIFSet)
+ .Case("daifclr", ARM64SYS::cpsr_DAIFClr)
+ .Default(ARM64SYS::InvalidCPSRField);
+ if (Field == ARM64SYS::InvalidCPSRField)
+ return MatchOperand_NoMatch;
+ Operands.push_back(
+ ARM64Operand::CreateCPSRField(Field, getLoc(), getContext()));
+ Parser.Lex(); // Consume the register name.
+
+ return MatchOperand_Success;
+}
+
+/// tryParseVectorRegister - Parse a vector register operand.
+bool ARM64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
+ if (Parser.getTok().isNot(AsmToken::Identifier))
+ return true;
+
+ SMLoc S = getLoc();
+ // Check for a vector register specifier first.
+ StringRef Kind;
+ int64_t Reg = tryMatchVectorRegister(Kind);
+ if (Reg == -1)
+ return true;
+ Operands.push_back(
+ ARM64Operand::CreateReg(Reg, true, S, getLoc(), getContext()));
+ // If there was an explicit qualifier, that goes on as a literal text
+ // operand.
+ if (!Kind.empty())
+ Operands.push_back(ARM64Operand::CreateToken(Kind, false, S, getContext()));
+
+ // If there is an index specifier following the register, parse that too.
+ if (Parser.getTok().is(AsmToken::LBrac)) {
+ SMLoc SIdx = getLoc();
+ Parser.Lex(); // Eat left bracket token.
+
+ const MCExpr *ImmVal;
+ if (getParser().parseExpression(ImmVal))
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ TokError("immediate value expected for vector index");
+ return false;
+ }
+
+ SMLoc E = getLoc();
+ if (Parser.getTok().isNot(AsmToken::RBrac)) {
+ Error(E, "']' expected");
+ return false;
+ }
+
+ Parser.Lex(); // Eat right bracket token.
+
+ Operands.push_back(ARM64Operand::CreateVectorIndex(MCE->getValue(), SIdx, E,
+ getContext()));
+ }
+
+ return false;
+}
+
+/// parseRegister - Parse a non-vector register operand.
+bool ARM64AsmParser::parseRegister(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ // Try for a vector register.
+ if (!tryParseVectorRegister(Operands))
+ return false;
+
+ // Try for a scalar register.
+ int64_t Reg = tryParseRegister();
+ if (Reg == -1)
+ return true;
+ Operands.push_back(
+ ARM64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
+
+ // A small number of instructions (FMOVXDhighr, for example) have "[1]"
+ // as a string token in the instruction itself.
+ if (getLexer().getKind() == AsmToken::LBrac) {
+ SMLoc LBracS = getLoc();
+ Parser.Lex();
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.is(AsmToken::Integer)) {
+ SMLoc IntS = getLoc();
+ int64_t Val = Tok.getIntVal();
+ if (Val == 1) {
+ Parser.Lex();
+ if (getLexer().getKind() == AsmToken::RBrac) {
+ SMLoc RBracS = getLoc();
+ Parser.Lex();
+ Operands.push_back(
+ ARM64Operand::CreateToken("[", false, LBracS, getContext()));
+ Operands.push_back(
+ ARM64Operand::CreateToken("1", false, IntS, getContext()));
+ Operands.push_back(
+ ARM64Operand::CreateToken("]", false, RBracS, getContext()));
+ return false;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/// tryParseNoIndexMemory - Custom parser method for memory operands that
+///                         do not allow base register writeback modes,
+///                         or those that handle writeback separately from
+///                         the memory operand (like the AdvSIMD ldX/stX
+///                         instructions).
+ARM64AsmParser::OperandMatchResultTy
+ARM64AsmParser::tryParseNoIndexMemory(OperandVector &Operands) {
+ if (Parser.getTok().isNot(AsmToken::LBrac))
+ return MatchOperand_NoMatch;
+ SMLoc S = getLoc();
+ Parser.Lex(); // Eat left bracket token.
+
+ const AsmToken &BaseRegTok = Parser.getTok();
+ if (BaseRegTok.isNot(AsmToken::Identifier)) {
+ Error(BaseRegTok.getLoc(), "register expected");
+ return MatchOperand_ParseFail;
+ }
+
+ int64_t Reg = tryParseRegister();
+ if (Reg == -1) {
+ Error(BaseRegTok.getLoc(), "register expected");
+ return MatchOperand_ParseFail;
+ }
+
+ SMLoc E = getLoc();
+ if (Parser.getTok().isNot(AsmToken::RBrac)) {
+ Error(E, "']' expected");
+ return MatchOperand_ParseFail;
+ }
+
+ Parser.Lex(); // Eat right bracket token.
+
+ Operands.push_back(ARM64Operand::CreateMem(Reg, 0, S, E, E, getContext()));
+ return MatchOperand_Success;
+}
+
+/// parseMemory - Parse a memory operand for a basic load/store instruction.
+bool ARM64AsmParser::parseMemory(OperandVector &Operands) {
+ assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a Left Bracket");
+ SMLoc S = getLoc();
+ Parser.Lex(); // Eat left bracket token.
+
+ const AsmToken &BaseRegTok = Parser.getTok();
+ if (BaseRegTok.isNot(AsmToken::Identifier))
+ return Error(BaseRegTok.getLoc(), "register expected");
+
+ int64_t Reg = tryParseRegister();
+ if (Reg == -1)
+ return Error(BaseRegTok.getLoc(), "register expected");
+
+ // If there is an offset expression, parse it.
+ const MCExpr *OffsetExpr = 0;
+ SMLoc OffsetLoc;
+ if (Parser.getTok().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma.
+ OffsetLoc = getLoc();
+
+ // Register offset
+ const AsmToken &OffsetRegTok = Parser.getTok();
+ int Reg2 = OffsetRegTok.is(AsmToken::Identifier) ? tryParseRegister() : -1;
+ if (Reg2 != -1) {
+      // The default is an LSL extend with the shift amount omitted. We use
+      // the third bit of the extend value to indicate presence/omission of
+      // the immediate offset.
+ ARM64_AM::ExtendType ExtOp = ARM64_AM::UXTX;
+ int64_t ShiftVal = 0;
+ bool ExplicitShift = false;
+
+ if (Parser.getTok().is(AsmToken::Comma)) {
+ // Embedded extend operand.
+ Parser.Lex(); // Eat the comma
+
+ SMLoc ExtLoc = getLoc();
+ const AsmToken &Tok = Parser.getTok();
+ ExtOp = StringSwitch<ARM64_AM::ExtendType>(Tok.getString())
+ .Case("uxtw", ARM64_AM::UXTW)
+ .Case("lsl", ARM64_AM::UXTX) // Alias for UXTX
+ .Case("sxtw", ARM64_AM::SXTW)
+ .Case("sxtx", ARM64_AM::SXTX)
+ .Case("UXTW", ARM64_AM::UXTW)
+ .Case("LSL", ARM64_AM::UXTX) // Alias for UXTX
+ .Case("SXTW", ARM64_AM::SXTW)
+ .Case("SXTX", ARM64_AM::SXTX)
+ .Default(ARM64_AM::InvalidExtend);
+ if (ExtOp == ARM64_AM::InvalidExtend)
+ return Error(ExtLoc, "expected valid extend operation");
+
+ Parser.Lex(); // Eat the extend op.
+
+ if (getLexer().is(AsmToken::RBrac)) {
+ // No immediate operand.
+ if (ExtOp == ARM64_AM::UXTX)
+ return Error(ExtLoc, "LSL extend requires immediate operand");
+ } else if (getLexer().is(AsmToken::Hash)) {
+ // Immediate operand.
+ Parser.Lex(); // Eat the '#'
+ const MCExpr *ImmVal;
+ SMLoc ExprLoc = getLoc();
+ if (getParser().parseExpression(ImmVal))
+ return true;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE)
+ return TokError("immediate value expected for extend operand");
+
+ ExplicitShift = true;
+ ShiftVal = MCE->getValue();
+ if (ShiftVal < 0 || ShiftVal > 4)
+ return Error(ExprLoc, "immediate operand out of range");
+ } else
+ return Error(getLoc(), "expected immediate operand");
+ }
+
+ if (Parser.getTok().isNot(AsmToken::RBrac))
+ return Error(getLoc(), "']' expected");
+
+ Parser.Lex(); // Eat right bracket token.
+
+ SMLoc E = getLoc();
+ Operands.push_back(ARM64Operand::CreateRegOffsetMem(
+ Reg, Reg2, ExtOp, ShiftVal, ExplicitShift, S, E, getContext()));
+ return false;
+
+ // Immediate expressions.
+ } else if (Parser.getTok().is(AsmToken::Hash)) {
+ Parser.Lex(); // Eat hash token.
+
+ if (parseSymbolicImmVal(OffsetExpr))
+ return true;
+ } else {
+ // FIXME: We really should make sure that we're dealing with a LDR/STR
+ // instruction that can legally have a symbolic expression here.
+ // Symbol reference.
+ if (Parser.getTok().isNot(AsmToken::Identifier) &&
+ Parser.getTok().isNot(AsmToken::String))
+ return Error(getLoc(), "identifier or immediate expression expected");
+ if (getParser().parseExpression(OffsetExpr))
+ return true;
+      // If this is a plain ref, make sure a legal variant kind was specified.
+ // Otherwise, it's a more complicated expression and we have to just
+ // assume it's OK and let the relocation stuff puke if it's not.
+ ARM64MCExpr::VariantKind ELFRefKind;
+ MCSymbolRefExpr::VariantKind DarwinRefKind;
+ const MCConstantExpr *Addend;
+ if (classifySymbolRef(OffsetExpr, ELFRefKind, DarwinRefKind, Addend) &&
+ Addend == 0) {
+ assert(ELFRefKind == ARM64MCExpr::VK_INVALID &&
+ "ELF symbol modifiers not supported here yet");
+
+ switch (DarwinRefKind) {
+ default:
+ return Error(getLoc(), "expected @pageoff or @gotpageoff modifier");
+ case MCSymbolRefExpr::VK_GOTPAGEOFF:
+ case MCSymbolRefExpr::VK_PAGEOFF:
+ case MCSymbolRefExpr::VK_TLVPPAGEOFF:
+ // These are what we're expecting.
+ break;
+ }
+ }
+ }
+ }
+
+ SMLoc E = getLoc();
+ if (Parser.getTok().isNot(AsmToken::RBrac))
+ return Error(E, "']' expected");
+
+ Parser.Lex(); // Eat right bracket token.
+
+ // Create the memory operand.
+ Operands.push_back(
+ ARM64Operand::CreateMem(Reg, OffsetExpr, S, E, OffsetLoc, getContext()));
+
+ // Check for a '!', indicating pre-indexed addressing with writeback.
+ if (Parser.getTok().is(AsmToken::Exclaim)) {
+    // Writeback only makes sense if there was an immediate offset.
+ if (!OffsetExpr)
+ return Error(E, "missing offset for pre-indexed addressing");
+ // Pre-indexed with writeback must have a constant expression for the
+ // offset. FIXME: Theoretically, we'd like to allow fixups so long
+ // as they don't require a relocation.
+ if (!isa<MCConstantExpr>(OffsetExpr))
+ return Error(OffsetLoc, "constant immediate expression expected");
+
+ // Create the Token operand for the '!'.
+ Operands.push_back(ARM64Operand::CreateToken(
+ "!", false, Parser.getTok().getLoc(), getContext()));
+ Parser.Lex(); // Eat the '!' token.
+ }
+
+ return false;
+}
+
+bool ARM64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
+ bool HasELFModifier = false;
+ ARM64MCExpr::VariantKind RefKind;
+
+ if (Parser.getTok().is(AsmToken::Colon)) {
+    Parser.Lex(); // Eat ':'
+ HasELFModifier = true;
+
+ if (Parser.getTok().isNot(AsmToken::Identifier)) {
+ Error(Parser.getTok().getLoc(),
+ "expect relocation specifier in operand after ':'");
+ return true;
+ }
+
+ std::string LowerCase = Parser.getTok().getIdentifier().lower();
+ RefKind = StringSwitch<ARM64MCExpr::VariantKind>(LowerCase)
+ .Case("lo12", ARM64MCExpr::VK_LO12)
+ .Case("abs_g3", ARM64MCExpr::VK_ABS_G3)
+ .Case("abs_g2", ARM64MCExpr::VK_ABS_G2)
+ .Case("abs_g2_nc", ARM64MCExpr::VK_ABS_G2_NC)
+ .Case("abs_g1", ARM64MCExpr::VK_ABS_G1)
+ .Case("abs_g1_nc", ARM64MCExpr::VK_ABS_G1_NC)
+ .Case("abs_g0", ARM64MCExpr::VK_ABS_G0)
+ .Case("abs_g0_nc", ARM64MCExpr::VK_ABS_G0_NC)
+ .Case("dtprel_g2", ARM64MCExpr::VK_DTPREL_G2)
+ .Case("dtprel_g1", ARM64MCExpr::VK_DTPREL_G1)
+ .Case("dtprel_g1_nc", ARM64MCExpr::VK_DTPREL_G1_NC)
+ .Case("dtprel_g0", ARM64MCExpr::VK_DTPREL_G0)
+ .Case("dtprel_g0_nc", ARM64MCExpr::VK_DTPREL_G0_NC)
+ .Case("dtprel_lo12", ARM64MCExpr::VK_DTPREL_LO12)
+ .Case("dtprel_lo12_nc", ARM64MCExpr::VK_DTPREL_LO12_NC)
+ .Case("tprel_g2", ARM64MCExpr::VK_TPREL_G2)
+ .Case("tprel_g1", ARM64MCExpr::VK_TPREL_G1)
+ .Case("tprel_g1_nc", ARM64MCExpr::VK_TPREL_G1_NC)
+ .Case("tprel_g0", ARM64MCExpr::VK_TPREL_G0)
+ .Case("tprel_g0_nc", ARM64MCExpr::VK_TPREL_G0_NC)
+ .Case("tprel_lo12", ARM64MCExpr::VK_TPREL_LO12)
+ .Case("tprel_lo12_nc", ARM64MCExpr::VK_TPREL_LO12_NC)
+ .Case("tlsdesc_lo12", ARM64MCExpr::VK_TLSDESC_LO12)
+ .Case("got", ARM64MCExpr::VK_GOT_PAGE)
+ .Case("got_lo12", ARM64MCExpr::VK_GOT_LO12)
+ .Case("gottprel", ARM64MCExpr::VK_GOTTPREL_PAGE)
+ .Case("gottprel_lo12", ARM64MCExpr::VK_GOTTPREL_LO12_NC)
+ .Case("gottprel_g1", ARM64MCExpr::VK_GOTTPREL_G1)
+ .Case("gottprel_g0_nc", ARM64MCExpr::VK_GOTTPREL_G0_NC)
+ .Case("tlsdesc", ARM64MCExpr::VK_TLSDESC_PAGE)
+ .Default(ARM64MCExpr::VK_INVALID);
+
+ if (RefKind == ARM64MCExpr::VK_INVALID) {
+ Error(Parser.getTok().getLoc(),
+ "expect relocation specifier in operand after ':'");
+ return true;
+ }
+
+ Parser.Lex(); // Eat identifier
+
+ if (Parser.getTok().isNot(AsmToken::Colon)) {
+ Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier");
+ return true;
+ }
+ Parser.Lex(); // Eat ':'
+ }
+
+ if (getParser().parseExpression(ImmVal))
+ return true;
+
+ if (HasELFModifier)
+ ImmVal = ARM64MCExpr::Create(ImmVal, RefKind, getContext());
+
+ return false;
+}
+
+/// parseVectorList - Parse a vector list operand for AdvSIMD instructions.
+bool ARM64AsmParser::parseVectorList(OperandVector &Operands) {
+  assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Curly");
+ SMLoc S = getLoc();
+ Parser.Lex(); // Eat left bracket token.
+ StringRef Kind;
+ int64_t FirstReg = tryMatchVectorRegister(Kind);
+ if (FirstReg == -1)
+ return Error(getLoc(), "vector register expected");
+ int64_t PrevReg = FirstReg;
+ unsigned Count = 1;
+ while (Parser.getTok().isNot(AsmToken::RCurly)) {
+    if (Parser.getTok().is(AsmToken::EndOfStatement))
+      return Error(getLoc(), "'}' expected");
+
+ if (Parser.getTok().isNot(AsmToken::Comma))
+ return Error(getLoc(), "',' expected");
+ Parser.Lex(); // Eat the comma token.
+
+ SMLoc Loc = getLoc();
+ StringRef NextKind;
+ int64_t Reg = tryMatchVectorRegister(NextKind);
+ if (Reg == -1)
+ return Error(Loc, "vector register expected");
+    // Any kind suffix must match on all regs in the list.
+ if (Kind != NextKind)
+ return Error(Loc, "mismatched register size suffix");
+
+ // Registers must be incremental (with wraparound at 31)
+ if (getContext().getRegisterInfo()->getEncodingValue(Reg) !=
+ (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32)
+ return Error(Loc, "registers must be sequential");
+
+ PrevReg = Reg;
+ ++Count;
+ }
+ Parser.Lex(); // Eat the '}' token.
+
+ unsigned NumElements = 0;
+ char ElementKind = 0;
+ if (!Kind.empty())
+ parseValidVectorKind(Kind, NumElements, ElementKind);
+
+ Operands.push_back(ARM64Operand::CreateVectorList(
+ FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext()));
+
+ // If there is an index specifier following the list, parse that too.
+ if (Parser.getTok().is(AsmToken::LBrac)) {
+ SMLoc SIdx = getLoc();
+ Parser.Lex(); // Eat left bracket token.
+
+ const MCExpr *ImmVal;
+ if (getParser().parseExpression(ImmVal))
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ TokError("immediate value expected for vector index");
+ return false;
+ }
+
+ SMLoc E = getLoc();
+ if (Parser.getTok().isNot(AsmToken::RBrac)) {
+ Error(E, "']' expected");
+ return false;
+ }
+
+ Parser.Lex(); // Eat right bracket token.
+
+ Operands.push_back(ARM64Operand::CreateVectorIndex(MCE->getValue(), SIdx, E,
+ getContext()));
+ }
+ return false;
+}
+
+/// parseOperand - Parse an ARM64 instruction operand. For now this parses the
+/// operand regardless of the mnemonic.
+bool ARM64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
+ bool invertCondCode) {
+ // Check if the current operand has a custom associated parser, if so, try to
+ // custom parse the operand, or fallback to the general approach.
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+ if (ResTy == MatchOperand_Success)
+ return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match but an error occurred, in which case just return that
+  // the operand parsing failed.
+ if (ResTy == MatchOperand_ParseFail)
+ return true;
+
+ // Nothing custom, so do general case parsing.
+ SMLoc S, E;
+ switch (getLexer().getKind()) {
+ default: {
+ SMLoc S = getLoc();
+ const MCExpr *Expr;
+ if (parseSymbolicImmVal(Expr))
+ return Error(S, "invalid operand");
+
+ SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext()));
+ return false;
+ }
+ case AsmToken::LBrac:
+ return parseMemory(Operands);
+ case AsmToken::LCurly:
+ return parseVectorList(Operands);
+ case AsmToken::Identifier: {
+ // If we're expecting a Condition Code operand, then just parse that.
+ if (isCondCode)
+ return parseCondCode(Operands, invertCondCode);
+
+ // If it's a register name, parse it.
+ if (!parseRegister(Operands))
+ return false;
+
+ // This could be an optional "shift" operand.
+ if (!parseOptionalShift(Operands))
+ return false;
+
+ // Or maybe it could be an optional "extend" operand.
+ if (!parseOptionalExtend(Operands))
+ return false;
+
+ // This was not a register so parse other operands that start with an
+ // identifier (like labels) as expressions and create them as immediates.
+ const MCExpr *IdVal;
+ S = getLoc();
+ if (getParser().parseExpression(IdVal))
+ return true;
+
+ E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(ARM64Operand::CreateImm(IdVal, S, E, getContext()));
+ return false;
+ }
+ case AsmToken::Hash: {
+ // #42 -> immediate.
+ S = getLoc();
+ Parser.Lex();
+
+ // The only Real that should come through here is a literal #0.0 for
+ // the fcmp[e] r, #0.0 instructions. They expect raw token operands,
+ // so convert the value.
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.is(AsmToken::Real)) {
+ APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+ uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+ if (IntVal != 0 || (Mnemonic != "fcmp" && Mnemonic != "fcmpe"))
+ return TokError("unexpected floating point literal");
+ Parser.Lex(); // Eat the token.
+
+ Operands.push_back(
+ ARM64Operand::CreateToken("#0", false, S, getContext()));
+ Operands.push_back(
+ ARM64Operand::CreateToken(".0", false, S, getContext()));
+ return false;
+ }
+
+ const MCExpr *ImmVal;
+ if (parseSymbolicImmVal(ImmVal))
+ return true;
+
+ E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ Operands.push_back(ARM64Operand::CreateImm(ImmVal, S, E, getContext()));
+ return false;
+ }
+ }
+}
+
+/// ParseInstruction - Parse an ARM64 instruction mnemonic followed by its
+/// operands.
+bool ARM64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name, SMLoc NameLoc,
+ OperandVector &Operands) {
+ // Create the leading tokens for the mnemonic, split by '.' characters.
+ size_t Start = 0, Next = Name.find('.');
+ StringRef Head = Name.slice(Start, Next);
+
+ // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction.
+ if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi")
+ return parseSysAlias(Head, NameLoc, Operands);
+
+ Operands.push_back(
+ ARM64Operand::CreateToken(Head, false, NameLoc, getContext()));
+ Mnemonic = Head;
+
+ // Handle condition codes for a branch mnemonic
+ if (Head == "b" && Next != StringRef::npos) {
+ Start = Next;
+ Next = Name.find('.', Start + 1);
+ Head = Name.slice(Start + 1, Next);
+
+ SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+ (Head.data() - Name.data()));
+ unsigned CC = parseCondCodeString(Head);
+ if (CC == ~0U)
+ return Error(SuffixLoc, "invalid condition code");
+ const MCExpr *CCExpr = MCConstantExpr::Create(CC, getContext());
+ Operands.push_back(
+ ARM64Operand::CreateImm(CCExpr, NameLoc, NameLoc, getContext()));
+ }
+
+ // Add the remaining tokens in the mnemonic.
+ while (Next != StringRef::npos) {
+ Start = Next;
+ Next = Name.find('.', Start + 1);
+ Head = Name.slice(Start, Next);
+ SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+ (Head.data() - Name.data()) + 1);
+ Operands.push_back(
+ ARM64Operand::CreateToken(Head, true, SuffixLoc, getContext()));
+ }
+
+ // Conditional compare instructions have a Condition Code operand, which needs
+ // to be parsed and an immediate operand created.
+ bool condCodeFourthOperand =
+ (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" ||
+ Head == "fccmpe" || Head == "fcsel" || Head == "csel" ||
+ Head == "csinc" || Head == "csinv" || Head == "csneg");
+
+ // These instructions are aliases to some of the conditional select
+ // instructions. However, the condition code is inverted in the aliased
+ // instruction.
+ //
+ // FIXME: Is this the correct way to handle these? Or should the parser
+ // generate the aliased instructions directly?
+ bool condCodeSecondOperand = (Head == "cset" || Head == "csetm");
+ bool condCodeThirdOperand =
+ (Head == "cinc" || Head == "cinv" || Head == "cneg");
+
+ // Read the remaining operands.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ // Read the first operand.
+ if (parseOperand(Operands, false, false)) {
+ Parser.eatToEndOfStatement();
+ return true;
+ }
+
+ unsigned N = 2;
+ while (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma.
+
+ // Parse and remember the operand.
+ if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) ||
+ (N == 3 && condCodeThirdOperand) ||
+ (N == 2 && condCodeSecondOperand),
+ condCodeSecondOperand || condCodeThirdOperand)) {
+ Parser.eatToEndOfStatement();
+ return true;
+ }
+
+ ++N;
+ }
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = Parser.getTok().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+}
+
+/// isFPR32Register - Check if a register is in the FPR32 register class.
+/// (The parser does not have the target register info to check the register
+/// class directly.)
+static bool isFPR32Register(unsigned Reg) {
+ using namespace ARM64;
+ switch (Reg) {
+ default:
+ break;
+ case S0: case S1: case S2: case S3: case S4: case S5: case S6:
+ case S7: case S8: case S9: case S10: case S11: case S12: case S13:
+ case S14: case S15: case S16: case S17: case S18: case S19: case S20:
+ case S21: case S22: case S23: case S24: case S25: case S26: case S27:
+ case S28: case S29: case S30: case S31:
+ return true;
+ }
+ return false;
+}
+
+/// isGPR32Register - Check if a register is in the GPR32sp register class.
+/// (The parser does not have the target register info to check the register
+/// class directly.)
+static bool isGPR32Register(unsigned Reg) {
+ using namespace ARM64;
+ switch (Reg) {
+ default:
+ break;
+ case W0: case W1: case W2: case W3: case W4: case W5: case W6:
+ case W7: case W8: case W9: case W10: case W11: case W12: case W13:
+ case W14: case W15: case W16: case W17: case W18: case W19: case W20:
+ case W21: case W22: case W23: case W24: case W25: case W26: case W27:
+ case W28: case W29: case W30: case WSP:
+ return true;
+ }
+ return false;
+}
+
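+/// isGPR64Reg - Check if a register is a 64-bit general purpose register
+/// (X0-X28, FP, LR, SP, or XZR). (The parser does not have the target register
+/// info to check the register class directly.)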
+static bool isGPR64Reg(unsigned Reg) {
+ using namespace ARM64;
+ switch (Reg) {
+ case X0: case X1: case X2: case X3: case X4: case X5: case X6:
+ case X7: case X8: case X9: case X10: case X11: case X12: case X13:
+ case X14: case X15: case X16: case X17: case X18: case X19: case X20:
+ case X21: case X22: case X23: case X24: case X25: case X26: case X27:
+ case X28: case FP: case LR: case SP: case XZR:
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+// FIXME: This entire function is a giant hack to provide us with decent
+// operand range validation/diagnostics until TableGen/MC can be extended
+// to support autogeneration of this kind of validation.
+bool ARM64AsmParser::validateInstruction(MCInst &Inst,
+ SmallVectorImpl<SMLoc> &Loc) {
+ const MCRegisterInfo *RI = getContext().getRegisterInfo();
+  // Check for indexed addressing modes where the base register is the
+  // same as a destination/source register, or a pair load where
+  // Rt == Rt2. All of those are undefined behaviour.
+ switch (Inst.getOpcode()) {
+ case ARM64::LDPSWpre:
+ case ARM64::LDPWpost:
+ case ARM64::LDPWpre:
+ case ARM64::LDPXpost:
+ case ARM64::LDPXpre: {
+ unsigned Rt = Inst.getOperand(0).getReg();
+ unsigned Rt2 = Inst.getOperand(1).getReg();
+ unsigned Rn = Inst.getOperand(2).getReg();
+ if (RI->isSubRegisterEq(Rn, Rt))
+ return Error(Loc[0], "unpredictable LDP instruction, writeback base "
+ "is also a destination");
+ if (RI->isSubRegisterEq(Rn, Rt2))
+ return Error(Loc[1], "unpredictable LDP instruction, writeback base "
+ "is also a destination");
+ // FALLTHROUGH
+ }
+ case ARM64::LDPDpost:
+ case ARM64::LDPDpre:
+ case ARM64::LDPQpost:
+ case ARM64::LDPQpre:
+ case ARM64::LDPSpost:
+ case ARM64::LDPSpre:
+ case ARM64::LDPSWpost:
+ case ARM64::LDPDi:
+ case ARM64::LDPQi:
+ case ARM64::LDPSi:
+ case ARM64::LDPSWi:
+ case ARM64::LDPWi:
+ case ARM64::LDPXi: {
+ unsigned Rt = Inst.getOperand(0).getReg();
+ unsigned Rt2 = Inst.getOperand(1).getReg();
+ if (Rt == Rt2)
+ return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
+ break;
+ }
+ case ARM64::STPDpost:
+ case ARM64::STPDpre:
+ case ARM64::STPQpost:
+ case ARM64::STPQpre:
+ case ARM64::STPSpost:
+ case ARM64::STPSpre:
+ case ARM64::STPWpost:
+ case ARM64::STPWpre:
+ case ARM64::STPXpost:
+ case ARM64::STPXpre: {
+ unsigned Rt = Inst.getOperand(0).getReg();
+ unsigned Rt2 = Inst.getOperand(1).getReg();
+ unsigned Rn = Inst.getOperand(2).getReg();
+ if (RI->isSubRegisterEq(Rn, Rt))
+ return Error(Loc[0], "unpredictable STP instruction, writeback base "
+ "is also a source");
+ if (RI->isSubRegisterEq(Rn, Rt2))
+ return Error(Loc[1], "unpredictable STP instruction, writeback base "
+ "is also a source");
+ break;
+ }
+ case ARM64::LDRBBpre:
+ case ARM64::LDRBpre:
+ case ARM64::LDRHHpre:
+ case ARM64::LDRHpre:
+ case ARM64::LDRSBWpre:
+ case ARM64::LDRSBXpre:
+ case ARM64::LDRSHWpre:
+ case ARM64::LDRSHXpre:
+ case ARM64::LDRSWpre:
+ case ARM64::LDRWpre:
+ case ARM64::LDRXpre:
+ case ARM64::LDRBBpost:
+ case ARM64::LDRBpost:
+ case ARM64::LDRHHpost:
+ case ARM64::LDRHpost:
+ case ARM64::LDRSBWpost:
+ case ARM64::LDRSBXpost:
+ case ARM64::LDRSHWpost:
+ case ARM64::LDRSHXpost:
+ case ARM64::LDRSWpost:
+ case ARM64::LDRWpost:
+ case ARM64::LDRXpost: {
+ unsigned Rt = Inst.getOperand(0).getReg();
+ unsigned Rn = Inst.getOperand(1).getReg();
+ if (RI->isSubRegisterEq(Rn, Rt))
+ return Error(Loc[0], "unpredictable LDR instruction, writeback base "
+ "is also a source");
+ break;
+ }
+ case ARM64::STRBBpost:
+ case ARM64::STRBpost:
+ case ARM64::STRHHpost:
+ case ARM64::STRHpost:
+ case ARM64::STRWpost:
+ case ARM64::STRXpost:
+ case ARM64::STRBBpre:
+ case ARM64::STRBpre:
+ case ARM64::STRHHpre:
+ case ARM64::STRHpre:
+ case ARM64::STRWpre:
+ case ARM64::STRXpre: {
+ unsigned Rt = Inst.getOperand(0).getReg();
+ unsigned Rn = Inst.getOperand(1).getReg();
+ if (RI->isSubRegisterEq(Rn, Rt))
+ return Error(Loc[0], "unpredictable STR instruction, writeback base "
+ "is also a source");
+ break;
+ }
+ }
+
+ // Now check immediate ranges. Separate from the above as there is overlap
+ // in the instructions being checked and this keeps the nested conditionals
+ // to a minimum.
+ switch (Inst.getOpcode()) {
+ case ARM64::ANDWrs:
+ case ARM64::ANDSWrs:
+ case ARM64::EORWrs:
+ case ARM64::ORRWrs: {
+ if (!Inst.getOperand(3).isImm())
+ return Error(Loc[3], "immediate value expected");
+ int64_t shifter = Inst.getOperand(3).getImm();
+ ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(shifter);
+ if (ST == ARM64_AM::LSL && shifter > 31)
+ return Error(Loc[3], "shift value out of range");
+ return false;
+ }
+ case ARM64::ADDSWri:
+ case ARM64::ADDSXri:
+ case ARM64::ADDWri:
+ case ARM64::ADDXri:
+ case ARM64::SUBSWri:
+ case ARM64::SUBSXri:
+ case ARM64::SUBWri:
+ case ARM64::SUBXri: {
+ if (!Inst.getOperand(3).isImm())
+ return Error(Loc[3], "immediate value expected");
+ int64_t shifter = Inst.getOperand(3).getImm();
+ if (shifter != 0 && shifter != 12)
+ return Error(Loc[3], "shift value out of range");
+ // The imm12 operand can be an expression. Validate that it's legit.
+ // FIXME: We really, really want to allow arbitrary expressions here
+ // and resolve the value and validate the result at fixup time, but
+ // that's hard as we have long since lost any source information we
+ // need to generate good diagnostics by that point.
+ if (Inst.getOpcode() == ARM64::ADDXri && Inst.getOperand(2).isExpr()) {
+ const MCExpr *Expr = Inst.getOperand(2).getExpr();
+ ARM64MCExpr::VariantKind ELFRefKind;
+ MCSymbolRefExpr::VariantKind DarwinRefKind;
+ const MCConstantExpr *Addend;
+ if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
+ return Error(Loc[2], "invalid immediate expression");
+ }
+
+ if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
+ DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF ||
+ ELFRefKind == ARM64MCExpr::VK_LO12 ||
+ ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 ||
+ ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC ||
+ ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 ||
+ ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC ||
+ ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12) {
+ // Note that we don't range-check the addend. It's adjusted
+ // modulo page size when converted, so there is no "out of range"
+ // condition when using @pageoff. Any validity checking for the value
+ // was done in the is*() predicate function.
+ return false;
+ } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF) {
+ // @gotpageoff can only be used directly, not with an addend.
+ return Addend != 0;
+ }
+
+ // Otherwise, we're not sure, so don't allow it for now.
+ return Error(Loc[2], "invalid immediate expression");
+ }
+
+ // If it's anything but an immediate, it's not legit.
+ if (!Inst.getOperand(2).isImm())
+ return Error(Loc[2], "invalid immediate expression");
+ int64_t imm = Inst.getOperand(2).getImm();
+ if (imm > 4095 || imm < 0)
+ return Error(Loc[2], "immediate value out of range");
+ return false;
+ }
+ case ARM64::LDRBpre:
+ case ARM64::LDRHpre:
+ case ARM64::LDRSBWpre:
+ case ARM64::LDRSBXpre:
+ case ARM64::LDRSHWpre:
+ case ARM64::LDRSHXpre:
+ case ARM64::LDRWpre:
+ case ARM64::LDRXpre:
+ case ARM64::LDRSpre:
+ case ARM64::LDRDpre:
+ case ARM64::LDRQpre:
+ case ARM64::STRBpre:
+ case ARM64::STRHpre:
+ case ARM64::STRWpre:
+ case ARM64::STRXpre:
+ case ARM64::STRSpre:
+ case ARM64::STRDpre:
+ case ARM64::STRQpre:
+ case ARM64::LDRBpost:
+ case ARM64::LDRHpost:
+ case ARM64::LDRSBWpost:
+ case ARM64::LDRSBXpost:
+ case ARM64::LDRSHWpost:
+ case ARM64::LDRSHXpost:
+ case ARM64::LDRWpost:
+ case ARM64::LDRXpost:
+ case ARM64::LDRSpost:
+ case ARM64::LDRDpost:
+ case ARM64::LDRQpost:
+ case ARM64::STRBpost:
+ case ARM64::STRHpost:
+ case ARM64::STRWpost:
+ case ARM64::STRXpost:
+ case ARM64::STRSpost:
+ case ARM64::STRDpost:
+ case ARM64::STRQpost:
+ case ARM64::LDTRXi:
+ case ARM64::LDTRWi:
+ case ARM64::LDTRHi:
+ case ARM64::LDTRBi:
+ case ARM64::LDTRSHWi:
+ case ARM64::LDTRSHXi:
+ case ARM64::LDTRSBWi:
+ case ARM64::LDTRSBXi:
+ case ARM64::LDTRSWi:
+ case ARM64::STTRWi:
+ case ARM64::STTRXi:
+ case ARM64::STTRHi:
+ case ARM64::STTRBi:
+ case ARM64::LDURWi:
+ case ARM64::LDURXi:
+ case ARM64::LDURSi:
+ case ARM64::LDURDi:
+ case ARM64::LDURQi:
+ case ARM64::LDURHi:
+ case ARM64::LDURBi:
+ case ARM64::LDURSHWi:
+ case ARM64::LDURSHXi:
+ case ARM64::LDURSBWi:
+ case ARM64::LDURSBXi:
+ case ARM64::LDURSWi:
+ case ARM64::PRFUMi:
+ case ARM64::STURWi:
+ case ARM64::STURXi:
+ case ARM64::STURSi:
+ case ARM64::STURDi:
+ case ARM64::STURQi:
+ case ARM64::STURHi:
+ case ARM64::STURBi: {
+ // FIXME: Should accept expressions and error in fixup evaluation
+ // if out of range.
+ if (!Inst.getOperand(2).isImm())
+ return Error(Loc[1], "immediate value expected");
+ int64_t offset = Inst.getOperand(2).getImm();
+ if (offset > 255 || offset < -256)
+ return Error(Loc[1], "offset value out of range");
+ return false;
+ }
+ case ARM64::LDRSro:
+ case ARM64::LDRWro:
+ case ARM64::LDRSWro:
+ case ARM64::STRWro:
+ case ARM64::STRSro: {
+ // FIXME: Should accept expressions and error in fixup evaluation
+ // if out of range.
+ if (!Inst.getOperand(3).isImm())
+ return Error(Loc[1], "immediate value expected");
+ int64_t shift = Inst.getOperand(3).getImm();
+ ARM64_AM::ExtendType type = ARM64_AM::getMemExtendType(shift);
+ if (type != ARM64_AM::UXTW && type != ARM64_AM::UXTX &&
+ type != ARM64_AM::SXTW && type != ARM64_AM::SXTX)
+ return Error(Loc[1], "shift type invalid");
+ return false;
+ }
+ case ARM64::LDRDro:
+ case ARM64::LDRQro:
+ case ARM64::LDRXro:
+ case ARM64::PRFMro:
+ case ARM64::STRXro:
+ case ARM64::STRDro:
+ case ARM64::STRQro: {
+ // FIXME: Should accept expressions and error in fixup evaluation
+ // if out of range.
+ if (!Inst.getOperand(3).isImm())
+ return Error(Loc[1], "immediate value expected");
+ int64_t shift = Inst.getOperand(3).getImm();
+ ARM64_AM::ExtendType type = ARM64_AM::getMemExtendType(shift);
+ if (type != ARM64_AM::UXTW && type != ARM64_AM::UXTX &&
+ type != ARM64_AM::SXTW && type != ARM64_AM::SXTX)
+ return Error(Loc[1], "shift type invalid");
+ return false;
+ }
+ case ARM64::LDRHro:
+ case ARM64::LDRHHro:
+ case ARM64::LDRSHWro:
+ case ARM64::LDRSHXro:
+ case ARM64::STRHro:
+ case ARM64::STRHHro: {
+ // FIXME: Should accept expressions and error in fixup evaluation
+ // if out of range.
+ if (!Inst.getOperand(3).isImm())
+ return Error(Loc[1], "immediate value expected");
+ int64_t shift = Inst.getOperand(3).getImm();
+ ARM64_AM::ExtendType type = ARM64_AM::getMemExtendType(shift);
+ if (type != ARM64_AM::UXTW && type != ARM64_AM::UXTX &&
+ type != ARM64_AM::SXTW && type != ARM64_AM::SXTX)
+ return Error(Loc[1], "shift type invalid");
+ return false;
+ }
+ case ARM64::LDRBro:
+ case ARM64::LDRBBro:
+ case ARM64::LDRSBWro:
+ case ARM64::LDRSBXro:
+ case ARM64::STRBro:
+ case ARM64::STRBBro: {
+ // FIXME: Should accept expressions and error in fixup evaluation
+ // if out of range.
+ if (!Inst.getOperand(3).isImm())
+ return Error(Loc[1], "immediate value expected");
+ int64_t shift = Inst.getOperand(3).getImm();
+ ARM64_AM::ExtendType type = ARM64_AM::getMemExtendType(shift);
+ if (type != ARM64_AM::UXTW && type != ARM64_AM::UXTX &&
+ type != ARM64_AM::SXTW && type != ARM64_AM::SXTX)
+ return Error(Loc[1], "shift type invalid");
+ return false;
+ }
+ case ARM64::LDPWi:
+ case ARM64::LDPXi:
+ case ARM64::LDPSi:
+ case ARM64::LDPDi:
+ case ARM64::LDPQi:
+ case ARM64::LDPSWi:
+ case ARM64::STPWi:
+ case ARM64::STPXi:
+ case ARM64::STPSi:
+ case ARM64::STPDi:
+ case ARM64::STPQi:
+ case ARM64::LDPWpre:
+ case ARM64::LDPXpre:
+ case ARM64::LDPSpre:
+ case ARM64::LDPDpre:
+ case ARM64::LDPQpre:
+ case ARM64::LDPSWpre:
+ case ARM64::STPWpre:
+ case ARM64::STPXpre:
+ case ARM64::STPSpre:
+ case ARM64::STPDpre:
+ case ARM64::STPQpre:
+ case ARM64::LDPWpost:
+ case ARM64::LDPXpost:
+ case ARM64::LDPSpost:
+ case ARM64::LDPDpost:
+ case ARM64::LDPQpost:
+ case ARM64::LDPSWpost:
+ case ARM64::STPWpost:
+ case ARM64::STPXpost:
+ case ARM64::STPSpost:
+ case ARM64::STPDpost:
+ case ARM64::STPQpost:
+ case ARM64::LDNPWi:
+ case ARM64::LDNPXi:
+ case ARM64::LDNPSi:
+ case ARM64::LDNPDi:
+ case ARM64::LDNPQi:
+ case ARM64::STNPWi:
+ case ARM64::STNPXi:
+ case ARM64::STNPSi:
+ case ARM64::STNPDi:
+ case ARM64::STNPQi: {
+ // FIXME: Should accept expressions and error in fixup evaluation
+ // if out of range.
+ if (!Inst.getOperand(3).isImm())
+ return Error(Loc[2], "immediate value expected");
+ int64_t offset = Inst.getOperand(3).getImm();
+ if (offset > 63 || offset < -64)
+ return Error(Loc[2], "offset value out of range");
+ return false;
+ }
+ default:
+ return false;
+ }
+}
+
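+/// rewriteMOV - Rewrite a MOV-immediate alias in place: replace the mnemonic
+/// with \p mnemonic, shift the immediate right by \p shift, and append an
+/// explicit 'lsl #shift' shifter operand.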
+static void rewriteMOV(ARM64AsmParser::OperandVector &Operands,
+ StringRef mnemonic, uint64_t imm, unsigned shift,
+ MCContext &Context) {
+ ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[0]);
+ ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]);
+ Operands[0] =
+ ARM64Operand::CreateToken(mnemonic, false, Op->getStartLoc(), Context);
+
+ const MCExpr *NewImm = MCConstantExpr::Create(imm >> shift, Context);
+ Operands[2] = ARM64Operand::CreateImm(NewImm, Op2->getStartLoc(),
+ Op2->getEndLoc(), Context);
+
+ Operands.push_back(ARM64Operand::CreateShifter(
+ ARM64_AM::LSL, shift, Op2->getStartLoc(), Op2->getEndLoc(), Context));
+ delete Op2;
+ delete Op;
+}
+
+bool ARM64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
+ switch (ErrCode) {
+ case Match_MissingFeature:
+ return Error(Loc,
+ "instruction requires a CPU feature not currently enabled");
+ case Match_InvalidOperand:
+ return Error(Loc, "invalid operand for instruction");
+ case Match_InvalidSuffix:
+ return Error(Loc, "invalid type suffix for instruction");
+ case Match_InvalidMemoryIndexedSImm9:
+ return Error(Loc, "index must be an integer in range [-256,255].");
+ case Match_InvalidMemoryIndexed32SImm7:
+ return Error(Loc, "index must be a multiple of 4 in range [-256,252].");
+ case Match_InvalidMemoryIndexed64SImm7:
+ return Error(Loc, "index must be a multiple of 8 in range [-512,504].");
+ case Match_InvalidMemoryIndexed128SImm7:
+ return Error(Loc, "index must be a multiple of 16 in range [-1024,1008].");
+ case Match_InvalidMemoryIndexed8:
+ return Error(Loc, "index must be an integer in range [0,4095].");
+ case Match_InvalidMemoryIndexed16:
+ return Error(Loc, "index must be a multiple of 2 in range [0,8190].");
+ case Match_InvalidMemoryIndexed32:
+ return Error(Loc, "index must be a multiple of 4 in range [0,16380].");
+ case Match_InvalidMemoryIndexed64:
+ return Error(Loc, "index must be a multiple of 8 in range [0,32760].");
+ case Match_InvalidMemoryIndexed128:
+ return Error(Loc, "index must be a multiple of 16 in range [0,65520].");
+ case Match_InvalidImm1_8:
+ return Error(Loc, "immediate must be an integer in range [1,8].");
+ case Match_InvalidImm1_16:
+ return Error(Loc, "immediate must be an integer in range [1,16].");
+ case Match_InvalidImm1_32:
+ return Error(Loc, "immediate must be an integer in range [1,32].");
+ case Match_InvalidImm1_64:
+ return Error(Loc, "immediate must be an integer in range [1,64].");
+ case Match_MnemonicFail:
+ return Error(Loc, "unrecognized instruction mnemonic");
+ default:
+ assert(0 && "unexpected error code!");
+ return Error(Loc, "invalid instruction format");
+ }
+}
+
+bool ARM64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
+  assert(!Operands.empty() && "Unexpected empty operand list!");
+ ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[0]);
+ assert(Op->isToken() && "Leading operand should always be a mnemonic!");
+
+ StringRef Tok = Op->getToken();
+ // Translate CMN/CMP pseudos to ADDS/SUBS with zero register destination.
+ // This needs to be done before the special handling of ADD/SUB immediates.
+ if (Tok == "cmp" || Tok == "cmn") {
+ // Replace the opcode with either ADDS or SUBS.
+ const char *Repl = StringSwitch<const char *>(Tok)
+ .Case("cmp", "subs")
+ .Case("cmn", "adds")
+ .Default(0);
+ assert(Repl && "Unknown compare instruction");
+ delete Operands[0];
+ Operands[0] = ARM64Operand::CreateToken(Repl, false, IDLoc, getContext());
+
+ // Insert WZR or XZR as destination operand.
+ ARM64Operand *RegOp = static_cast<ARM64Operand *>(Operands[1]);
+ unsigned ZeroReg;
+ if (RegOp->isReg() &&
+ (isGPR32Register(RegOp->getReg()) || RegOp->getReg() == ARM64::WZR))
+ ZeroReg = ARM64::WZR;
+ else
+ ZeroReg = ARM64::XZR;
+ Operands.insert(
+ Operands.begin() + 1,
+ ARM64Operand::CreateReg(ZeroReg, false, IDLoc, IDLoc, getContext()));
+ // Update since we modified it above.
+ ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[0]);
+ Tok = Op->getToken();
+ }
+
+ unsigned NumOperands = Operands.size();
+
+ if (Tok == "mov" && NumOperands == 3) {
+    // The MOV mnemonic is aliased to movn/movz, depending on the value of
+ // the immediate being instantiated.
+ // FIXME: Catching this here is a total hack, and we should use tblgen
+ // support to implement this instead as soon as it is available.
+
+ ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]);
+ if (Op2->isImm()) {
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op2->getImm())) {
+ uint64_t Val = CE->getValue();
+ uint64_t NVal = ~Val;
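+        // Prefer MOVZ when a single 16-bit chunk of the value covers it, and
+        // fall back to MOVN when a single 16-bit chunk of ~Val covers it.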
+
+        // If this is a 32-bit register and none of the upper 32 bits of the
+        // value are set, clear the complemented upper 32 bits so the logic
+        // below works for 32-bit registers too.
+ ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]);
+ if (Op1->isReg() && isGPR32Register(Op1->getReg()) &&
+ (Val & 0xFFFFFFFFULL) == Val)
+ NVal &= 0x00000000FFFFFFFFULL;
+
+        // MOVZ Rd, imm << 0
+ if ((Val & 0xFFFF) == Val)
+ rewriteMOV(Operands, "movz", Val, 0, getContext());
+
+        // MOVZ Rd, imm << 16
+ else if ((Val & 0xFFFF0000ULL) == Val)
+ rewriteMOV(Operands, "movz", Val, 16, getContext());
+
+        // MOVZ Rd, imm << 32
+ else if ((Val & 0xFFFF00000000ULL) == Val)
+ rewriteMOV(Operands, "movz", Val, 32, getContext());
+
+        // MOVZ Rd, imm << 48
+ else if ((Val & 0xFFFF000000000000ULL) == Val)
+ rewriteMOV(Operands, "movz", Val, 48, getContext());
+
+ // MOVN Rd, (~imm << 0)
+ else if ((NVal & 0xFFFFULL) == NVal)
+ rewriteMOV(Operands, "movn", NVal, 0, getContext());
+
+ // MOVN Rd, ~(imm << 16)
+ else if ((NVal & 0xFFFF0000ULL) == NVal)
+ rewriteMOV(Operands, "movn", NVal, 16, getContext());
+
+ // MOVN Rd, ~(imm << 32)
+ else if ((NVal & 0xFFFF00000000ULL) == NVal)
+ rewriteMOV(Operands, "movn", NVal, 32, getContext());
+
+ // MOVN Rd, ~(imm << 48)
+ else if ((NVal & 0xFFFF000000000000ULL) == NVal)
+ rewriteMOV(Operands, "movn", NVal, 48, getContext());
+ }
+ }
+ } else if (NumOperands == 4) {
+ if (Tok == "add" || Tok == "adds" || Tok == "sub" || Tok == "subs") {
+ // Handle the uimm24 immediate form, where the shift is not specified.
+ ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]);
+ if (Op3->isImm()) {
+ if (const MCConstantExpr *CE =
+ dyn_cast<MCConstantExpr>(Op3->getImm())) {
+ uint64_t Val = CE->getValue();
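+            // If the value fits in 12 bits, use it with an implicit 'lsl #0'.
+            // If only bits [23:12] are set, shift it down and use 'lsl #12'.
+            // Anything else is rejected below.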
+ if (Val >= (1 << 24)) {
+ Error(IDLoc, "immediate value is too large");
+ return true;
+ }
+ if (Val < (1 << 12)) {
+ Operands.push_back(ARM64Operand::CreateShifter(
+ ARM64_AM::LSL, 0, IDLoc, IDLoc, getContext()));
+ } else if ((Val & 0xfff) == 0) {
+ delete Operands[3];
+ CE = MCConstantExpr::Create(Val >> 12, getContext());
+ Operands[3] =
+ ARM64Operand::CreateImm(CE, IDLoc, IDLoc, getContext());
+ Operands.push_back(ARM64Operand::CreateShifter(
+ ARM64_AM::LSL, 12, IDLoc, IDLoc, getContext()));
+ } else {
+ Error(IDLoc, "immediate value is too large");
+ return true;
+ }
+ } else {
+ Operands.push_back(ARM64Operand::CreateShifter(
+ ARM64_AM::LSL, 0, IDLoc, IDLoc, getContext()));
+ }
+ }
+
+      // FIXME: Horrible hack to handle the LSL -> UBFM alias.
+ } else if (NumOperands == 4 && Tok == "lsl") {
+ ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]);
+ ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]);
+ if (Op2->isReg() && Op3->isImm()) {
+ const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+ if (Op3CE) {
+ uint64_t Op3Val = Op3CE->getValue();
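+          // 'lsl Rd, Rn, #shift' is equivalent to
+          // 'ubfm Rd, Rn, #((regsize - shift) % regsize), #(regsize - 1 - shift)'.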
+ uint64_t NewOp3Val = 0;
+ uint64_t NewOp4Val = 0;
+ if (isGPR32Register(Op2->getReg()) || Op2->getReg() == ARM64::WZR) {
+ NewOp3Val = (32 - Op3Val) & 0x1f;
+ NewOp4Val = 31 - Op3Val;
+ } else {
+ NewOp3Val = (64 - Op3Val) & 0x3f;
+ NewOp4Val = 63 - Op3Val;
+ }
+
+ const MCExpr *NewOp3 =
+ MCConstantExpr::Create(NewOp3Val, getContext());
+ const MCExpr *NewOp4 =
+ MCConstantExpr::Create(NewOp4Val, getContext());
+
+ Operands[0] = ARM64Operand::CreateToken(
+ "ubfm", false, Op->getStartLoc(), getContext());
+ Operands[3] = ARM64Operand::CreateImm(NewOp3, Op3->getStartLoc(),
+ Op3->getEndLoc(), getContext());
+ Operands.push_back(ARM64Operand::CreateImm(
+ NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext()));
+ delete Op3;
+ delete Op;
+ }
+ }
+
+ // FIXME: Horrible hack to handle the optional LSL shift for vector
+ // instructions.
+ } else if (NumOperands == 4 && (Tok == "bic" || Tok == "orr")) {
+ ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]);
+ ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]);
+ ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]);
+ if ((Op1->isToken() && Op2->isVectorReg() && Op3->isImm()) ||
+ (Op1->isVectorReg() && Op2->isToken() && Op3->isImm()))
+ Operands.push_back(ARM64Operand::CreateShifter(ARM64_AM::LSL, 0, IDLoc,
+ IDLoc, getContext()));
+ } else if (NumOperands == 4 && (Tok == "movi" || Tok == "mvni")) {
+ ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]);
+ ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]);
+ ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]);
+ if ((Op1->isToken() && Op2->isVectorReg() && Op3->isImm()) ||
+ (Op1->isVectorReg() && Op2->isToken() && Op3->isImm())) {
+ StringRef Suffix = Op1->isToken() ? Op1->getToken() : Op2->getToken();
+ // Canonicalize on lower-case for ease of comparison.
+ std::string CanonicalSuffix = Suffix.lower();
+ if (Tok != "movi" ||
+ (CanonicalSuffix != ".1d" && CanonicalSuffix != ".2d" &&
+ CanonicalSuffix != ".8b" && CanonicalSuffix != ".16b"))
+ Operands.push_back(ARM64Operand::CreateShifter(
+ ARM64_AM::LSL, 0, IDLoc, IDLoc, getContext()));
+ }
+ }
+ } else if (NumOperands == 5) {
+ // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and
+ // UBFIZ -> UBFM aliases.
+ if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") {
+ ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]);
+ ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]);
+ ARM64Operand *Op4 = static_cast<ARM64Operand *>(Operands[4]);
+
+ if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
+ const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+ const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
+
+ if (Op3CE && Op4CE) {
+ uint64_t Op3Val = Op3CE->getValue();
+ uint64_t Op4Val = Op4CE->getValue();
+
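+          // BFI/SBFIZ/UBFIZ Rd, Rn, #lsb, #width corresponds to the *BFM form
+          // with immr = (regsize - lsb) % regsize and imms = width - 1.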
+ uint64_t NewOp3Val = 0;
+ if (isGPR32Register(Op1->getReg()))
+ NewOp3Val = (32 - Op3Val) & 0x1f;
+ else
+ NewOp3Val = (64 - Op3Val) & 0x3f;
+
+ uint64_t NewOp4Val = Op4Val - 1;
+
+ const MCExpr *NewOp3 =
+ MCConstantExpr::Create(NewOp3Val, getContext());
+ const MCExpr *NewOp4 =
+ MCConstantExpr::Create(NewOp4Val, getContext());
+ Operands[3] = ARM64Operand::CreateImm(NewOp3, Op3->getStartLoc(),
+ Op3->getEndLoc(), getContext());
+ Operands[4] = ARM64Operand::CreateImm(NewOp4, Op4->getStartLoc(),
+ Op4->getEndLoc(), getContext());
+ if (Tok == "bfi")
+ Operands[0] = ARM64Operand::CreateToken(
+ "bfm", false, Op->getStartLoc(), getContext());
+ else if (Tok == "sbfiz")
+ Operands[0] = ARM64Operand::CreateToken(
+ "sbfm", false, Op->getStartLoc(), getContext());
+ else if (Tok == "ubfiz")
+ Operands[0] = ARM64Operand::CreateToken(
+ "ubfm", false, Op->getStartLoc(), getContext());
+ else
+ llvm_unreachable("No valid mnemonic for alias?");
+
+ delete Op;
+ delete Op3;
+ delete Op4;
+ }
+ }
+
+ // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and
+ // UBFX -> UBFM aliases.
+ } else if (NumOperands == 5 &&
+ (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) {
+ ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]);
+ ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]);
+ ARM64Operand *Op4 = static_cast<ARM64Operand *>(Operands[4]);
+
+ if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
+ const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+ const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
+
+ if (Op3CE && Op4CE) {
+ uint64_t Op3Val = Op3CE->getValue();
+ uint64_t Op4Val = Op4CE->getValue();
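+          // BFXIL/SBFX/UBFX Rd, Rn, #lsb, #width corresponds to the *BFM form
+          // with immr = lsb and imms = lsb + width - 1.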
+ uint64_t NewOp4Val = Op3Val + Op4Val - 1;
+
+ if (NewOp4Val >= Op3Val) {
+ const MCExpr *NewOp4 =
+ MCConstantExpr::Create(NewOp4Val, getContext());
+ Operands[4] = ARM64Operand::CreateImm(
+ NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext());
+ if (Tok == "bfxil")
+ Operands[0] = ARM64Operand::CreateToken(
+ "bfm", false, Op->getStartLoc(), getContext());
+ else if (Tok == "sbfx")
+ Operands[0] = ARM64Operand::CreateToken(
+ "sbfm", false, Op->getStartLoc(), getContext());
+ else if (Tok == "ubfx")
+ Operands[0] = ARM64Operand::CreateToken(
+ "ubfm", false, Op->getStartLoc(), getContext());
+ else
+ llvm_unreachable("No valid mnemonic for alias?");
+
+ delete Op;
+ delete Op4;
+ }
+ }
+ }
+ }
+ }
+ // FIXME: Horrible hack for tbz and tbnz with Wn register operand.
+ // InstAlias can't quite handle this since the reg classes aren't
+ // subclasses.
+ if (NumOperands == 4 && (Tok == "tbz" || Tok == "tbnz")) {
+ ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[2]);
+ if (Op->isImm()) {
+ if (const MCConstantExpr *OpCE = dyn_cast<MCConstantExpr>(Op->getImm())) {
+ if (OpCE->getValue() < 32) {
+ // The source register can be Wn here, but the matcher expects a
+ // GPR64. Twiddle it here if necessary.
+ ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[1]);
+ if (Op->isReg()) {
+ unsigned Reg = getXRegFromWReg(Op->getReg());
+ Operands[1] = ARM64Operand::CreateReg(
+ Reg, false, Op->getStartLoc(), Op->getEndLoc(), getContext());
+ delete Op;
+ }
+ }
+ }
+ }
+ }
+ // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands.
+ // InstAlias can't quite handle this since the reg classes aren't
+ // subclasses.
+ if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) {
+ // The source register can be Wn here, but the matcher expects a
+ // GPR64. Twiddle it here if necessary.
+ ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[2]);
+ if (Op->isReg()) {
+ unsigned Reg = getXRegFromWReg(Op->getReg());
+ Operands[2] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(),
+ Op->getEndLoc(), getContext());
+ delete Op;
+ }
+ }
+ // FIXME: Likewise for [su]xt[bh] with a Xd dst operand
+ else if (NumOperands == 3 &&
+ (Tok == "sxtb" || Tok == "uxtb" || Tok == "sxth" || Tok == "uxth")) {
+ ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[1]);
+ if (Op->isReg() && isGPR64Reg(Op->getReg())) {
+ // The source register can be Wn here, but the matcher expects a
+ // GPR64. Twiddle it here if necessary.
+ ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[2]);
+ if (Op->isReg()) {
+ unsigned Reg = getXRegFromWReg(Op->getReg());
+ Operands[2] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(),
+ Op->getEndLoc(), getContext());
+ delete Op;
+ }
+ }
+ }
+
+ // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR.
+ if (NumOperands == 3 && Tok == "fmov") {
+ ARM64Operand *RegOp = static_cast<ARM64Operand *>(Operands[1]);
+ ARM64Operand *ImmOp = static_cast<ARM64Operand *>(Operands[2]);
+ if (RegOp->isReg() && ImmOp->isFPImm() &&
+ ImmOp->getFPImm() == (unsigned)-1) {
+ unsigned zreg =
+ isFPR32Register(RegOp->getReg()) ? ARM64::WZR : ARM64::XZR;
+ Operands[2] = ARM64Operand::CreateReg(zreg, false, Op->getStartLoc(),
+ Op->getEndLoc(), getContext());
+ delete ImmOp;
+ }
+ }
+
+ // FIXME: Horrible hack to handle the literal .d[1] vector index on
+ // FMOV instructions. The index isn't an actual instruction operand
+ // but rather syntactic sugar. It really should be part of the mnemonic,
+ // not the operand, but whatever.
+ if ((NumOperands == 5) && Tok == "fmov") {
+    // If the last operand is a vector index of '1', then replace it with
+ // a '[' '1' ']' token sequence, which is what the matcher
+ // (annoyingly) expects for a literal vector index operand.
+ ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[NumOperands - 1]);
+ if (Op->isVectorIndexD() && Op->getVectorIndex() == 1) {
+ SMLoc Loc = Op->getStartLoc();
+ Operands.pop_back();
+ Operands.push_back(
+ ARM64Operand::CreateToken("[", false, Loc, getContext()));
+ Operands.push_back(
+ ARM64Operand::CreateToken("1", false, Loc, getContext()));
+ Operands.push_back(
+ ARM64Operand::CreateToken("]", false, Loc, getContext()));
+ } else if (Op->isReg()) {
+ // Similarly, check the destination operand for the GPR->High-lane
+ // variant.
+ unsigned OpNo = NumOperands - 2;
+ ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[OpNo]);
+ if (Op->isVectorIndexD() && Op->getVectorIndex() == 1) {
+ SMLoc Loc = Op->getStartLoc();
+ Operands[OpNo] =
+ ARM64Operand::CreateToken("[", false, Loc, getContext());
+ Operands.insert(
+ Operands.begin() + OpNo + 1,
+ ARM64Operand::CreateToken("1", false, Loc, getContext()));
+ Operands.insert(
+ Operands.begin() + OpNo + 2,
+ ARM64Operand::CreateToken("]", false, Loc, getContext()));
+ }
+ }
+ }
+
+ MCInst Inst;
+ // First try to match against the secondary set of tables containing the
+ // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2").
+ unsigned MatchResult =
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1);
+
+ // If that fails, try against the alternate table containing long-form NEON:
+ // "fadd v0.2s, v1.2s, v2.2s"
+ if (MatchResult != Match_Success)
+ MatchResult =
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0);
+
+ switch (MatchResult) {
+ case Match_Success: {
+ // Perform range checking and other semantic validations
+ SmallVector<SMLoc, 8> OperandLocs;
+ NumOperands = Operands.size();
+ for (unsigned i = 1; i < NumOperands; ++i)
+ OperandLocs.push_back(Operands[i]->getStartLoc());
+ if (validateInstruction(Inst, OperandLocs))
+ return true;
+
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst, STI);
+ return false;
+ }
+ case Match_MissingFeature:
+ case Match_MnemonicFail:
+ return showMatchError(IDLoc, MatchResult);
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+ // If the match failed on a suffix token operand, tweak the diagnostic
+ // accordingly.
+ if (((ARM64Operand *)Operands[ErrorInfo])->isToken() &&
+ ((ARM64Operand *)Operands[ErrorInfo])->isTokenSuffix())
+ MatchResult = Match_InvalidSuffix;
+
+ return showMatchError(ErrorLoc, MatchResult);
+ }
+ case Match_InvalidMemoryIndexedSImm9: {
+ // If there is not a '!' after the memory operand that failed, we really
+ // want the diagnostic for the non-pre-indexed instruction variant instead.
+ // Be careful to check for the post-indexed variant as well, which also
+ // uses this match diagnostic. Also exclude the explicitly unscaled
+ // mnemonics, as they want the unscaled diagnostic as well.
+ if (Operands.size() == ErrorInfo + 1 &&
+ !((ARM64Operand *)Operands[ErrorInfo])->isImm() &&
+ !Tok.startswith("stur") && !Tok.startswith("ldur")) {
+      // Whether we want an Indexed64 or Indexed32 diagnostic depends on
+ // the register class of the previous operand. Default to 64 in case
+ // we see something unexpected.
+ MatchResult = Match_InvalidMemoryIndexed64;
+ if (ErrorInfo) {
+ ARM64Operand *PrevOp = (ARM64Operand *)Operands[ErrorInfo - 1];
+ if (PrevOp->isReg() && ARM64MCRegisterClasses[ARM64::GPR32RegClassID]
+ .contains(PrevOp->getReg()))
+ MatchResult = Match_InvalidMemoryIndexed32;
+ }
+ }
+ SMLoc ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ return showMatchError(ErrorLoc, MatchResult);
+ }
+ case Match_InvalidMemoryIndexed32:
+ case Match_InvalidMemoryIndexed64:
+ case Match_InvalidMemoryIndexed128:
+ // If there is a '!' after the memory operand that failed, we really
+ // want the diagnostic for the pre-indexed instruction variant instead.
+ if (Operands.size() > ErrorInfo + 1 &&
+ ((ARM64Operand *)Operands[ErrorInfo + 1])->isTokenEqual("!"))
+ MatchResult = Match_InvalidMemoryIndexedSImm9;
+ // FALL THROUGH
+ case Match_InvalidMemoryIndexed8:
+ case Match_InvalidMemoryIndexed16:
+ case Match_InvalidMemoryIndexed32SImm7:
+ case Match_InvalidMemoryIndexed64SImm7:
+ case Match_InvalidMemoryIndexed128SImm7:
+ case Match_InvalidImm1_8:
+ case Match_InvalidImm1_16:
+ case Match_InvalidImm1_32:
+ case Match_InvalidImm1_64: {
+ // Any time we get here, there's nothing fancy to do. Just get the
+ // operand SMLoc and display the diagnostic.
+ SMLoc ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc();
+ // If it's a memory operand, the error is with the offset immediate,
+ // so get that location instead.
+ if (((ARM64Operand *)Operands[ErrorInfo])->isMem())
+ ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getOffsetLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ return showMatchError(ErrorLoc, MatchResult);
+ }
+ }
+
+ llvm_unreachable("Implement any new match types added!");
+ return true;
+}
+
+/// ParseDirective parses the ARM64-specific directives.
+bool ARM64AsmParser::ParseDirective(AsmToken DirectiveID) {
+ StringRef IDVal = DirectiveID.getIdentifier();
+ SMLoc Loc = DirectiveID.getLoc();
+ if (IDVal == ".hword")
+ return parseDirectiveWord(2, Loc);
+ if (IDVal == ".word")
+ return parseDirectiveWord(4, Loc);
+ if (IDVal == ".xword")
+ return parseDirectiveWord(8, Loc);
+ if (IDVal == ".tlsdesccall")
+ return parseDirectiveTLSDescCall(Loc);
+
+ return parseDirectiveLOH(IDVal, Loc);
+}
+
+/// parseDirectiveWord
+/// ::= .word [ expression (, expression)* ]
+bool ARM64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ for (;;) {
+ const MCExpr *Value;
+ if (getParser().parseExpression(Value))
+ return true;
+
+ getParser().getStreamer().EmitValue(Value, Size);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ // FIXME: Improve diagnostic.
+ if (getLexer().isNot(AsmToken::Comma))
+ return Error(L, "unexpected token in directive");
+ Parser.Lex();
+ }
+ }
+
+ Parser.Lex();
+ return false;
+}
+
+// parseDirectiveTLSDescCall:
+// ::= .tlsdesccall symbol
+bool ARM64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
+ StringRef Name;
+ if (getParser().parseIdentifier(Name))
+ return Error(L, "expected symbol after directive");
+
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+ const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext());
+ Expr = ARM64MCExpr::Create(Expr, ARM64MCExpr::VK_TLSDESC, getContext());
+
+ MCInst Inst;
+ Inst.setOpcode(ARM64::TLSDESCCALL);
+ Inst.addOperand(MCOperand::CreateExpr(Expr));
+
+ getParser().getStreamer().EmitInstruction(Inst, STI);
+ return false;
+}
+
+/// ::= .loh <lohName | lohId> label1, ..., labelN
+/// The number of arguments depends on the loh identifier.
+bool ARM64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
+ if (IDVal != MCLOHDirectiveName())
+ return true;
+ MCLOHType Kind;
+ if (getParser().getTok().isNot(AsmToken::Identifier)) {
+ if (getParser().getTok().isNot(AsmToken::Integer))
+ return TokError("expected an identifier or a number in directive");
+    // We successfully got a numeric value for the identifier.
+    // Check that it is valid.
+ int64_t Id = getParser().getTok().getIntVal();
+ Kind = (MCLOHType)Id;
+ // Check that Id does not overflow MCLOHType.
+ if (!isValidMCLOHType(Kind) || Id != Kind)
+ return TokError("invalid numeric identifier in directive");
+ } else {
+ StringRef Name = getTok().getIdentifier();
+    // We successfully parsed an identifier.
+    // Check whether it is a recognized one.
+ int Id = MCLOHNameToId(Name);
+
+ if (Id == -1)
+ return TokError("invalid identifier in directive");
+ Kind = (MCLOHType)Id;
+ }
+ // Consume the identifier.
+ Lex();
+ // Get the number of arguments of this LOH.
+ int NbArgs = MCLOHIdToNbArgs(Kind);
+
+ assert(NbArgs != -1 && "Invalid number of arguments");
+
+ SmallVector<MCSymbol *, 3> Args;
+ for (int Idx = 0; Idx < NbArgs; ++Idx) {
+ StringRef Name;
+ if (getParser().parseIdentifier(Name))
+ return TokError("expected identifier in directive");
+ Args.push_back(getContext().GetOrCreateSymbol(Name));
+
+ if (Idx + 1 == NbArgs)
+ break;
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+ Lex();
+ }
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+
+ getStreamer().EmitLOHDirective((MCLOHType)Kind, Args);
+ return false;
+}
+
+bool
+ARM64AsmParser::classifySymbolRef(const MCExpr *Expr,
+ ARM64MCExpr::VariantKind &ELFRefKind,
+ MCSymbolRefExpr::VariantKind &DarwinRefKind,
+ const MCConstantExpr *&Addend) {
+ ELFRefKind = ARM64MCExpr::VK_INVALID;
+ DarwinRefKind = MCSymbolRefExpr::VK_None;
+
+ if (const ARM64MCExpr *AE = dyn_cast<ARM64MCExpr>(Expr)) {
+ ELFRefKind = AE->getKind();
+ Expr = AE->getSubExpr();
+ }
+
+ const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr);
+ if (SE) {
+ // It's a simple symbol reference with no addend.
+ DarwinRefKind = SE->getKind();
+ Addend = 0;
+ return true;
+ }
+
+ const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr);
+ if (!BE)
+ return false;
+
+ SE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+ if (!SE)
+ return false;
+ DarwinRefKind = SE->getKind();
+
+ if (BE->getOpcode() != MCBinaryExpr::Add)
+ return false;
+
+  // See if the addend is a constant; otherwise there's more going
+  // on here than we can deal with.
+ Addend = dyn_cast<MCConstantExpr>(BE->getRHS());
+ if (!Addend)
+ return false;
+
+ // It's some symbol reference + a constant addend, but really
+ // shouldn't use both Darwin and ELF syntax.
+ return ELFRefKind == ARM64MCExpr::VK_INVALID ||
+ DarwinRefKind == MCSymbolRefExpr::VK_None;
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeARM64AsmParser() {
+ RegisterMCAsmParser<ARM64AsmParser> X(TheARM64Target);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "ARM64GenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned ARM64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
+ unsigned Kind) {
+ ARM64Operand *Op = static_cast<ARM64Operand *>(AsmOp);
+ // If the kind is a token for a literal immediate, check if our asm
+ // operand matches. This is for InstAliases which have a fixed-value
+ // immediate in the syntax.
+ int64_t ExpectedVal;
+ switch (Kind) {
+ default:
+ return Match_InvalidOperand;
+ case MCK__35_0:
+ ExpectedVal = 0;
+ break;
+ case MCK__35_1:
+ ExpectedVal = 1;
+ break;
+ case MCK__35_12:
+ ExpectedVal = 12;
+ break;
+ case MCK__35_16:
+ ExpectedVal = 16;
+ break;
+ case MCK__35_2:
+ ExpectedVal = 2;
+ break;
+ case MCK__35_24:
+ ExpectedVal = 24;
+ break;
+ case MCK__35_3:
+ ExpectedVal = 3;
+ break;
+ case MCK__35_32:
+ ExpectedVal = 32;
+ break;
+ case MCK__35_4:
+ ExpectedVal = 4;
+ break;
+ case MCK__35_48:
+ ExpectedVal = 48;
+ break;
+ case MCK__35_6:
+ ExpectedVal = 6;
+ break;
+ case MCK__35_64:
+ ExpectedVal = 64;
+ break;
+ case MCK__35_8:
+ ExpectedVal = 8;
+ break;
+ }
+ if (!Op->isImm())
+ return Match_InvalidOperand;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+ if (!CE)
+ return Match_InvalidOperand;
+ if (CE->getValue() == ExpectedVal)
+ return Match_Success;
+ return Match_InvalidOperand;
+}
diff --git a/lib/Target/ARM64/AsmParser/CMakeLists.txt b/lib/Target/ARM64/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..826158b
--- /dev/null
+++ b/lib/Target/ARM64/AsmParser/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARM64AsmParser
+ ARM64AsmParser.cpp
+ )
+
diff --git a/lib/Target/ARM64/AsmParser/LLVMBuild.txt b/lib/Target/ARM64/AsmParser/LLVMBuild.txt
new file mode 100644
index 0000000..2c8fafe
--- /dev/null
+++ b/lib/Target/ARM64/AsmParser/LLVMBuild.txt
@@ -0,0 +1,24 @@
+;===- ./lib/Target/ARM64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ARM64AsmParser
+parent = ARM64
+required_libraries = ARM64Desc ARM64Info MC MCParser Support
+add_to_library_groups = ARM64
+
diff --git a/lib/Target/ARM64/AsmParser/Makefile b/lib/Target/ARM64/AsmParser/Makefile
new file mode 100644
index 0000000..d25c47f
--- /dev/null
+++ b/lib/Target/ARM64/AsmParser/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM64/AsmParser/Makefile -----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARM64AsmParser
+
+# Hack: we need to include 'main' ARM64 target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM64/CMakeLists.txt b/lib/Target/ARM64/CMakeLists.txt
new file mode 100644
index 0000000..6de861c
--- /dev/null
+++ b/lib/Target/ARM64/CMakeLists.txt
@@ -0,0 +1,50 @@
+set(LLVM_TARGET_DEFINITIONS ARM64.td)
+
+tablegen(LLVM ARM64GenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM ARM64GenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM ARM64GenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+tablegen(LLVM ARM64GenMCPseudoLowering.inc -gen-pseudo-lowering)
+tablegen(LLVM ARM64GenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM ARM64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
+tablegen(LLVM ARM64GenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM ARM64GenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM ARM64GenFastISel.inc -gen-fast-isel)
+tablegen(LLVM ARM64GenCallingConv.inc -gen-callingconv)
+tablegen(LLVM ARM64GenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM ARM64GenDisassemblerTables.inc -gen-disassembler)
+add_public_tablegen_target(ARM64CommonTableGen)
+
+add_llvm_target(ARM64CodeGen
+ ARM64AddressTypePromotion.cpp
+ ARM64AdvSIMDScalarPass.cpp
+ ARM64AsmPrinter.cpp
+ ARM64BranchRelaxation.cpp
+ ARM64CleanupLocalDynamicTLSPass.cpp
+ ARM64CollectLOH.cpp
+ ARM64ConditionalCompares.cpp
+ ARM64DeadRegisterDefinitionsPass.cpp
+ ARM64ExpandPseudoInsts.cpp
+ ARM64FastISel.cpp
+ ARM64FrameLowering.cpp
+ ARM64ISelDAGToDAG.cpp
+ ARM64ISelLowering.cpp
+ ARM64InstrInfo.cpp
+ ARM64LoadStoreOptimizer.cpp
+ ARM64MCInstLower.cpp
+ ARM64PromoteConstant.cpp
+ ARM64RegisterInfo.cpp
+ ARM64SelectionDAGInfo.cpp
+ ARM64StorePairSuppress.cpp
+ ARM64Subtarget.cpp
+ ARM64TargetMachine.cpp
+ ARM64TargetObjectFile.cpp
+ ARM64TargetTransformInfo.cpp
+)
+
+add_dependencies(LLVMARM64CodeGen intrinsics_gen)
+
+add_subdirectory(TargetInfo)
+add_subdirectory(AsmParser)
+add_subdirectory(Disassembler)
+add_subdirectory(InstPrinter)
+add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp b/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp
new file mode 100644
index 0000000..44c501f
--- /dev/null
+++ b/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp
@@ -0,0 +1,2142 @@
+//===- ARM64Disassembler.cpp - Disassembler for ARM64 -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm64-disassembler"
+
+#include "ARM64Disassembler.h"
+#include "ARM64Subtarget.h"
+#include "MCTargetDesc/ARM64BaseInfo.h"
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+// Pull DecodeStatus and its enum values into the global namespace.
+typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeFixedPointScaleImm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeCondBranchTarget(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeRegOffsetLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeSystemCPSRInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSIMDLdStPost(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeSIMDLdStSingle(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeSIMDLdStSingleTied(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder);
+
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder);
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder);
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder);
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder);
+
+#include "ARM64GenDisassemblerTables.inc"
+#include "ARM64GenInstrInfo.inc"
+
+using namespace llvm;
+
+#define Success llvm::MCDisassembler::Success
+#define Fail llvm::MCDisassembler::Fail
+
+static MCDisassembler *createARM64Disassembler(const Target &T,
+ const MCSubtargetInfo &STI) {
+ return new ARM64Disassembler(STI);
+}
+
+DecodeStatus ARM64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ const MemoryObject &Region,
+ uint64_t Address,
+ raw_ostream &os,
+ raw_ostream &cs) const {
+ CommentStream = &cs;
+
+ uint8_t bytes[4];
+
+ Size = 0;
+ // We want to read exactly 4 bytes of data.
+ if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1)
+ return Fail;
+ Size = 4;
+
+  // Encoded as a little-endian 32-bit word in the stream.
+ uint32_t insn =
+ (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0);
+
+ // Calling the auto-generated decoder function.
+ DecodeStatus result =
+ decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
+ if (!result)
+ return Fail;
+
+ return Success;
+}
+
+static MCSymbolRefExpr::VariantKind
+getVariant(uint64_t LLVMDisassembler_VariantKind) {
+ switch (LLVMDisassembler_VariantKind) {
+ case LLVMDisassembler_VariantKind_None:
+ return MCSymbolRefExpr::VK_None;
+ case LLVMDisassembler_VariantKind_ARM64_PAGE:
+ return MCSymbolRefExpr::VK_PAGE;
+ case LLVMDisassembler_VariantKind_ARM64_PAGEOFF:
+ return MCSymbolRefExpr::VK_PAGEOFF;
+ case LLVMDisassembler_VariantKind_ARM64_GOTPAGE:
+ return MCSymbolRefExpr::VK_GOTPAGE;
+ case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF:
+ return MCSymbolRefExpr::VK_GOTPAGEOFF;
+ case LLVMDisassembler_VariantKind_ARM64_TLVP:
+ case LLVMDisassembler_VariantKind_ARM64_TLVOFF:
+ default:
+ assert(0 && "bad LLVMDisassembler_VariantKind");
+ return MCSymbolRefExpr::VK_None;
+ }
+}
+
+/// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic
+/// operand in place of the immediate Value in the MCInst. The immediate
+/// Value has not had any PC adjustment made by the caller. If the instruction
+/// is a branch that adds the PC to the immediate Value then isBranch is
+/// Success, else Fail. If the getOpInfo() function was set as part of the
+/// setupForSymbolicDisassembly() call then that function is called to get any
+/// symbolic information at the Address for this instruction. If that returns
+/// non-zero then the symbolic information it returns is used to create an
+/// MCExpr and that is added as an operand to the MCInst. If getOpInfo()
+/// returns zero and isBranch is Success then a symbol lookup for
+/// Address + Value is done and, if a symbol is found, an MCExpr is created
+/// with it, else an MCExpr with Address + Value is created. If getOpInfo()
+/// returns zero and isBranch is Fail then the Opcode of the MCInst is
+/// tested, and for ADRP and other instructions that help to load pointers,
+/// a symbol lookup is done to see if it returns a specific reference type
+/// to add to the comment stream. This function returns Success if it adds
+/// an operand to the MCInst and Fail otherwise.
+bool ARM64Disassembler::tryAddingSymbolicOperand(uint64_t Address, int Value,
+ bool isBranch,
+ uint64_t InstSize, MCInst &MI,
+ uint32_t insn) const {
+ LLVMOpInfoCallback getOpInfo = getLLVMOpInfoCallback();
+
+ struct LLVMOpInfo1 SymbolicOp;
+ memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+ SymbolicOp.Value = Value;
+ void *DisInfo = getDisInfoBlock();
+ uint64_t ReferenceType;
+ const char *ReferenceName;
+ const char *Name;
+ LLVMSymbolLookupCallback SymbolLookUp = getLLVMSymbolLookupCallback();
+ if (!getOpInfo ||
+ !getOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
+ if (isBranch) {
+ if (SymbolLookUp) {
+ ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+ Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+ &ReferenceName);
+ if (Name) {
+ SymbolicOp.AddSymbol.Name = Name;
+ SymbolicOp.AddSymbol.Present = Success;
+ SymbolicOp.Value = 0;
+ } else {
+ SymbolicOp.Value = Address + Value;
+ }
+ if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+ (*CommentStream) << "symbol stub for: " << ReferenceName;
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Message)
+ (*CommentStream) << "Objc message: " << ReferenceName;
+ } else {
+ return false;
+ }
+ } else if (MI.getOpcode() == ARM64::ADRP) {
+ if (SymbolLookUp) {
+ ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP;
+ Name = SymbolLookUp(DisInfo, insn, &ReferenceType, Address,
+ &ReferenceName);
+ (*CommentStream) << format("0x%llx",
+ 0xfffffffffffff000LL & (Address + Value));
+ } else {
+ return false;
+ }
+ } else if (MI.getOpcode() == ARM64::ADDXri ||
+ MI.getOpcode() == ARM64::LDRXui ||
+ MI.getOpcode() == ARM64::LDRXl || MI.getOpcode() == ARM64::ADR) {
+ if (SymbolLookUp) {
+ if (MI.getOpcode() == ARM64::ADDXri)
+ ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri;
+ else if (MI.getOpcode() == ARM64::LDRXui)
+ ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui;
+ if (MI.getOpcode() == ARM64::LDRXl) {
+ ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl;
+ Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+ &ReferenceName);
+ } else if (MI.getOpcode() == ARM64::ADR) {
+ ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR;
+ Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+ &ReferenceName);
+ } else {
+ Name = SymbolLookUp(DisInfo, insn, &ReferenceType, Address,
+ &ReferenceName);
+ }
+ if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
+ (*CommentStream) << "literal pool symbol address: " << ReferenceName;
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+ (*CommentStream) << "literal pool for: \"" << ReferenceName << "\"";
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
+ (*CommentStream) << "Objc cfstring ref: @\"" << ReferenceName << "\"";
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Message)
+ (*CommentStream) << "Objc message: " << ReferenceName;
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref)
+ (*CommentStream) << "Objc message ref: " << ReferenceName;
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref)
+ (*CommentStream) << "Objc selector ref: " << ReferenceName;
+ else if (ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref)
+ (*CommentStream) << "Objc class ref: " << ReferenceName;
+        // For these instructions, the SymbolLookUp() above is just to get the
+        // ReferenceType and ReferenceName. Make sure not to fall through and
+        // build an MCExpr; the disassembly of these instructions' immediate
+        // values is left to the InstPrinter.
+ return false;
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ MCContext *Ctx = getMCContext();
+ const MCExpr *Add = NULL;
+ if (SymbolicOp.AddSymbol.Present) {
+ if (SymbolicOp.AddSymbol.Name) {
+ StringRef Name(SymbolicOp.AddSymbol.Name);
+ MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+ MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind);
+ if (Variant != MCSymbolRefExpr::VK_None)
+ Add = MCSymbolRefExpr::Create(Sym, Variant, *Ctx);
+ else
+ Add = MCSymbolRefExpr::Create(Sym, *Ctx);
+ } else {
+ Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx);
+ }
+ }
+
+ const MCExpr *Sub = NULL;
+ if (SymbolicOp.SubtractSymbol.Present) {
+ if (SymbolicOp.SubtractSymbol.Name) {
+ StringRef Name(SymbolicOp.SubtractSymbol.Name);
+ MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+ Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
+ } else {
+ Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx);
+ }
+ }
+
+ const MCExpr *Off = NULL;
+ if (SymbolicOp.Value != 0)
+ Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
+
+ const MCExpr *Expr;
+ if (Sub) {
+ const MCExpr *LHS;
+ if (Add)
+ LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
+ else
+ LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
+ if (Off != 0)
+ Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
+ else
+ Expr = LHS;
+ } else if (Add) {
+ if (Off != 0)
+ Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
+ else
+ Expr = Add;
+ } else {
+ if (Off != 0)
+ Expr = Off;
+ else
+ Expr = MCConstantExpr::Create(0, *Ctx);
+ }
+
+ MI.addOperand(MCOperand::CreateExpr(Expr));
+
+ return true;
+}
+
+extern "C" void LLVMInitializeARM64Disassembler() {
+ TargetRegistry::RegisterMCDisassembler(TheARM64Target,
+ createARM64Disassembler);
+}
+
+static const unsigned FPR128DecoderTable[] = {
+ ARM64::Q0, ARM64::Q1, ARM64::Q2, ARM64::Q3, ARM64::Q4, ARM64::Q5,
+ ARM64::Q6, ARM64::Q7, ARM64::Q8, ARM64::Q9, ARM64::Q10, ARM64::Q11,
+ ARM64::Q12, ARM64::Q13, ARM64::Q14, ARM64::Q15, ARM64::Q16, ARM64::Q17,
+ ARM64::Q18, ARM64::Q19, ARM64::Q20, ARM64::Q21, ARM64::Q22, ARM64::Q23,
+ ARM64::Q24, ARM64::Q25, ARM64::Q26, ARM64::Q27, ARM64::Q28, ARM64::Q29,
+ ARM64::Q30, ARM64::Q31
+};
+
+static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = FPR128DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 15)
+ return Fail;
+ return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder);
+}
+
+static const unsigned FPR64DecoderTable[] = {
+ ARM64::D0, ARM64::D1, ARM64::D2, ARM64::D3, ARM64::D4, ARM64::D5,
+ ARM64::D6, ARM64::D7, ARM64::D8, ARM64::D9, ARM64::D10, ARM64::D11,
+ ARM64::D12, ARM64::D13, ARM64::D14, ARM64::D15, ARM64::D16, ARM64::D17,
+ ARM64::D18, ARM64::D19, ARM64::D20, ARM64::D21, ARM64::D22, ARM64::D23,
+ ARM64::D24, ARM64::D25, ARM64::D26, ARM64::D27, ARM64::D28, ARM64::D29,
+ ARM64::D30, ARM64::D31
+};
+
+static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = FPR64DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned FPR32DecoderTable[] = {
+ ARM64::S0, ARM64::S1, ARM64::S2, ARM64::S3, ARM64::S4, ARM64::S5,
+ ARM64::S6, ARM64::S7, ARM64::S8, ARM64::S9, ARM64::S10, ARM64::S11,
+ ARM64::S12, ARM64::S13, ARM64::S14, ARM64::S15, ARM64::S16, ARM64::S17,
+ ARM64::S18, ARM64::S19, ARM64::S20, ARM64::S21, ARM64::S22, ARM64::S23,
+ ARM64::S24, ARM64::S25, ARM64::S26, ARM64::S27, ARM64::S28, ARM64::S29,
+ ARM64::S30, ARM64::S31
+};
+
+static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = FPR32DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned FPR16DecoderTable[] = {
+ ARM64::H0, ARM64::H1, ARM64::H2, ARM64::H3, ARM64::H4, ARM64::H5,
+ ARM64::H6, ARM64::H7, ARM64::H8, ARM64::H9, ARM64::H10, ARM64::H11,
+ ARM64::H12, ARM64::H13, ARM64::H14, ARM64::H15, ARM64::H16, ARM64::H17,
+ ARM64::H18, ARM64::H19, ARM64::H20, ARM64::H21, ARM64::H22, ARM64::H23,
+ ARM64::H24, ARM64::H25, ARM64::H26, ARM64::H27, ARM64::H28, ARM64::H29,
+ ARM64::H30, ARM64::H31
+};
+
+static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = FPR16DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned FPR8DecoderTable[] = {
+ ARM64::B0, ARM64::B1, ARM64::B2, ARM64::B3, ARM64::B4, ARM64::B5,
+ ARM64::B6, ARM64::B7, ARM64::B8, ARM64::B9, ARM64::B10, ARM64::B11,
+ ARM64::B12, ARM64::B13, ARM64::B14, ARM64::B15, ARM64::B16, ARM64::B17,
+ ARM64::B18, ARM64::B19, ARM64::B20, ARM64::B21, ARM64::B22, ARM64::B23,
+ ARM64::B24, ARM64::B25, ARM64::B26, ARM64::B27, ARM64::B28, ARM64::B29,
+ ARM64::B30, ARM64::B31
+};
+
+static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = FPR8DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned GPR64DecoderTable[] = {
+ ARM64::X0, ARM64::X1, ARM64::X2, ARM64::X3, ARM64::X4, ARM64::X5,
+ ARM64::X6, ARM64::X7, ARM64::X8, ARM64::X9, ARM64::X10, ARM64::X11,
+ ARM64::X12, ARM64::X13, ARM64::X14, ARM64::X15, ARM64::X16, ARM64::X17,
+ ARM64::X18, ARM64::X19, ARM64::X20, ARM64::X21, ARM64::X22, ARM64::X23,
+ ARM64::X24, ARM64::X25, ARM64::X26, ARM64::X27, ARM64::X28, ARM64::FP,
+ ARM64::LR, ARM64::XZR
+};
+
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = GPR64DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = GPR64DecoderTable[RegNo];
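+  // In the GPR64sp register class, encoding 31 selects SP rather than XZR.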
+ if (Register == ARM64::XZR)
+ Register = ARM64::SP;
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned GPR32DecoderTable[] = {
+ ARM64::W0, ARM64::W1, ARM64::W2, ARM64::W3, ARM64::W4, ARM64::W5,
+ ARM64::W6, ARM64::W7, ARM64::W8, ARM64::W9, ARM64::W10, ARM64::W11,
+ ARM64::W12, ARM64::W13, ARM64::W14, ARM64::W15, ARM64::W16, ARM64::W17,
+ ARM64::W18, ARM64::W19, ARM64::W20, ARM64::W21, ARM64::W22, ARM64::W23,
+ ARM64::W24, ARM64::W25, ARM64::W26, ARM64::W27, ARM64::W28, ARM64::W29,
+ ARM64::W30, ARM64::WZR
+};
+
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = GPR32DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = GPR32DecoderTable[RegNo];
+ if (Register == ARM64::WZR)
+ Register = ARM64::WSP;
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned VectorDecoderTable[] = {
+ ARM64::Q0, ARM64::Q1, ARM64::Q2, ARM64::Q3, ARM64::Q4, ARM64::Q5,
+ ARM64::Q6, ARM64::Q7, ARM64::Q8, ARM64::Q9, ARM64::Q10, ARM64::Q11,
+ ARM64::Q12, ARM64::Q13, ARM64::Q14, ARM64::Q15, ARM64::Q16, ARM64::Q17,
+ ARM64::Q18, ARM64::Q19, ARM64::Q20, ARM64::Q21, ARM64::Q22, ARM64::Q23,
+ ARM64::Q24, ARM64::Q25, ARM64::Q26, ARM64::Q27, ARM64::Q28, ARM64::Q29,
+ ARM64::Q30, ARM64::Q31
+};
+
+static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+
+ unsigned Register = VectorDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned QQDecoderTable[] = {
+ ARM64::Q0_Q1, ARM64::Q1_Q2, ARM64::Q2_Q3, ARM64::Q3_Q4,
+ ARM64::Q4_Q5, ARM64::Q5_Q6, ARM64::Q6_Q7, ARM64::Q7_Q8,
+ ARM64::Q8_Q9, ARM64::Q9_Q10, ARM64::Q10_Q11, ARM64::Q11_Q12,
+ ARM64::Q12_Q13, ARM64::Q13_Q14, ARM64::Q14_Q15, ARM64::Q15_Q16,
+ ARM64::Q16_Q17, ARM64::Q17_Q18, ARM64::Q18_Q19, ARM64::Q19_Q20,
+ ARM64::Q20_Q21, ARM64::Q21_Q22, ARM64::Q22_Q23, ARM64::Q23_Q24,
+ ARM64::Q24_Q25, ARM64::Q25_Q26, ARM64::Q26_Q27, ARM64::Q27_Q28,
+ ARM64::Q28_Q29, ARM64::Q29_Q30, ARM64::Q30_Q31, ARM64::Q31_Q0
+};
+
+static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr, const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = QQDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned QQQDecoderTable[] = {
+ ARM64::Q0_Q1_Q2, ARM64::Q1_Q2_Q3, ARM64::Q2_Q3_Q4,
+ ARM64::Q3_Q4_Q5, ARM64::Q4_Q5_Q6, ARM64::Q5_Q6_Q7,
+ ARM64::Q6_Q7_Q8, ARM64::Q7_Q8_Q9, ARM64::Q8_Q9_Q10,
+ ARM64::Q9_Q10_Q11, ARM64::Q10_Q11_Q12, ARM64::Q11_Q12_Q13,
+ ARM64::Q12_Q13_Q14, ARM64::Q13_Q14_Q15, ARM64::Q14_Q15_Q16,
+ ARM64::Q15_Q16_Q17, ARM64::Q16_Q17_Q18, ARM64::Q17_Q18_Q19,
+ ARM64::Q18_Q19_Q20, ARM64::Q19_Q20_Q21, ARM64::Q20_Q21_Q22,
+ ARM64::Q21_Q22_Q23, ARM64::Q22_Q23_Q24, ARM64::Q23_Q24_Q25,
+ ARM64::Q24_Q25_Q26, ARM64::Q25_Q26_Q27, ARM64::Q26_Q27_Q28,
+ ARM64::Q27_Q28_Q29, ARM64::Q28_Q29_Q30, ARM64::Q29_Q30_Q31,
+ ARM64::Q30_Q31_Q0, ARM64::Q31_Q0_Q1
+};
+
+static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr, const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = QQQDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned QQQQDecoderTable[] = {
+ ARM64::Q0_Q1_Q2_Q3, ARM64::Q1_Q2_Q3_Q4, ARM64::Q2_Q3_Q4_Q5,
+ ARM64::Q3_Q4_Q5_Q6, ARM64::Q4_Q5_Q6_Q7, ARM64::Q5_Q6_Q7_Q8,
+ ARM64::Q6_Q7_Q8_Q9, ARM64::Q7_Q8_Q9_Q10, ARM64::Q8_Q9_Q10_Q11,
+ ARM64::Q9_Q10_Q11_Q12, ARM64::Q10_Q11_Q12_Q13, ARM64::Q11_Q12_Q13_Q14,
+ ARM64::Q12_Q13_Q14_Q15, ARM64::Q13_Q14_Q15_Q16, ARM64::Q14_Q15_Q16_Q17,
+ ARM64::Q15_Q16_Q17_Q18, ARM64::Q16_Q17_Q18_Q19, ARM64::Q17_Q18_Q19_Q20,
+ ARM64::Q18_Q19_Q20_Q21, ARM64::Q19_Q20_Q21_Q22, ARM64::Q20_Q21_Q22_Q23,
+ ARM64::Q21_Q22_Q23_Q24, ARM64::Q22_Q23_Q24_Q25, ARM64::Q23_Q24_Q25_Q26,
+ ARM64::Q24_Q25_Q26_Q27, ARM64::Q25_Q26_Q27_Q28, ARM64::Q26_Q27_Q28_Q29,
+ ARM64::Q27_Q28_Q29_Q30, ARM64::Q28_Q29_Q30_Q31, ARM64::Q29_Q30_Q31_Q0,
+ ARM64::Q30_Q31_Q0_Q1, ARM64::Q31_Q0_Q1_Q2
+};
+
+static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = QQQQDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned DDDecoderTable[] = {
+ ARM64::D0_D1, ARM64::D1_D2, ARM64::D2_D3, ARM64::D3_D4,
+ ARM64::D4_D5, ARM64::D5_D6, ARM64::D6_D7, ARM64::D7_D8,
+ ARM64::D8_D9, ARM64::D9_D10, ARM64::D10_D11, ARM64::D11_D12,
+ ARM64::D12_D13, ARM64::D13_D14, ARM64::D14_D15, ARM64::D15_D16,
+ ARM64::D16_D17, ARM64::D17_D18, ARM64::D18_D19, ARM64::D19_D20,
+ ARM64::D20_D21, ARM64::D21_D22, ARM64::D22_D23, ARM64::D23_D24,
+ ARM64::D24_D25, ARM64::D25_D26, ARM64::D26_D27, ARM64::D27_D28,
+ ARM64::D28_D29, ARM64::D29_D30, ARM64::D30_D31, ARM64::D31_D0
+};
+
+static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr, const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = DDDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned DDDDecoderTable[] = {
+ ARM64::D0_D1_D2, ARM64::D1_D2_D3, ARM64::D2_D3_D4,
+ ARM64::D3_D4_D5, ARM64::D4_D5_D6, ARM64::D5_D6_D7,
+ ARM64::D6_D7_D8, ARM64::D7_D8_D9, ARM64::D8_D9_D10,
+ ARM64::D9_D10_D11, ARM64::D10_D11_D12, ARM64::D11_D12_D13,
+ ARM64::D12_D13_D14, ARM64::D13_D14_D15, ARM64::D14_D15_D16,
+ ARM64::D15_D16_D17, ARM64::D16_D17_D18, ARM64::D17_D18_D19,
+ ARM64::D18_D19_D20, ARM64::D19_D20_D21, ARM64::D20_D21_D22,
+ ARM64::D21_D22_D23, ARM64::D22_D23_D24, ARM64::D23_D24_D25,
+ ARM64::D24_D25_D26, ARM64::D25_D26_D27, ARM64::D26_D27_D28,
+ ARM64::D27_D28_D29, ARM64::D28_D29_D30, ARM64::D29_D30_D31,
+ ARM64::D30_D31_D0, ARM64::D31_D0_D1
+};
+
+static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr, const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = DDDDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static const unsigned DDDDDecoderTable[] = {
+ ARM64::D0_D1_D2_D3, ARM64::D1_D2_D3_D4, ARM64::D2_D3_D4_D5,
+ ARM64::D3_D4_D5_D6, ARM64::D4_D5_D6_D7, ARM64::D5_D6_D7_D8,
+ ARM64::D6_D7_D8_D9, ARM64::D7_D8_D9_D10, ARM64::D8_D9_D10_D11,
+ ARM64::D9_D10_D11_D12, ARM64::D10_D11_D12_D13, ARM64::D11_D12_D13_D14,
+ ARM64::D12_D13_D14_D15, ARM64::D13_D14_D15_D16, ARM64::D14_D15_D16_D17,
+ ARM64::D15_D16_D17_D18, ARM64::D16_D17_D18_D19, ARM64::D17_D18_D19_D20,
+ ARM64::D18_D19_D20_D21, ARM64::D19_D20_D21_D22, ARM64::D20_D21_D22_D23,
+ ARM64::D21_D22_D23_D24, ARM64::D22_D23_D24_D25, ARM64::D23_D24_D25_D26,
+ ARM64::D24_D25_D26_D27, ARM64::D25_D26_D27_D28, ARM64::D26_D27_D28_D29,
+ ARM64::D27_D28_D29_D30, ARM64::D28_D29_D30_D31, ARM64::D29_D30_D31_D0,
+ ARM64::D30_D31_D0_D1, ARM64::D31_D0_D1_D2
+};
+
+static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Addr,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return Fail;
+ unsigned Register = DDDDDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return Success;
+}
+
+static DecodeStatus DecodeFixedPointScaleImm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(64 - Imm));
+ return Success;
+}
+
+static DecodeStatus DecodeCondBranchTarget(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ int64_t ImmVal = Imm;
+ const ARM64Disassembler *Dis =
+ static_cast<const ARM64Disassembler *>(Decoder);
+
+ // Sign-extend 19-bit immediate.
+ if (ImmVal & (1 << (19 - 1)))
+ ImmVal |= ~((1LL << 19) - 1);
+
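+  // The offset is in units of the 4-byte instruction size.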
+ if (!Dis->tryAddingSymbolicOperand(Addr, ImmVal << 2,
+ Inst.getOpcode() != ARM64::LDRXl, 4, Inst))
+ Inst.addOperand(MCOperand::CreateImm(ImmVal));
+ return Success;
+}
+
+static DecodeStatus DecodeSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Address,
+ const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(Imm | 0x8000));
+ return Success;
+}
+
+static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm,
+ unsigned Add) {
+ Inst.addOperand(MCOperand::CreateImm(Add - Imm));
+ return Success;
+}
+
+static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm,
+ unsigned Add) {
+ Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1)));
+ return Success;
+}
+
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm, 64);
+}
+
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm | 0x20, 64);
+}
+
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm, 32);
+}
+
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm | 0x10, 32);
+}
+
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm, 16);
+}
+
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm | 0x8, 16);
+}
+
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftRImm(Inst, Imm, 8);
+}
+
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftLImm(Inst, Imm, 64);
+}
+
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftLImm(Inst, Imm, 32);
+}
+
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftLImm(Inst, Imm, 16);
+}
+
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ return DecodeVecShiftLImm(Inst, Imm, 8);
+}
+
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Rm = fieldFromInstruction(insn, 16, 5);
+ unsigned shiftHi = fieldFromInstruction(insn, 22, 2);
+ unsigned shiftLo = fieldFromInstruction(insn, 10, 6);
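+  // The shift operand packs the 2-bit shift type with the 6-bit shift amount.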
+ unsigned shift = (shiftHi << 6) | shiftLo;
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case ARM64::ANDWrs:
+ case ARM64::ANDSWrs:
+ case ARM64::BICWrs:
+ case ARM64::BICSWrs:
+ case ARM64::ORRWrs:
+ case ARM64::ORNWrs:
+ case ARM64::EORWrs:
+ case ARM64::EONWrs:
+ case ARM64::ADDWrs:
+ case ARM64::ADDSWrs:
+ case ARM64::SUBWrs:
+ case ARM64::SUBSWrs: {
+ DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ }
+ case ARM64::ANDXrs:
+ case ARM64::ANDSXrs:
+ case ARM64::BICXrs:
+ case ARM64::BICSXrs:
+ case ARM64::ORRXrs:
+ case ARM64::ORNXrs:
+ case ARM64::EORXrs:
+ case ARM64::EONXrs:
+ case ARM64::ADDXrs:
+ case ARM64::ADDSXrs:
+ case ARM64::SUBXrs:
+ case ARM64::SUBSXrs:
+ DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ }
+
+ Inst.addOperand(MCOperand::CreateImm(shift));
+ return Success;
+}
+
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned imm = fieldFromInstruction(insn, 5, 16);
+ unsigned shift = fieldFromInstruction(insn, 21, 2);
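+  // The 2-bit hw field selects the shift amount in multiples of 16 bits.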
+ shift <<= 4;
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case ARM64::MOVZWi:
+ case ARM64::MOVNWi:
+ case ARM64::MOVKWi:
+ DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+ break;
+ case ARM64::MOVZXi:
+ case ARM64::MOVNXi:
+ case ARM64::MOVKXi:
+ DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ break;
+ }
+
+ if (Inst.getOpcode() == ARM64::MOVKWi || Inst.getOpcode() == ARM64::MOVKXi)
+ Inst.addOperand(Inst.getOperand(0));
+
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ Inst.addOperand(MCOperand::CreateImm(shift));
+ return Success;
+}
+
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rt = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned offset = fieldFromInstruction(insn, 10, 12);
+ const ARM64Disassembler *Dis =
+ static_cast<const ARM64Disassembler *>(Decoder);
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case ARM64::PRFMui:
+ // Rt is an immediate in prefetch.
+ Inst.addOperand(MCOperand::CreateImm(Rt));
+ break;
+ case ARM64::STRBBui:
+ case ARM64::LDRBBui:
+ case ARM64::LDRSBWui:
+ case ARM64::STRHHui:
+ case ARM64::LDRHHui:
+ case ARM64::LDRSHWui:
+ case ARM64::STRWui:
+ case ARM64::LDRWui:
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRSBXui:
+ case ARM64::LDRSHXui:
+ case ARM64::LDRSWui:
+ case ARM64::STRXui:
+ case ARM64::LDRXui:
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRQui:
+ case ARM64::STRQui:
+ DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRDui:
+ case ARM64::STRDui:
+ DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRSui:
+ case ARM64::STRSui:
+ DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRHui:
+ case ARM64::STRHui:
+ DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRBui:
+ case ARM64::STRBui:
+ DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ }
+
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ if (!Dis->tryAddingSymbolicOperand(Addr, offset, Fail, 4, Inst, insn))
+ Inst.addOperand(MCOperand::CreateImm(offset));
+ return Success;
+}
+
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rt = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ int64_t offset = fieldFromInstruction(insn, 12, 9);
+
+  // offset is a 9-bit signed immediate, so sign-extend it to
+  // fill the 64-bit unsigned value.
+ if (offset & (1 << (9 - 1)))
+ offset |= ~((1LL << 9) - 1);
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case ARM64::PRFUMi:
+ // Rt is an immediate in prefetch.
+ Inst.addOperand(MCOperand::CreateImm(Rt));
+ break;
+ case ARM64::STURBBi:
+ case ARM64::LDURBBi:
+ case ARM64::LDURSBWi:
+ case ARM64::STURHHi:
+ case ARM64::LDURHHi:
+ case ARM64::LDURSHWi:
+ case ARM64::STURWi:
+ case ARM64::LDURWi:
+ case ARM64::LDTRSBWi:
+ case ARM64::LDTRSHWi:
+ case ARM64::STTRWi:
+ case ARM64::LDTRWi:
+ case ARM64::STTRHi:
+ case ARM64::LDTRHi:
+ case ARM64::LDTRBi:
+ case ARM64::STTRBi:
+ case ARM64::LDRSBWpre:
+ case ARM64::LDRSHWpre:
+ case ARM64::STRBBpre:
+ case ARM64::LDRBBpre:
+ case ARM64::STRHHpre:
+ case ARM64::LDRHHpre:
+ case ARM64::STRWpre:
+ case ARM64::LDRWpre:
+ case ARM64::LDRSBWpost:
+ case ARM64::LDRSHWpost:
+ case ARM64::STRBBpost:
+ case ARM64::LDRBBpost:
+ case ARM64::STRHHpost:
+ case ARM64::LDRHHpost:
+ case ARM64::STRWpost:
+ case ARM64::LDRWpost:
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDURSBXi:
+ case ARM64::LDURSHXi:
+ case ARM64::LDURSWi:
+ case ARM64::STURXi:
+ case ARM64::LDURXi:
+ case ARM64::LDTRSBXi:
+ case ARM64::LDTRSHXi:
+ case ARM64::LDTRSWi:
+ case ARM64::STTRXi:
+ case ARM64::LDTRXi:
+ case ARM64::LDRSBXpre:
+ case ARM64::LDRSHXpre:
+ case ARM64::STRXpre:
+ case ARM64::LDRSWpre:
+ case ARM64::LDRXpre:
+ case ARM64::LDRSBXpost:
+ case ARM64::LDRSHXpost:
+ case ARM64::STRXpost:
+ case ARM64::LDRSWpost:
+ case ARM64::LDRXpost:
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDURQi:
+ case ARM64::STURQi:
+ case ARM64::LDRQpre:
+ case ARM64::STRQpre:
+ case ARM64::LDRQpost:
+ case ARM64::STRQpost:
+ DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDURDi:
+ case ARM64::STURDi:
+ case ARM64::LDRDpre:
+ case ARM64::STRDpre:
+ case ARM64::LDRDpost:
+ case ARM64::STRDpost:
+ DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDURSi:
+ case ARM64::STURSi:
+ case ARM64::LDRSpre:
+ case ARM64::STRSpre:
+ case ARM64::LDRSpost:
+ case ARM64::STRSpost:
+ DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDURHi:
+ case ARM64::STURHi:
+ case ARM64::LDRHpre:
+ case ARM64::STRHpre:
+ case ARM64::LDRHpost:
+ case ARM64::STRHpost:
+ DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDURBi:
+ case ARM64::STURBi:
+ case ARM64::LDRBpre:
+ case ARM64::STRBpre:
+ case ARM64::LDRBpost:
+ case ARM64::STRBpost:
+ DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ }
+
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ Inst.addOperand(MCOperand::CreateImm(offset));
+ return Success;
+}
+
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rt = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+ unsigned Rs = fieldFromInstruction(insn, 16, 5);
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case ARM64::STLXRW:
+ case ARM64::STLXRB:
+ case ARM64::STLXRH:
+ case ARM64::STXRW:
+ case ARM64::STXRB:
+ case ARM64::STXRH:
+ DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+ // FALLTHROUGH
+ case ARM64::LDARW:
+ case ARM64::LDARB:
+ case ARM64::LDARH:
+ case ARM64::LDAXRW:
+ case ARM64::LDAXRB:
+ case ARM64::LDAXRH:
+ case ARM64::LDXRW:
+ case ARM64::LDXRB:
+ case ARM64::LDXRH:
+ case ARM64::STLRW:
+ case ARM64::STLRB:
+ case ARM64::STLRH:
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::STLXRX:
+ case ARM64::STXRX:
+ DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+ // FALLTHROUGH
+ case ARM64::LDARX:
+ case ARM64::LDAXRX:
+ case ARM64::LDXRX:
+ case ARM64::STLRX:
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::STLXPW:
+ case ARM64::STXPW:
+ DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+ // FALLTHROUGH
+ case ARM64::LDAXPW:
+ case ARM64::LDXPW:
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ case ARM64::STLXPX:
+ case ARM64::STXPX:
+ DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+ // FALLTHROUGH
+ case ARM64::LDAXPX:
+ case ARM64::LDXPX:
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ }
+
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ return Success;
+}
+
+static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rt = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+ int64_t offset = fieldFromInstruction(insn, 15, 7);
+
+  // offset is a 7-bit signed immediate, so sign-extend it to
+  // fill the 64-bit unsigned value.
+ if (offset & (1 << (7 - 1)))
+ offset |= ~((1LL << 7) - 1);
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case ARM64::LDNPXi:
+ case ARM64::STNPXi:
+ case ARM64::LDPXpost:
+ case ARM64::STPXpost:
+ case ARM64::LDPSWpost:
+ case ARM64::LDPXi:
+ case ARM64::STPXi:
+ case ARM64::LDPSWi:
+ case ARM64::LDPXpre:
+ case ARM64::STPXpre:
+ case ARM64::LDPSWpre:
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ case ARM64::LDNPWi:
+ case ARM64::STNPWi:
+ case ARM64::LDPWpost:
+ case ARM64::STPWpost:
+ case ARM64::LDPWi:
+ case ARM64::STPWi:
+ case ARM64::LDPWpre:
+ case ARM64::STPWpre:
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ case ARM64::LDNPQi:
+ case ARM64::STNPQi:
+ case ARM64::LDPQpost:
+ case ARM64::STPQpost:
+ case ARM64::LDPQi:
+ case ARM64::STPQi:
+ case ARM64::LDPQpre:
+ case ARM64::STPQpre:
+ DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ case ARM64::LDNPDi:
+ case ARM64::STNPDi:
+ case ARM64::LDPDpost:
+ case ARM64::STPDpost:
+ case ARM64::LDPDi:
+ case ARM64::STPDi:
+ case ARM64::LDPDpre:
+ case ARM64::STPDpre:
+ DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ case ARM64::LDNPSi:
+ case ARM64::STNPSi:
+ case ARM64::LDPSpost:
+ case ARM64::STPSpost:
+ case ARM64::LDPSi:
+ case ARM64::STPSi:
+ case ARM64::LDPSpre:
+ case ARM64::STPSpre:
+ DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+ break;
+ }
+
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ Inst.addOperand(MCOperand::CreateImm(offset));
+ return Success;
+}
+
+static DecodeStatus DecodeRegOffsetLdStInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rt = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Rm = fieldFromInstruction(insn, 16, 5);
+ unsigned extendHi = fieldFromInstruction(insn, 13, 3);
+ unsigned extendLo = fieldFromInstruction(insn, 12, 1);
+ unsigned extend = 0;
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case ARM64::LDRSWro:
+ extend = (extendHi << 1) | extendLo;
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRXro:
+ case ARM64::STRXro:
+ extend = (extendHi << 1) | extendLo;
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRWro:
+ case ARM64::STRWro:
+ extend = (extendHi << 1) | extendLo;
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRQro:
+ case ARM64::STRQro:
+ extend = (extendHi << 1) | extendLo;
+ DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRDro:
+ case ARM64::STRDro:
+ extend = (extendHi << 1) | extendLo;
+ DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRSro:
+ case ARM64::STRSro:
+ extend = (extendHi << 1) | extendLo;
+ DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRHro:
+ extend = (extendHi << 1) | extendLo;
+ DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRBro:
+ extend = (extendHi << 1) | extendLo;
+ DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRBBro:
+ case ARM64::STRBBro:
+ case ARM64::LDRSBWro:
+ extend = (extendHi << 1) | extendLo;
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRHHro:
+ case ARM64::STRHHro:
+ case ARM64::LDRSHWro:
+ extend = (extendHi << 1) | extendLo;
+ DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRSHXro:
+ extend = (extendHi << 1) | extendLo;
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LDRSBXro:
+ extend = (extendHi << 1) | extendLo;
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::PRFMro:
+ extend = (extendHi << 1) | extendLo;
+ Inst.addOperand(MCOperand::CreateImm(Rt));
+ }
+
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+
+  // The offset register is always decoded as a 64-bit GPR, regardless of
+  // the extend type encoded in extendHi.
+  DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+
+ Inst.addOperand(MCOperand::CreateImm(extend));
+ return Success;
+}
+
+static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Rm = fieldFromInstruction(insn, 16, 5);
+ unsigned extend = fieldFromInstruction(insn, 10, 6);
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case ARM64::ADDWrx:
+ case ARM64::SUBWrx:
+ DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ case ARM64::ADDSWrx:
+ case ARM64::SUBSWrx:
+ DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ case ARM64::ADDXrx:
+ case ARM64::SUBXrx:
+ DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ case ARM64::ADDSXrx:
+ case ARM64::SUBSXrx:
+ DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ case ARM64::ADDXrx64:
+ case ARM64::ADDSXrx64:
+ case ARM64::SUBXrx64:
+ case ARM64::SUBSXrx64:
+ DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ }
+
+ Inst.addOperand(MCOperand::CreateImm(extend));
+ return Success;
+}
+
+static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Datasize = fieldFromInstruction(insn, 31, 1);
+ unsigned imm;
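+  // The logical immediate is N:immr:imms for the 64-bit form and immr:imms
+  // for the 32-bit form; values that cannot be decoded are rejected.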
+
+ if (Datasize) {
+ DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+ imm = fieldFromInstruction(insn, 10, 13);
+ if (!ARM64_AM::isValidDecodeLogicalImmediate(imm, 64))
+ return Fail;
+ } else {
+ DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
+ imm = fieldFromInstruction(insn, 10, 12);
+ if (!ARM64_AM::isValidDecodeLogicalImmediate(imm, 32))
+ return Fail;
+ }
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ return Success;
+}
+
+static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned cmode = fieldFromInstruction(insn, 12, 4);
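+  // The 8-bit immediate is split across the encoding as abc (bits <18:16>)
+  // and defgh (bits <9:5>).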
+ unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
+ imm |= fieldFromInstruction(insn, 5, 5);
+
+ if (Inst.getOpcode() == ARM64::MOVID)
+ DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ else
+ DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ switch (Inst.getOpcode()) {
+ default:
+ break;
+ case ARM64::MOVIv4i16:
+ case ARM64::MOVIv8i16:
+ case ARM64::MVNIv4i16:
+ case ARM64::MVNIv8i16:
+ case ARM64::MOVIv2i32:
+ case ARM64::MOVIv4i32:
+ case ARM64::MVNIv2i32:
+ case ARM64::MVNIv4i32:
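+    // cmode<2:1> encodes the LSL shift amount in multiples of 8 bits.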
+ Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2));
+ break;
+ case ARM64::MOVIv2s_msl:
+ case ARM64::MOVIv4s_msl:
+ case ARM64::MVNIv2s_msl:
+ case ARM64::MVNIv4s_msl:
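+    // cmode<0> selects the MSL amount: 0x110 encodes "msl #16", 0x108 "msl #8".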
+ Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108));
+ break;
+ }
+
+ return Success;
+}
+
+static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned cmode = fieldFromInstruction(insn, 12, 4);
+ unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
+ imm |= fieldFromInstruction(insn, 5, 5);
+
+  // The register is tied (written and read), so it is added twice.
+ DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2));
+
+ return Success;
+}
+
+static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr, const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
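+  // The 21-bit immediate is split as immhi (bits <23:5>) and immlo
+  // (bits <30:29>).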
+ int64_t imm = fieldFromInstruction(insn, 5, 19) << 2;
+ imm |= fieldFromInstruction(insn, 29, 2);
+ const ARM64Disassembler *Dis =
+ static_cast<const ARM64Disassembler *>(Decoder);
+
+ // Sign-extend the 21-bit immediate.
+ if (imm & (1 << (21 - 1)))
+ imm |= ~((1LL << 21) - 1);
+
+ DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ if (!Dis->tryAddingSymbolicOperand(Addr, imm, Fail, 4, Inst, insn))
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ return Success;
+}
+
+static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr, const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ unsigned Imm = fieldFromInstruction(insn, 10, 14);
+ unsigned S = fieldFromInstruction(insn, 29, 1);
+ unsigned Datasize = fieldFromInstruction(insn, 31, 1);
+
+ unsigned ShifterVal = (Imm >> 12) & 3;
+ unsigned ImmVal = Imm & 0xFFF;
+ const ARM64Disassembler *Dis =
+ static_cast<const ARM64Disassembler *>(Decoder);
+
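+  // Only "lsl #0" and "lsl #12" are valid shifts for add/sub immediate.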
+ if (ShifterVal != 0 && ShifterVal != 1)
+ return Fail;
+
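+  // When the flags are not set (S == 0), register 31 in Rd is SP rather
+  // than the zero register.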
+ if (Datasize) {
+ if (Rd == 31 && !S)
+ DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+ else
+ DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ } else {
+ if (Rd == 31 && !S)
+ DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+ else
+ DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+ }
+
+ if (!Dis->tryAddingSymbolicOperand(Addr, ImmVal, Fail, 4, Inst, insn))
+ Inst.addOperand(MCOperand::CreateImm(ImmVal));
+ Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal));
+ return Success;
+}
+
+static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ int64_t imm = fieldFromInstruction(insn, 0, 26);
+ const ARM64Disassembler *Dis =
+ static_cast<const ARM64Disassembler *>(Decoder);
+
+ // Sign-extend the 26-bit immediate.
+ if (imm & (1 << (26 - 1)))
+ imm |= ~((1LL << 26) - 1);
+
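+  // The encoded offset is in words; the symbolic-operand hook wants the
+  // byte offset, while the MCInst operand keeps the word offset.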
+ if (!Dis->tryAddingSymbolicOperand(Addr, imm << 2, true, 4, Inst))
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ return Success;
+}
+
+static DecodeStatus DecodeSystemCPSRInstruction(llvm::MCInst &Inst,
+ uint32_t insn, uint64_t Addr,
+ const void *Decoder) {
+ uint64_t op1 = fieldFromInstruction(insn, 16, 3);
+ uint64_t op2 = fieldFromInstruction(insn, 5, 3);
+ uint64_t crm = fieldFromInstruction(insn, 8, 4);
+
+ Inst.addOperand(MCOperand::CreateImm((op1 << 3) | op2));
+ Inst.addOperand(MCOperand::CreateImm(crm));
+
+ return Success;
+}
+
+static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr, const void *Decoder) {
+ uint64_t Rt = fieldFromInstruction(insn, 0, 5);
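+  // The bit number to test is b5:b40, i.e. bit 31 of the instruction
+  // supplies bit 5 of the bit number.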
+ uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5;
+ bit |= fieldFromInstruction(insn, 19, 5);
+ int64_t dst = fieldFromInstruction(insn, 5, 14);
+ const ARM64Disassembler *Dis =
+ static_cast<const ARM64Disassembler *>(Decoder);
+
+ // Sign-extend 14-bit immediate.
+ if (dst & (1 << (14 - 1)))
+ dst |= ~((1LL << 14) - 1);
+
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ Inst.addOperand(MCOperand::CreateImm(bit));
+ if (!Dis->tryAddingSymbolicOperand(Addr, dst << 2, true, 4, Inst))
+ Inst.addOperand(MCOperand::CreateImm(dst));
+
+ return Success;
+}
+
+static DecodeStatus DecodeSIMDLdStPost(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr, const void *Decoder) {
+ uint64_t Rd = fieldFromInstruction(insn, 0, 5);
+ uint64_t Rn = fieldFromInstruction(insn, 5, 5);
+ uint64_t Rm = fieldFromInstruction(insn, 16, 5);
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case ARM64::ST1Onev8b_POST:
+ case ARM64::ST1Onev4h_POST:
+ case ARM64::ST1Onev2s_POST:
+ case ARM64::ST1Onev1d_POST:
+ case ARM64::LD1Onev8b_POST:
+ case ARM64::LD1Onev4h_POST:
+ case ARM64::LD1Onev2s_POST:
+ case ARM64::LD1Onev1d_POST:
+ DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder);
+ break;
+ case ARM64::ST1Onev16b_POST:
+ case ARM64::ST1Onev8h_POST:
+ case ARM64::ST1Onev4s_POST:
+ case ARM64::ST1Onev2d_POST:
+ case ARM64::LD1Onev16b_POST:
+ case ARM64::LD1Onev8h_POST:
+ case ARM64::LD1Onev4s_POST:
+ case ARM64::LD1Onev2d_POST:
+ DecodeFPR128RegisterClass(Inst, Rd, Addr, Decoder);
+ break;
+ case ARM64::ST1Twov8b_POST:
+ case ARM64::ST1Twov4h_POST:
+ case ARM64::ST1Twov2s_POST:
+ case ARM64::ST1Twov1d_POST:
+ case ARM64::ST2Twov8b_POST:
+ case ARM64::ST2Twov4h_POST:
+ case ARM64::ST2Twov2s_POST:
+ case ARM64::LD1Twov8b_POST:
+ case ARM64::LD1Twov4h_POST:
+ case ARM64::LD1Twov2s_POST:
+ case ARM64::LD1Twov1d_POST:
+ case ARM64::LD2Twov8b_POST:
+ case ARM64::LD2Twov4h_POST:
+ case ARM64::LD2Twov2s_POST:
+ DecodeDDRegisterClass(Inst, Rd, Addr, Decoder);
+ break;
+ case ARM64::ST1Threev8b_POST:
+ case ARM64::ST1Threev4h_POST:
+ case ARM64::ST1Threev2s_POST:
+ case ARM64::ST1Threev1d_POST:
+ case ARM64::ST3Threev8b_POST:
+ case ARM64::ST3Threev4h_POST:
+ case ARM64::ST3Threev2s_POST:
+ case ARM64::LD1Threev8b_POST:
+ case ARM64::LD1Threev4h_POST:
+ case ARM64::LD1Threev2s_POST:
+ case ARM64::LD1Threev1d_POST:
+ case ARM64::LD3Threev8b_POST:
+ case ARM64::LD3Threev4h_POST:
+ case ARM64::LD3Threev2s_POST:
+ DecodeDDDRegisterClass(Inst, Rd, Addr, Decoder);
+ break;
+ case ARM64::ST1Fourv8b_POST:
+ case ARM64::ST1Fourv4h_POST:
+ case ARM64::ST1Fourv2s_POST:
+ case ARM64::ST1Fourv1d_POST:
+ case ARM64::ST4Fourv8b_POST:
+ case ARM64::ST4Fourv4h_POST:
+ case ARM64::ST4Fourv2s_POST:
+ case ARM64::LD1Fourv8b_POST:
+ case ARM64::LD1Fourv4h_POST:
+ case ARM64::LD1Fourv2s_POST:
+ case ARM64::LD1Fourv1d_POST:
+ case ARM64::LD4Fourv8b_POST:
+ case ARM64::LD4Fourv4h_POST:
+ case ARM64::LD4Fourv2s_POST:
+ DecodeDDDDRegisterClass(Inst, Rd, Addr, Decoder);
+ break;
+ case ARM64::ST1Twov16b_POST:
+ case ARM64::ST1Twov8h_POST:
+ case ARM64::ST1Twov4s_POST:
+ case ARM64::ST1Twov2d_POST:
+ case ARM64::ST2Twov16b_POST:
+ case ARM64::ST2Twov8h_POST:
+ case ARM64::ST2Twov4s_POST:
+ case ARM64::ST2Twov2d_POST:
+ case ARM64::LD1Twov16b_POST:
+ case ARM64::LD1Twov8h_POST:
+ case ARM64::LD1Twov4s_POST:
+ case ARM64::LD1Twov2d_POST:
+ case ARM64::LD2Twov16b_POST:
+ case ARM64::LD2Twov8h_POST:
+ case ARM64::LD2Twov4s_POST:
+ case ARM64::LD2Twov2d_POST:
+ DecodeQQRegisterClass(Inst, Rd, Addr, Decoder);
+ break;
+ case ARM64::ST1Threev16b_POST:
+ case ARM64::ST1Threev8h_POST:
+ case ARM64::ST1Threev4s_POST:
+ case ARM64::ST1Threev2d_POST:
+ case ARM64::ST3Threev16b_POST:
+ case ARM64::ST3Threev8h_POST:
+ case ARM64::ST3Threev4s_POST:
+ case ARM64::ST3Threev2d_POST:
+ case ARM64::LD1Threev16b_POST:
+ case ARM64::LD1Threev8h_POST:
+ case ARM64::LD1Threev4s_POST:
+ case ARM64::LD1Threev2d_POST:
+ case ARM64::LD3Threev16b_POST:
+ case ARM64::LD3Threev8h_POST:
+ case ARM64::LD3Threev4s_POST:
+ case ARM64::LD3Threev2d_POST:
+ DecodeQQQRegisterClass(Inst, Rd, Addr, Decoder);
+ break;
+ case ARM64::ST1Fourv16b_POST:
+ case ARM64::ST1Fourv8h_POST:
+ case ARM64::ST1Fourv4s_POST:
+ case ARM64::ST1Fourv2d_POST:
+ case ARM64::ST4Fourv16b_POST:
+ case ARM64::ST4Fourv8h_POST:
+ case ARM64::ST4Fourv4s_POST:
+ case ARM64::ST4Fourv2d_POST:
+ case ARM64::LD1Fourv16b_POST:
+ case ARM64::LD1Fourv8h_POST:
+ case ARM64::LD1Fourv4s_POST:
+ case ARM64::LD1Fourv2d_POST:
+ case ARM64::LD4Fourv16b_POST:
+ case ARM64::LD4Fourv8h_POST:
+ case ARM64::LD4Fourv4s_POST:
+ case ARM64::LD4Fourv2d_POST:
+ DecodeQQQQRegisterClass(Inst, Rd, Addr, Decoder);
+ break;
+ }
+
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+ return Success;
+}
+
+static DecodeStatus DecodeSIMDLdStSingle(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr, const void *Decoder) {
+ uint64_t Rt = fieldFromInstruction(insn, 0, 5);
+ uint64_t Rn = fieldFromInstruction(insn, 5, 5);
+ uint64_t Rm = fieldFromInstruction(insn, 16, 5);
+ uint64_t size = fieldFromInstruction(insn, 10, 2);
+ uint64_t S = fieldFromInstruction(insn, 12, 1);
+ uint64_t Q = fieldFromInstruction(insn, 30, 1);
+ uint64_t index = 0;
+
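+  // Reassemble the lane index from the Q, S and size fields; how many of
+  // them contribute depends on the element size.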
+ switch (Inst.getOpcode()) {
+ case ARM64::ST1i8:
+ case ARM64::ST1i8_POST:
+ case ARM64::ST2i8:
+ case ARM64::ST2i8_POST:
+ case ARM64::ST3i8_POST:
+ case ARM64::ST3i8:
+ case ARM64::ST4i8_POST:
+ case ARM64::ST4i8:
+ index = (Q << 3) | (S << 2) | size;
+ break;
+ case ARM64::ST1i16:
+ case ARM64::ST1i16_POST:
+ case ARM64::ST2i16:
+ case ARM64::ST2i16_POST:
+ case ARM64::ST3i16_POST:
+ case ARM64::ST3i16:
+ case ARM64::ST4i16_POST:
+ case ARM64::ST4i16:
+ index = (Q << 2) | (S << 1) | (size >> 1);
+ break;
+ case ARM64::ST1i32:
+ case ARM64::ST1i32_POST:
+ case ARM64::ST2i32:
+ case ARM64::ST2i32_POST:
+ case ARM64::ST3i32_POST:
+ case ARM64::ST3i32:
+ case ARM64::ST4i32_POST:
+ case ARM64::ST4i32:
+ index = (Q << 1) | S;
+ break;
+ case ARM64::ST1i64:
+ case ARM64::ST1i64_POST:
+ case ARM64::ST2i64:
+ case ARM64::ST2i64_POST:
+ case ARM64::ST3i64_POST:
+ case ARM64::ST3i64:
+ case ARM64::ST4i64_POST:
+ case ARM64::ST4i64:
+ index = Q;
+ break;
+ }
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case ARM64::LD1Rv8b:
+ case ARM64::LD1Rv8b_POST:
+ case ARM64::LD1Rv4h:
+ case ARM64::LD1Rv4h_POST:
+ case ARM64::LD1Rv2s:
+ case ARM64::LD1Rv2s_POST:
+ case ARM64::LD1Rv1d:
+ case ARM64::LD1Rv1d_POST:
+ DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LD1Rv16b:
+ case ARM64::LD1Rv16b_POST:
+ case ARM64::LD1Rv8h:
+ case ARM64::LD1Rv8h_POST:
+ case ARM64::LD1Rv4s:
+ case ARM64::LD1Rv4s_POST:
+ case ARM64::LD1Rv2d:
+ case ARM64::LD1Rv2d_POST:
+ case ARM64::ST1i8:
+ case ARM64::ST1i8_POST:
+ case ARM64::ST1i16:
+ case ARM64::ST1i16_POST:
+ case ARM64::ST1i32:
+ case ARM64::ST1i32_POST:
+ case ARM64::ST1i64:
+ case ARM64::ST1i64_POST:
+ DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LD2Rv16b:
+ case ARM64::LD2Rv16b_POST:
+ case ARM64::LD2Rv8h:
+ case ARM64::LD2Rv8h_POST:
+ case ARM64::LD2Rv4s:
+ case ARM64::LD2Rv4s_POST:
+ case ARM64::LD2Rv2d:
+ case ARM64::LD2Rv2d_POST:
+ case ARM64::ST2i8:
+ case ARM64::ST2i8_POST:
+ case ARM64::ST2i16:
+ case ARM64::ST2i16_POST:
+ case ARM64::ST2i32:
+ case ARM64::ST2i32_POST:
+ case ARM64::ST2i64:
+ case ARM64::ST2i64_POST:
+ DecodeQQRegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LD2Rv8b:
+ case ARM64::LD2Rv8b_POST:
+ case ARM64::LD2Rv4h:
+ case ARM64::LD2Rv4h_POST:
+ case ARM64::LD2Rv2s:
+ case ARM64::LD2Rv2s_POST:
+ case ARM64::LD2Rv1d:
+ case ARM64::LD2Rv1d_POST:
+ DecodeDDRegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LD3Rv8b:
+ case ARM64::LD3Rv8b_POST:
+ case ARM64::LD3Rv4h:
+ case ARM64::LD3Rv4h_POST:
+ case ARM64::LD3Rv2s:
+ case ARM64::LD3Rv2s_POST:
+ case ARM64::LD3Rv1d:
+ case ARM64::LD3Rv1d_POST:
+ DecodeDDDRegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LD3Rv16b:
+ case ARM64::LD3Rv16b_POST:
+ case ARM64::LD3Rv8h:
+ case ARM64::LD3Rv8h_POST:
+ case ARM64::LD3Rv4s:
+ case ARM64::LD3Rv4s_POST:
+ case ARM64::LD3Rv2d:
+ case ARM64::LD3Rv2d_POST:
+ case ARM64::ST3i8:
+ case ARM64::ST3i8_POST:
+ case ARM64::ST3i16:
+ case ARM64::ST3i16_POST:
+ case ARM64::ST3i32:
+ case ARM64::ST3i32_POST:
+ case ARM64::ST3i64:
+ case ARM64::ST3i64_POST:
+ DecodeQQQRegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LD4Rv8b:
+ case ARM64::LD4Rv8b_POST:
+ case ARM64::LD4Rv4h:
+ case ARM64::LD4Rv4h_POST:
+ case ARM64::LD4Rv2s:
+ case ARM64::LD4Rv2s_POST:
+ case ARM64::LD4Rv1d:
+ case ARM64::LD4Rv1d_POST:
+ DecodeDDDDRegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LD4Rv16b:
+ case ARM64::LD4Rv16b_POST:
+ case ARM64::LD4Rv8h:
+ case ARM64::LD4Rv8h_POST:
+ case ARM64::LD4Rv4s:
+ case ARM64::LD4Rv4s_POST:
+ case ARM64::LD4Rv2d:
+ case ARM64::LD4Rv2d_POST:
+ case ARM64::ST4i8:
+ case ARM64::ST4i8_POST:
+ case ARM64::ST4i16:
+ case ARM64::ST4i16_POST:
+ case ARM64::ST4i32:
+ case ARM64::ST4i32_POST:
+ case ARM64::ST4i64:
+ case ARM64::ST4i64_POST:
+ DecodeQQQQRegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ }
+
+ switch (Inst.getOpcode()) {
+ case ARM64::LD1Rv8b:
+ case ARM64::LD1Rv8b_POST:
+ case ARM64::LD1Rv16b:
+ case ARM64::LD1Rv16b_POST:
+ case ARM64::LD1Rv4h:
+ case ARM64::LD1Rv4h_POST:
+ case ARM64::LD1Rv8h:
+ case ARM64::LD1Rv8h_POST:
+ case ARM64::LD1Rv4s:
+ case ARM64::LD1Rv4s_POST:
+ case ARM64::LD1Rv2s:
+ case ARM64::LD1Rv2s_POST:
+ case ARM64::LD1Rv1d:
+ case ARM64::LD1Rv1d_POST:
+ case ARM64::LD1Rv2d:
+ case ARM64::LD1Rv2d_POST:
+ case ARM64::LD2Rv8b:
+ case ARM64::LD2Rv8b_POST:
+ case ARM64::LD2Rv16b:
+ case ARM64::LD2Rv16b_POST:
+ case ARM64::LD2Rv4h:
+ case ARM64::LD2Rv4h_POST:
+ case ARM64::LD2Rv8h:
+ case ARM64::LD2Rv8h_POST:
+ case ARM64::LD2Rv2s:
+ case ARM64::LD2Rv2s_POST:
+ case ARM64::LD2Rv4s:
+ case ARM64::LD2Rv4s_POST:
+ case ARM64::LD2Rv2d:
+ case ARM64::LD2Rv2d_POST:
+ case ARM64::LD2Rv1d:
+ case ARM64::LD2Rv1d_POST:
+ case ARM64::LD3Rv8b:
+ case ARM64::LD3Rv8b_POST:
+ case ARM64::LD3Rv16b:
+ case ARM64::LD3Rv16b_POST:
+ case ARM64::LD3Rv4h:
+ case ARM64::LD3Rv4h_POST:
+ case ARM64::LD3Rv8h:
+ case ARM64::LD3Rv8h_POST:
+ case ARM64::LD3Rv2s:
+ case ARM64::LD3Rv2s_POST:
+ case ARM64::LD3Rv4s:
+ case ARM64::LD3Rv4s_POST:
+ case ARM64::LD3Rv2d:
+ case ARM64::LD3Rv2d_POST:
+ case ARM64::LD3Rv1d:
+ case ARM64::LD3Rv1d_POST:
+ case ARM64::LD4Rv8b:
+ case ARM64::LD4Rv8b_POST:
+ case ARM64::LD4Rv16b:
+ case ARM64::LD4Rv16b_POST:
+ case ARM64::LD4Rv4h:
+ case ARM64::LD4Rv4h_POST:
+ case ARM64::LD4Rv8h:
+ case ARM64::LD4Rv8h_POST:
+ case ARM64::LD4Rv2s:
+ case ARM64::LD4Rv2s_POST:
+ case ARM64::LD4Rv4s:
+ case ARM64::LD4Rv4s_POST:
+ case ARM64::LD4Rv2d:
+ case ARM64::LD4Rv2d_POST:
+ case ARM64::LD4Rv1d:
+ case ARM64::LD4Rv1d_POST:
+ break;
+ default:
+ Inst.addOperand(MCOperand::CreateImm(index));
+ }
+
+ DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+
+ switch (Inst.getOpcode()) {
+ case ARM64::ST1i8_POST:
+ case ARM64::ST1i16_POST:
+ case ARM64::ST1i32_POST:
+ case ARM64::ST1i64_POST:
+ case ARM64::LD1Rv8b_POST:
+ case ARM64::LD1Rv16b_POST:
+ case ARM64::LD1Rv4h_POST:
+ case ARM64::LD1Rv8h_POST:
+ case ARM64::LD1Rv2s_POST:
+ case ARM64::LD1Rv4s_POST:
+ case ARM64::LD1Rv1d_POST:
+ case ARM64::LD1Rv2d_POST:
+ case ARM64::ST2i8_POST:
+ case ARM64::ST2i16_POST:
+ case ARM64::ST2i32_POST:
+ case ARM64::ST2i64_POST:
+ case ARM64::LD2Rv8b_POST:
+ case ARM64::LD2Rv16b_POST:
+ case ARM64::LD2Rv4h_POST:
+ case ARM64::LD2Rv8h_POST:
+ case ARM64::LD2Rv2s_POST:
+ case ARM64::LD2Rv4s_POST:
+ case ARM64::LD2Rv2d_POST:
+ case ARM64::LD2Rv1d_POST:
+ case ARM64::ST3i8_POST:
+ case ARM64::ST3i16_POST:
+ case ARM64::ST3i32_POST:
+ case ARM64::ST3i64_POST:
+ case ARM64::LD3Rv8b_POST:
+ case ARM64::LD3Rv16b_POST:
+ case ARM64::LD3Rv4h_POST:
+ case ARM64::LD3Rv8h_POST:
+ case ARM64::LD3Rv2s_POST:
+ case ARM64::LD3Rv4s_POST:
+ case ARM64::LD3Rv2d_POST:
+ case ARM64::LD3Rv1d_POST:
+ case ARM64::ST4i8_POST:
+ case ARM64::ST4i16_POST:
+ case ARM64::ST4i32_POST:
+ case ARM64::ST4i64_POST:
+ case ARM64::LD4Rv8b_POST:
+ case ARM64::LD4Rv16b_POST:
+ case ARM64::LD4Rv4h_POST:
+ case ARM64::LD4Rv8h_POST:
+ case ARM64::LD4Rv2s_POST:
+ case ARM64::LD4Rv4s_POST:
+ case ARM64::LD4Rv2d_POST:
+ case ARM64::LD4Rv1d_POST:
+ DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ }
+ return Success;
+}
+
+static DecodeStatus DecodeSIMDLdStSingleTied(llvm::MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ uint64_t Rt = fieldFromInstruction(insn, 0, 5);
+ uint64_t Rn = fieldFromInstruction(insn, 5, 5);
+ uint64_t Rm = fieldFromInstruction(insn, 16, 5);
+ uint64_t size = fieldFromInstruction(insn, 10, 2);
+ uint64_t S = fieldFromInstruction(insn, 12, 1);
+ uint64_t Q = fieldFromInstruction(insn, 30, 1);
+ uint64_t index = 0;
+
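+  // As above, reassemble the lane index from the Q, S and size fields.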
+ switch (Inst.getOpcode()) {
+ case ARM64::LD1i8:
+ case ARM64::LD1i8_POST:
+ case ARM64::LD2i8:
+ case ARM64::LD2i8_POST:
+ case ARM64::LD3i8_POST:
+ case ARM64::LD3i8:
+ case ARM64::LD4i8_POST:
+ case ARM64::LD4i8:
+ index = (Q << 3) | (S << 2) | size;
+ break;
+ case ARM64::LD1i16:
+ case ARM64::LD1i16_POST:
+ case ARM64::LD2i16:
+ case ARM64::LD2i16_POST:
+ case ARM64::LD3i16_POST:
+ case ARM64::LD3i16:
+ case ARM64::LD4i16_POST:
+ case ARM64::LD4i16:
+ index = (Q << 2) | (S << 1) | (size >> 1);
+ break;
+ case ARM64::LD1i32:
+ case ARM64::LD1i32_POST:
+ case ARM64::LD2i32:
+ case ARM64::LD2i32_POST:
+ case ARM64::LD3i32_POST:
+ case ARM64::LD3i32:
+ case ARM64::LD4i32_POST:
+ case ARM64::LD4i32:
+ index = (Q << 1) | S;
+ break;
+ case ARM64::LD1i64:
+ case ARM64::LD1i64_POST:
+ case ARM64::LD2i64:
+ case ARM64::LD2i64_POST:
+ case ARM64::LD3i64_POST:
+ case ARM64::LD3i64:
+ case ARM64::LD4i64_POST:
+ case ARM64::LD4i64:
+ index = Q;
+ break;
+ }
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case ARM64::LD1i8:
+ case ARM64::LD1i8_POST:
+ case ARM64::LD1i16:
+ case ARM64::LD1i16_POST:
+ case ARM64::LD1i32:
+ case ARM64::LD1i32_POST:
+ case ARM64::LD1i64:
+ case ARM64::LD1i64_POST:
+ DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LD2i8:
+ case ARM64::LD2i8_POST:
+ case ARM64::LD2i16:
+ case ARM64::LD2i16_POST:
+ case ARM64::LD2i32:
+ case ARM64::LD2i32_POST:
+ case ARM64::LD2i64:
+ case ARM64::LD2i64_POST:
+ DecodeQQRegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeQQRegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LD3i8:
+ case ARM64::LD3i8_POST:
+ case ARM64::LD3i16:
+ case ARM64::LD3i16_POST:
+ case ARM64::LD3i32:
+ case ARM64::LD3i32_POST:
+ case ARM64::LD3i64:
+ case ARM64::LD3i64_POST:
+ DecodeQQQRegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeQQQRegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ case ARM64::LD4i8:
+ case ARM64::LD4i8_POST:
+ case ARM64::LD4i16:
+ case ARM64::LD4i16_POST:
+ case ARM64::LD4i32:
+ case ARM64::LD4i32_POST:
+ case ARM64::LD4i64:
+ case ARM64::LD4i64_POST:
+ DecodeQQQQRegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeQQQQRegisterClass(Inst, Rt, Addr, Decoder);
+ break;
+ }
+
+ Inst.addOperand(MCOperand::CreateImm(index));
+ DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+
+ switch (Inst.getOpcode()) {
+ case ARM64::LD1i8_POST:
+ case ARM64::LD1i16_POST:
+ case ARM64::LD1i32_POST:
+ case ARM64::LD1i64_POST:
+ case ARM64::LD2i8_POST:
+ case ARM64::LD2i16_POST:
+ case ARM64::LD2i32_POST:
+ case ARM64::LD2i64_POST:
+ case ARM64::LD3i8_POST:
+ case ARM64::LD3i16_POST:
+ case ARM64::LD3i32_POST:
+ case ARM64::LD3i64_POST:
+ case ARM64::LD4i8_POST:
+ case ARM64::LD4i16_POST:
+ case ARM64::LD4i32_POST:
+ case ARM64::LD4i64_POST:
+ DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+ break;
+ }
+ return Success;
+}
diff --git a/lib/Target/ARM64/Disassembler/ARM64Disassembler.h b/lib/Target/ARM64/Disassembler/ARM64Disassembler.h
new file mode 100644
index 0000000..35efc8d
--- /dev/null
+++ b/lib/Target/ARM64/Disassembler/ARM64Disassembler.h
@@ -0,0 +1,54 @@
+//===- ARM64Disassembler.h - Disassembler for ARM64 -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM64 machine-code disassembler.
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64DISASSEMBLER_H
+#define ARM64DISASSEMBLER_H
+
+#include "llvm/MC/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class MemoryObject;
+class raw_ostream;
+
+class ARM64Disassembler : public MCDisassembler {
+public:
+ ARM64Disassembler(const MCSubtargetInfo &STI) : MCDisassembler(STI) {}
+
+ ~ARM64Disassembler() {}
+
+ /// getInstruction - See MCDisassembler.
+ MCDisassembler::DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const;
+
+  /// tryAddingSymbolicOperand - tries to add a symbolic operand in place of
+  /// the immediate Value in the MCInst. The immediate Value has not had any
+  /// PC adjustment made by the caller. If the instruction adds the PC to the
+  /// immediate Value then InstsAddsAddressToValue is true, else false. If the
+  /// getOpInfo() function was set as part of the setupForSymbolicDisassembly()
+  /// call then that function is called to get any symbolic information at the
+  /// Address for this instruction. If that returns non-zero then the symbolic
+  /// information it returns is used to create an MCExpr and that is added as
+  /// an operand to the MCInst. This function returns true if it adds an
+  /// operand to the MCInst and false otherwise.
+ bool tryAddingSymbolicOperand(uint64_t Address, int Value,
+ bool InstsAddsAddressToValue, uint64_t InstSize,
+ MCInst &MI, uint32_t insn = 0) const;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/ARM64/Disassembler/CMakeLists.txt b/lib/Target/ARM64/Disassembler/CMakeLists.txt
new file mode 100644
index 0000000..ad998c2
--- /dev/null
+++ b/lib/Target/ARM64/Disassembler/CMakeLists.txt
@@ -0,0 +1,13 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARM64Disassembler
+ ARM64Disassembler.cpp
+ )
+# workaround for hanging compilation on MSVC8, 9 and 10
+#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
+#set_property(
+#	SOURCE ARM64Disassembler.cpp
+# PROPERTY COMPILE_FLAGS "/Od"
+# )
+#endif()
+add_dependencies(LLVMARM64Disassembler ARM64CommonTableGen)
diff --git a/lib/Target/ARM64/Disassembler/LLVMBuild.txt b/lib/Target/ARM64/Disassembler/LLVMBuild.txt
new file mode 100644
index 0000000..5935ee6
--- /dev/null
+++ b/lib/Target/ARM64/Disassembler/LLVMBuild.txt
@@ -0,0 +1,24 @@
+;===- ./lib/Target/ARM64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ARM64Disassembler
+parent = ARM64
+required_libraries = ARM64Desc ARM64Info MC Support
+add_to_library_groups = ARM64
+
diff --git a/lib/Target/ARM64/Disassembler/Makefile b/lib/Target/ARM64/Disassembler/Makefile
new file mode 100644
index 0000000..479d00c
--- /dev/null
+++ b/lib/Target/ARM64/Disassembler/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/ARM64/Disassembler/Makefile --------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARM64Disassembler
+
+# Hack: we need to include 'main' arm target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp
new file mode 100644
index 0000000..bb90707
--- /dev/null
+++ b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp
@@ -0,0 +1,1428 @@
+//===-- ARM64InstPrinter.cpp - Convert ARM64 MCInst to assembly syntax ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM64 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "ARM64InstPrinter.h"
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "MCTargetDesc/ARM64BaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "ARM64GenAsmWriter.inc"
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "ARM64GenAsmWriter1.inc"
+
+ARM64InstPrinter::ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI)
+ : MCInstPrinter(MAI, MII, MRI) {
+ // Initialize the set of available features.
+ setAvailableFeatures(STI.getFeatureBits());
+}
+
+ARM64AppleInstPrinter::ARM64AppleInstPrinter(const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI)
+ : ARM64InstPrinter(MAI, MII, MRI, STI) {}
+
+void ARM64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ // This is for .cfi directives.
+ OS << getRegisterName(RegNo);
+}
+
+void ARM64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot) {
+  // Check for special encodings and print the canonical alias instead.
+
+ unsigned Opcode = MI->getOpcode();
+
+ if (Opcode == ARM64::SYS || Opcode == ARM64::SYSxt)
+ if (printSysAlias(MI, O)) {
+ printAnnotation(O, Annot);
+ return;
+ }
+
+ // TBZ/TBNZ should print the register operand as a Wreg if the bit
+ // number is < 32.
+ if ((Opcode == ARM64::TBNZ || Opcode == ARM64::TBZ) &&
+ MI->getOperand(1).getImm() < 32) {
+ MCInst newMI = *MI;
+ unsigned Reg = MI->getOperand(0).getReg();
+ newMI.getOperand(0).setReg(getWRegFromXReg(Reg));
+ printInstruction(&newMI, O);
+ printAnnotation(O, Annot);
+ return;
+ }
+
+ // SBFM/UBFM should print to a nicer aliased form if possible.
+ if (Opcode == ARM64::SBFMXri || Opcode == ARM64::SBFMWri ||
+ Opcode == ARM64::UBFMXri || Opcode == ARM64::UBFMWri) {
+ const MCOperand &Op0 = MI->getOperand(0);
+ const MCOperand &Op1 = MI->getOperand(1);
+ const MCOperand &Op2 = MI->getOperand(2);
+ const MCOperand &Op3 = MI->getOperand(3);
+
+ if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) {
+ bool IsSigned = (Opcode == ARM64::SBFMXri || Opcode == ARM64::SBFMWri);
+ const char *AsmMnemonic = 0;
+
+ switch (Op3.getImm()) {
+ default:
+ break;
+ case 7:
+ AsmMnemonic = IsSigned ? "sxtb" : "uxtb";
+ break;
+ case 15:
+ AsmMnemonic = IsSigned ? "sxth" : "uxth";
+ break;
+ case 31:
+ AsmMnemonic = IsSigned ? "sxtw" : "uxtw";
+ break;
+ }
+
+ if (AsmMnemonic) {
+ O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
+ << ", " << getRegisterName(Op1.getReg());
+ printAnnotation(O, Annot);
+ return;
+ }
+ }
+
+    // All immediate shifts are aliases, implemented using the Bitfield
+    // instruction. In all cases the immediate shift amount must be in
+    // the range 0 to (reg.size - 1).
+ if (Op2.isImm() && Op3.isImm()) {
+ const char *AsmMnemonic = 0;
+ int shift = 0;
+ int64_t immr = Op2.getImm();
+ int64_t imms = Op3.getImm();
+ if (Opcode == ARM64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) {
+ AsmMnemonic = "lsl";
+ shift = 31 - imms;
+ } else if (Opcode == ARM64::UBFMXri && imms != 0x3f &&
+ ((imms + 1 == immr))) {
+ AsmMnemonic = "lsl";
+ shift = 63 - imms;
+ } else if (Opcode == ARM64::UBFMWri && imms == 0x1f) {
+ AsmMnemonic = "lsr";
+ shift = immr;
+ } else if (Opcode == ARM64::UBFMXri && imms == 0x3f) {
+ AsmMnemonic = "lsr";
+ shift = immr;
+ } else if (Opcode == ARM64::SBFMWri && imms == 0x1f) {
+ AsmMnemonic = "asr";
+ shift = immr;
+ } else if (Opcode == ARM64::SBFMXri && imms == 0x3f) {
+ AsmMnemonic = "asr";
+ shift = immr;
+ }
+ if (AsmMnemonic) {
+ O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
+ << ", " << getRegisterName(Op1.getReg()) << ", #" << shift;
+ printAnnotation(O, Annot);
+ return;
+ }
+ }
+ }
+
+ // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift
+ // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be
+ // printed.
+ if ((Opcode == ARM64::MOVZXi || Opcode == ARM64::MOVZWi ||
+ Opcode == ARM64::MOVNXi || Opcode == ARM64::MOVNWi) &&
+ MI->getOperand(1).isExpr()) {
+ if (Opcode == ARM64::MOVZXi || Opcode == ARM64::MOVZWi)
+ O << "\tmovz\t";
+ else
+ O << "\tmovn\t";
+
+ O << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+ << *MI->getOperand(1).getExpr();
+ return;
+ }
+
+ if ((Opcode == ARM64::MOVKXi || Opcode == ARM64::MOVKWi) &&
+ MI->getOperand(2).isExpr()) {
+ O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+ << *MI->getOperand(2).getExpr();
+ return;
+ }
+
+ // ANDS WZR, Wn, #imm ==> TST Wn, #imm
+ // ANDS XZR, Xn, #imm ==> TST Xn, #imm
+ if (Opcode == ARM64::ANDSWri && MI->getOperand(0).getReg() == ARM64::WZR) {
+ O << "\ttst\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
+ printLogicalImm32(MI, 2, O);
+ return;
+ }
+ if (Opcode == ARM64::ANDSXri && MI->getOperand(0).getReg() == ARM64::XZR) {
+ O << "\ttst\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
+ printLogicalImm64(MI, 2, O);
+ return;
+ }
+ // ANDS WZR, Wn, Wm{, lshift #imm} ==> TST Wn{, lshift #imm}
+ // ANDS XZR, Xn, Xm{, lshift #imm} ==> TST Xn{, lshift #imm}
+ if ((Opcode == ARM64::ANDSWrs && MI->getOperand(0).getReg() == ARM64::WZR) ||
+ (Opcode == ARM64::ANDSXrs && MI->getOperand(0).getReg() == ARM64::XZR)) {
+ O << "\ttst\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
+ printShiftedRegister(MI, 2, O);
+ return;
+ }
+
+ // SUBS WZR, Wn, #imm ==> CMP Wn, #imm
+ // SUBS XZR, Xn, #imm ==> CMP Xn, #imm
+ if ((Opcode == ARM64::SUBSWri && MI->getOperand(0).getReg() == ARM64::WZR) ||
+ (Opcode == ARM64::SUBSXri && MI->getOperand(0).getReg() == ARM64::XZR)) {
+ O << "\tcmp\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
+ printAddSubImm(MI, 2, O);
+ return;
+ }
+ // SUBS WZR, Wn, Wm{, lshift #imm} ==> CMP Wn, Wm{, lshift #imm}
+ // SUBS XZR, Xn, Xm{, lshift #imm} ==> CMP Xn, Xm{, lshift #imm}
+ if ((Opcode == ARM64::SUBSWrs && MI->getOperand(0).getReg() == ARM64::WZR) ||
+ (Opcode == ARM64::SUBSXrs && MI->getOperand(0).getReg() == ARM64::XZR)) {
+ O << "\tcmp\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
+ printShiftedRegister(MI, 2, O);
+ return;
+ }
+  // SUBS XZR, Xn, Wm, uxtb #imm ==> CMP Xn, Wm, uxtb #imm
+  // SUBS WZR, Wn, Wm, uxtb #imm ==> CMP Wn, Wm, uxtb #imm
+ if ((Opcode == ARM64::SUBSXrx && MI->getOperand(0).getReg() == ARM64::XZR) ||
+ (Opcode == ARM64::SUBSWrx && MI->getOperand(0).getReg() == ARM64::WZR)) {
+ O << "\tcmp\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
+ printExtendedRegister(MI, 2, O);
+ return;
+ }
+  // SUBS XZR, Xn, Xm, uxtx #imm ==> CMP Xn, Xm, uxtx #imm
+ if (Opcode == ARM64::SUBSXrx64 && MI->getOperand(0).getReg() == ARM64::XZR) {
+ O << "\tcmp\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "
+ << getRegisterName(MI->getOperand(2).getReg());
+ printExtend(MI, 3, O);
+ return;
+ }
+
+ // ADDS WZR, Wn, #imm ==> CMN Wn, #imm
+ // ADDS XZR, Xn, #imm ==> CMN Xn, #imm
+ if ((Opcode == ARM64::ADDSWri && MI->getOperand(0).getReg() == ARM64::WZR) ||
+ (Opcode == ARM64::ADDSXri && MI->getOperand(0).getReg() == ARM64::XZR)) {
+ O << "\tcmn\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
+ printAddSubImm(MI, 2, O);
+ return;
+ }
+ // ADDS WZR, Wn, Wm{, lshift #imm} ==> CMN Wn, Wm{, lshift #imm}
+ // ADDS XZR, Xn, Xm{, lshift #imm} ==> CMN Xn, Xm{, lshift #imm}
+ if ((Opcode == ARM64::ADDSWrs && MI->getOperand(0).getReg() == ARM64::WZR) ||
+ (Opcode == ARM64::ADDSXrs && MI->getOperand(0).getReg() == ARM64::XZR)) {
+ O << "\tcmn\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
+ printShiftedRegister(MI, 2, O);
+ return;
+ }
+  // ADDS XZR, Xn, Wm, uxtb #imm ==> CMN Xn, Wm, uxtb #imm
+ if (Opcode == ARM64::ADDSXrx && MI->getOperand(0).getReg() == ARM64::XZR) {
+ O << "\tcmn\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
+ printExtendedRegister(MI, 2, O);
+ return;
+ }
+  // ADDS XZR, Xn, Xm, uxtx #imm ==> CMN Xn, Xm, uxtx #imm
+ if (Opcode == ARM64::ADDSXrx64 && MI->getOperand(0).getReg() == ARM64::XZR) {
+ O << "\tcmn\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "
+ << getRegisterName(MI->getOperand(2).getReg());
+ printExtend(MI, 3, O);
+ return;
+ }
+
+ if (!printAliasInstr(MI, O))
+ printInstruction(MI, O);
+
+ printAnnotation(O, Annot);
+}
+
+static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout,
+ bool &IsTbx) {
+ switch (Opcode) {
+ case ARM64::TBXv8i8One:
+ case ARM64::TBXv8i8Two:
+ case ARM64::TBXv8i8Three:
+ case ARM64::TBXv8i8Four:
+ IsTbx = true;
+ Layout = ".8b";
+ return true;
+ case ARM64::TBLv8i8One:
+ case ARM64::TBLv8i8Two:
+ case ARM64::TBLv8i8Three:
+ case ARM64::TBLv8i8Four:
+ IsTbx = false;
+ Layout = ".8b";
+ return true;
+ case ARM64::TBXv16i8One:
+ case ARM64::TBXv16i8Two:
+ case ARM64::TBXv16i8Three:
+ case ARM64::TBXv16i8Four:
+ IsTbx = true;
+ Layout = ".16b";
+ return true;
+ case ARM64::TBLv16i8One:
+ case ARM64::TBLv16i8Two:
+ case ARM64::TBLv16i8Three:
+ case ARM64::TBLv16i8Four:
+ IsTbx = false;
+ Layout = ".16b";
+ return true;
+ default:
+ return false;
+ }
+}
+
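+// Describes how a structured vector load/store is printed: the mnemonic,
+// the arrangement suffix, the operand index of the lane (0 when the form
+// has no lane operand), and the natural per-access offset used for the
+// immediate post-indexed form.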
+struct LdStNInstrDesc {
+ unsigned Opcode;
+ const char *Mnemonic;
+ const char *Layout;
+ int LaneOperand;
+ int NaturalOffset;
+};
+
+static LdStNInstrDesc LdStNInstInfo[] = {
+ { ARM64::LD1i8, "ld1", ".b", 2, 0 },
+ { ARM64::LD1i16, "ld1", ".h", 2, 0 },
+ { ARM64::LD1i32, "ld1", ".s", 2, 0 },
+ { ARM64::LD1i64, "ld1", ".d", 2, 0 },
+ { ARM64::LD1i8_POST, "ld1", ".b", 2, 1 },
+ { ARM64::LD1i16_POST, "ld1", ".h", 2, 2 },
+ { ARM64::LD1i32_POST, "ld1", ".s", 2, 4 },
+ { ARM64::LD1i64_POST, "ld1", ".d", 2, 8 },
+ { ARM64::LD1Rv16b, "ld1r", ".16b", 0, 0 },
+ { ARM64::LD1Rv8h, "ld1r", ".8h", 0, 0 },
+ { ARM64::LD1Rv4s, "ld1r", ".4s", 0, 0 },
+ { ARM64::LD1Rv2d, "ld1r", ".2d", 0, 0 },
+ { ARM64::LD1Rv8b, "ld1r", ".8b", 0, 0 },
+ { ARM64::LD1Rv4h, "ld1r", ".4h", 0, 0 },
+ { ARM64::LD1Rv2s, "ld1r", ".2s", 0, 0 },
+ { ARM64::LD1Rv1d, "ld1r", ".1d", 0, 0 },
+ { ARM64::LD1Rv16b_POST, "ld1r", ".16b", 0, 1 },
+ { ARM64::LD1Rv8h_POST, "ld1r", ".8h", 0, 2 },
+ { ARM64::LD1Rv4s_POST, "ld1r", ".4s", 0, 4 },
+ { ARM64::LD1Rv2d_POST, "ld1r", ".2d", 0, 8 },
+ { ARM64::LD1Rv8b_POST, "ld1r", ".8b", 0, 1 },
+ { ARM64::LD1Rv4h_POST, "ld1r", ".4h", 0, 2 },
+ { ARM64::LD1Rv2s_POST, "ld1r", ".2s", 0, 4 },
+ { ARM64::LD1Rv1d_POST, "ld1r", ".1d", 0, 8 },
+ { ARM64::LD1Onev16b, "ld1", ".16b", 0, 0 },
+ { ARM64::LD1Onev8h, "ld1", ".8h", 0, 0 },
+ { ARM64::LD1Onev4s, "ld1", ".4s", 0, 0 },
+ { ARM64::LD1Onev2d, "ld1", ".2d", 0, 0 },
+ { ARM64::LD1Onev8b, "ld1", ".8b", 0, 0 },
+ { ARM64::LD1Onev4h, "ld1", ".4h", 0, 0 },
+ { ARM64::LD1Onev2s, "ld1", ".2s", 0, 0 },
+ { ARM64::LD1Onev1d, "ld1", ".1d", 0, 0 },
+ { ARM64::LD1Onev16b_POST, "ld1", ".16b", 0, 16 },
+ { ARM64::LD1Onev8h_POST, "ld1", ".8h", 0, 16 },
+ { ARM64::LD1Onev4s_POST, "ld1", ".4s", 0, 16 },
+ { ARM64::LD1Onev2d_POST, "ld1", ".2d", 0, 16 },
+ { ARM64::LD1Onev8b_POST, "ld1", ".8b", 0, 8 },
+ { ARM64::LD1Onev4h_POST, "ld1", ".4h", 0, 8 },
+ { ARM64::LD1Onev2s_POST, "ld1", ".2s", 0, 8 },
+ { ARM64::LD1Onev1d_POST, "ld1", ".1d", 0, 8 },
+ { ARM64::LD1Twov16b, "ld1", ".16b", 0, 0 },
+ { ARM64::LD1Twov8h, "ld1", ".8h", 0, 0 },
+ { ARM64::LD1Twov4s, "ld1", ".4s", 0, 0 },
+ { ARM64::LD1Twov2d, "ld1", ".2d", 0, 0 },
+ { ARM64::LD1Twov8b, "ld1", ".8b", 0, 0 },
+ { ARM64::LD1Twov4h, "ld1", ".4h", 0, 0 },
+ { ARM64::LD1Twov2s, "ld1", ".2s", 0, 0 },
+ { ARM64::LD1Twov1d, "ld1", ".1d", 0, 0 },
+ { ARM64::LD1Twov16b_POST, "ld1", ".16b", 0, 32 },
+ { ARM64::LD1Twov8h_POST, "ld1", ".8h", 0, 32 },
+ { ARM64::LD1Twov4s_POST, "ld1", ".4s", 0, 32 },
+ { ARM64::LD1Twov2d_POST, "ld1", ".2d", 0, 32 },
+ { ARM64::LD1Twov8b_POST, "ld1", ".8b", 0, 16 },
+ { ARM64::LD1Twov4h_POST, "ld1", ".4h", 0, 16 },
+ { ARM64::LD1Twov2s_POST, "ld1", ".2s", 0, 16 },
+ { ARM64::LD1Twov1d_POST, "ld1", ".1d", 0, 16 },
+ { ARM64::LD1Threev16b, "ld1", ".16b", 0, 0 },
+ { ARM64::LD1Threev8h, "ld1", ".8h", 0, 0 },
+ { ARM64::LD1Threev4s, "ld1", ".4s", 0, 0 },
+ { ARM64::LD1Threev2d, "ld1", ".2d", 0, 0 },
+ { ARM64::LD1Threev8b, "ld1", ".8b", 0, 0 },
+ { ARM64::LD1Threev4h, "ld1", ".4h", 0, 0 },
+ { ARM64::LD1Threev2s, "ld1", ".2s", 0, 0 },
+ { ARM64::LD1Threev1d, "ld1", ".1d", 0, 0 },
+ { ARM64::LD1Threev16b_POST, "ld1", ".16b", 0, 48 },
+ { ARM64::LD1Threev8h_POST, "ld1", ".8h", 0, 48 },
+ { ARM64::LD1Threev4s_POST, "ld1", ".4s", 0, 48 },
+ { ARM64::LD1Threev2d_POST, "ld1", ".2d", 0, 48 },
+ { ARM64::LD1Threev8b_POST, "ld1", ".8b", 0, 24 },
+ { ARM64::LD1Threev4h_POST, "ld1", ".4h", 0, 24 },
+ { ARM64::LD1Threev2s_POST, "ld1", ".2s", 0, 24 },
+ { ARM64::LD1Threev1d_POST, "ld1", ".1d", 0, 24 },
+ { ARM64::LD1Fourv16b, "ld1", ".16b", 0, 0 },
+ { ARM64::LD1Fourv8h, "ld1", ".8h", 0, 0 },
+ { ARM64::LD1Fourv4s, "ld1", ".4s", 0, 0 },
+ { ARM64::LD1Fourv2d, "ld1", ".2d", 0, 0 },
+ { ARM64::LD1Fourv8b, "ld1", ".8b", 0, 0 },
+ { ARM64::LD1Fourv4h, "ld1", ".4h", 0, 0 },
+ { ARM64::LD1Fourv2s, "ld1", ".2s", 0, 0 },
+ { ARM64::LD1Fourv1d, "ld1", ".1d", 0, 0 },
+ { ARM64::LD1Fourv16b_POST, "ld1", ".16b", 0, 64 },
+ { ARM64::LD1Fourv8h_POST, "ld1", ".8h", 0, 64 },
+ { ARM64::LD1Fourv4s_POST, "ld1", ".4s", 0, 64 },
+ { ARM64::LD1Fourv2d_POST, "ld1", ".2d", 0, 64 },
+ { ARM64::LD1Fourv8b_POST, "ld1", ".8b", 0, 32 },
+ { ARM64::LD1Fourv4h_POST, "ld1", ".4h", 0, 32 },
+ { ARM64::LD1Fourv2s_POST, "ld1", ".2s", 0, 32 },
+ { ARM64::LD1Fourv1d_POST, "ld1", ".1d", 0, 32 },
+ { ARM64::LD2i8, "ld2", ".b", 2, 0 },
+ { ARM64::LD2i16, "ld2", ".h", 2, 0 },
+ { ARM64::LD2i32, "ld2", ".s", 2, 0 },
+ { ARM64::LD2i64, "ld2", ".d", 2, 0 },
+ { ARM64::LD2i8_POST, "ld2", ".b", 2, 2 },
+ { ARM64::LD2i16_POST, "ld2", ".h", 2, 4 },
+ { ARM64::LD2i32_POST, "ld2", ".s", 2, 8 },
+ { ARM64::LD2i64_POST, "ld2", ".d", 2, 16 },
+ { ARM64::LD2Rv16b, "ld2r", ".16b", 0, 0 },
+ { ARM64::LD2Rv8h, "ld2r", ".8h", 0, 0 },
+ { ARM64::LD2Rv4s, "ld2r", ".4s", 0, 0 },
+ { ARM64::LD2Rv2d, "ld2r", ".2d", 0, 0 },
+ { ARM64::LD2Rv8b, "ld2r", ".8b", 0, 0 },
+ { ARM64::LD2Rv4h, "ld2r", ".4h", 0, 0 },
+ { ARM64::LD2Rv2s, "ld2r", ".2s", 0, 0 },
+ { ARM64::LD2Rv1d, "ld2r", ".1d", 0, 0 },
+ { ARM64::LD2Rv16b_POST, "ld2r", ".16b", 0, 2 },
+ { ARM64::LD2Rv8h_POST, "ld2r", ".8h", 0, 4 },
+ { ARM64::LD2Rv4s_POST, "ld2r", ".4s", 0, 8 },
+ { ARM64::LD2Rv2d_POST, "ld2r", ".2d", 0, 16 },
+ { ARM64::LD2Rv8b_POST, "ld2r", ".8b", 0, 2 },
+ { ARM64::LD2Rv4h_POST, "ld2r", ".4h", 0, 4 },
+ { ARM64::LD2Rv2s_POST, "ld2r", ".2s", 0, 8 },
+ { ARM64::LD2Rv1d_POST, "ld2r", ".1d", 0, 16 },
+ { ARM64::LD2Twov16b, "ld2", ".16b", 0, 0 },
+ { ARM64::LD2Twov8h, "ld2", ".8h", 0, 0 },
+ { ARM64::LD2Twov4s, "ld2", ".4s", 0, 0 },
+ { ARM64::LD2Twov2d, "ld2", ".2d", 0, 0 },
+ { ARM64::LD2Twov8b, "ld2", ".8b", 0, 0 },
+ { ARM64::LD2Twov4h, "ld2", ".4h", 0, 0 },
+ { ARM64::LD2Twov2s, "ld2", ".2s", 0, 0 },
+ { ARM64::LD2Twov16b_POST, "ld2", ".16b", 0, 32 },
+ { ARM64::LD2Twov8h_POST, "ld2", ".8h", 0, 32 },
+ { ARM64::LD2Twov4s_POST, "ld2", ".4s", 0, 32 },
+ { ARM64::LD2Twov2d_POST, "ld2", ".2d", 0, 32 },
+ { ARM64::LD2Twov8b_POST, "ld2", ".8b", 0, 16 },
+ { ARM64::LD2Twov4h_POST, "ld2", ".4h", 0, 16 },
+ { ARM64::LD2Twov2s_POST, "ld2", ".2s", 0, 16 },
+ { ARM64::LD3i8, "ld3", ".b", 2, 0 },
+ { ARM64::LD3i16, "ld3", ".h", 2, 0 },
+ { ARM64::LD3i32, "ld3", ".s", 2, 0 },
+ { ARM64::LD3i64, "ld3", ".d", 2, 0 },
+ { ARM64::LD3i8_POST, "ld3", ".b", 2, 3 },
+ { ARM64::LD3i16_POST, "ld3", ".h", 2, 6 },
+ { ARM64::LD3i32_POST, "ld3", ".s", 2, 12 },
+ { ARM64::LD3i64_POST, "ld3", ".d", 2, 24 },
+ { ARM64::LD3Rv16b, "ld3r", ".16b", 0, 0 },
+ { ARM64::LD3Rv8h, "ld3r", ".8h", 0, 0 },
+ { ARM64::LD3Rv4s, "ld3r", ".4s", 0, 0 },
+ { ARM64::LD3Rv2d, "ld3r", ".2d", 0, 0 },
+ { ARM64::LD3Rv8b, "ld3r", ".8b", 0, 0 },
+ { ARM64::LD3Rv4h, "ld3r", ".4h", 0, 0 },
+ { ARM64::LD3Rv2s, "ld3r", ".2s", 0, 0 },
+ { ARM64::LD3Rv1d, "ld3r", ".1d", 0, 0 },
+ { ARM64::LD3Rv16b_POST, "ld3r", ".16b", 0, 3 },
+ { ARM64::LD3Rv8h_POST, "ld3r", ".8h", 0, 6 },
+ { ARM64::LD3Rv4s_POST, "ld3r", ".4s", 0, 12 },
+ { ARM64::LD3Rv2d_POST, "ld3r", ".2d", 0, 24 },
+ { ARM64::LD3Rv8b_POST, "ld3r", ".8b", 0, 3 },
+ { ARM64::LD3Rv4h_POST, "ld3r", ".4h", 0, 6 },
+ { ARM64::LD3Rv2s_POST, "ld3r", ".2s", 0, 12 },
+ { ARM64::LD3Rv1d_POST, "ld3r", ".1d", 0, 24 },
+ { ARM64::LD3Threev16b, "ld3", ".16b", 0, 0 },
+ { ARM64::LD3Threev8h, "ld3", ".8h", 0, 0 },
+ { ARM64::LD3Threev4s, "ld3", ".4s", 0, 0 },
+ { ARM64::LD3Threev2d, "ld3", ".2d", 0, 0 },
+ { ARM64::LD3Threev8b, "ld3", ".8b", 0, 0 },
+ { ARM64::LD3Threev4h, "ld3", ".4h", 0, 0 },
+ { ARM64::LD3Threev2s, "ld3", ".2s", 0, 0 },
+ { ARM64::LD3Threev16b_POST, "ld3", ".16b", 0, 48 },
+ { ARM64::LD3Threev8h_POST, "ld3", ".8h", 0, 48 },
+ { ARM64::LD3Threev4s_POST, "ld3", ".4s", 0, 48 },
+ { ARM64::LD3Threev2d_POST, "ld3", ".2d", 0, 48 },
+ { ARM64::LD3Threev8b_POST, "ld3", ".8b", 0, 24 },
+ { ARM64::LD3Threev4h_POST, "ld3", ".4h", 0, 24 },
+ { ARM64::LD3Threev2s_POST, "ld3", ".2s", 0, 24 },
+ { ARM64::LD4i8, "ld4", ".b", 2, 0 },
+ { ARM64::LD4i16, "ld4", ".h", 2, 0 },
+ { ARM64::LD4i32, "ld4", ".s", 2, 0 },
+ { ARM64::LD4i64, "ld4", ".d", 2, 0 },
+ { ARM64::LD4i8_POST, "ld4", ".b", 2, 4 },
+ { ARM64::LD4i16_POST, "ld4", ".h", 2, 8 },
+ { ARM64::LD4i32_POST, "ld4", ".s", 2, 16 },
+ { ARM64::LD4i64_POST, "ld4", ".d", 2, 32 },
+ { ARM64::LD4Rv16b, "ld4r", ".16b", 0, 0 },
+ { ARM64::LD4Rv8h, "ld4r", ".8h", 0, 0 },
+ { ARM64::LD4Rv4s, "ld4r", ".4s", 0, 0 },
+ { ARM64::LD4Rv2d, "ld4r", ".2d", 0, 0 },
+ { ARM64::LD4Rv8b, "ld4r", ".8b", 0, 0 },
+ { ARM64::LD4Rv4h, "ld4r", ".4h", 0, 0 },
+ { ARM64::LD4Rv2s, "ld4r", ".2s", 0, 0 },
+ { ARM64::LD4Rv1d, "ld4r", ".1d", 0, 0 },
+ { ARM64::LD4Rv16b_POST, "ld4r", ".16b", 0, 4 },
+ { ARM64::LD4Rv8h_POST, "ld4r", ".8h", 0, 8 },
+ { ARM64::LD4Rv4s_POST, "ld4r", ".4s", 0, 16 },
+ { ARM64::LD4Rv2d_POST, "ld4r", ".2d", 0, 32 },
+ { ARM64::LD4Rv8b_POST, "ld4r", ".8b", 0, 4 },
+ { ARM64::LD4Rv4h_POST, "ld4r", ".4h", 0, 8 },
+ { ARM64::LD4Rv2s_POST, "ld4r", ".2s", 0, 16 },
+ { ARM64::LD4Rv1d_POST, "ld4r", ".1d", 0, 32 },
+ { ARM64::LD4Fourv16b, "ld4", ".16b", 0, 0 },
+ { ARM64::LD4Fourv8h, "ld4", ".8h", 0, 0 },
+ { ARM64::LD4Fourv4s, "ld4", ".4s", 0, 0 },
+ { ARM64::LD4Fourv2d, "ld4", ".2d", 0, 0 },
+ { ARM64::LD4Fourv8b, "ld4", ".8b", 0, 0 },
+ { ARM64::LD4Fourv4h, "ld4", ".4h", 0, 0 },
+ { ARM64::LD4Fourv2s, "ld4", ".2s", 0, 0 },
+ { ARM64::LD4Fourv16b_POST, "ld4", ".16b", 0, 64 },
+ { ARM64::LD4Fourv8h_POST, "ld4", ".8h", 0, 64 },
+ { ARM64::LD4Fourv4s_POST, "ld4", ".4s", 0, 64 },
+ { ARM64::LD4Fourv2d_POST, "ld4", ".2d", 0, 64 },
+ { ARM64::LD4Fourv8b_POST, "ld4", ".8b", 0, 32 },
+ { ARM64::LD4Fourv4h_POST, "ld4", ".4h", 0, 32 },
+ { ARM64::LD4Fourv2s_POST, "ld4", ".2s", 0, 32 },
+ { ARM64::ST1i8, "st1", ".b", 1, 0 },
+ { ARM64::ST1i16, "st1", ".h", 1, 0 },
+ { ARM64::ST1i32, "st1", ".s", 1, 0 },
+ { ARM64::ST1i64, "st1", ".d", 1, 0 },
+ { ARM64::ST1i8_POST, "st1", ".b", 1, 1 },
+ { ARM64::ST1i16_POST, "st1", ".h", 1, 2 },
+ { ARM64::ST1i32_POST, "st1", ".s", 1, 4 },
+ { ARM64::ST1i64_POST, "st1", ".d", 1, 8 },
+ { ARM64::ST1Onev16b, "st1", ".16b", 0, 0 },
+ { ARM64::ST1Onev8h, "st1", ".8h", 0, 0 },
+ { ARM64::ST1Onev4s, "st1", ".4s", 0, 0 },
+ { ARM64::ST1Onev2d, "st1", ".2d", 0, 0 },
+ { ARM64::ST1Onev8b, "st1", ".8b", 0, 0 },
+ { ARM64::ST1Onev4h, "st1", ".4h", 0, 0 },
+ { ARM64::ST1Onev2s, "st1", ".2s", 0, 0 },
+ { ARM64::ST1Onev1d, "st1", ".1d", 0, 0 },
+ { ARM64::ST1Onev16b_POST, "st1", ".16b", 0, 16 },
+ { ARM64::ST1Onev8h_POST, "st1", ".8h", 0, 16 },
+ { ARM64::ST1Onev4s_POST, "st1", ".4s", 0, 16 },
+ { ARM64::ST1Onev2d_POST, "st1", ".2d", 0, 16 },
+ { ARM64::ST1Onev8b_POST, "st1", ".8b", 0, 8 },
+ { ARM64::ST1Onev4h_POST, "st1", ".4h", 0, 8 },
+ { ARM64::ST1Onev2s_POST, "st1", ".2s", 0, 8 },
+ { ARM64::ST1Onev1d_POST, "st1", ".1d", 0, 8 },
+ { ARM64::ST1Twov16b, "st1", ".16b", 0, 0 },
+ { ARM64::ST1Twov8h, "st1", ".8h", 0, 0 },
+ { ARM64::ST1Twov4s, "st1", ".4s", 0, 0 },
+ { ARM64::ST1Twov2d, "st1", ".2d", 0, 0 },
+ { ARM64::ST1Twov8b, "st1", ".8b", 0, 0 },
+ { ARM64::ST1Twov4h, "st1", ".4h", 0, 0 },
+ { ARM64::ST1Twov2s, "st1", ".2s", 0, 0 },
+ { ARM64::ST1Twov1d, "st1", ".1d", 0, 0 },
+ { ARM64::ST1Twov16b_POST, "st1", ".16b", 0, 32 },
+ { ARM64::ST1Twov8h_POST, "st1", ".8h", 0, 32 },
+ { ARM64::ST1Twov4s_POST, "st1", ".4s", 0, 32 },
+ { ARM64::ST1Twov2d_POST, "st1", ".2d", 0, 32 },
+ { ARM64::ST1Twov8b_POST, "st1", ".8b", 0, 16 },
+ { ARM64::ST1Twov4h_POST, "st1", ".4h", 0, 16 },
+ { ARM64::ST1Twov2s_POST, "st1", ".2s", 0, 16 },
+ { ARM64::ST1Twov1d_POST, "st1", ".1d", 0, 16 },
+ { ARM64::ST1Threev16b, "st1", ".16b", 0, 0 },
+ { ARM64::ST1Threev8h, "st1", ".8h", 0, 0 },
+ { ARM64::ST1Threev4s, "st1", ".4s", 0, 0 },
+ { ARM64::ST1Threev2d, "st1", ".2d", 0, 0 },
+ { ARM64::ST1Threev8b, "st1", ".8b", 0, 0 },
+ { ARM64::ST1Threev4h, "st1", ".4h", 0, 0 },
+ { ARM64::ST1Threev2s, "st1", ".2s", 0, 0 },
+ { ARM64::ST1Threev1d, "st1", ".1d", 0, 0 },
+ { ARM64::ST1Threev16b_POST, "st1", ".16b", 0, 48 },
+ { ARM64::ST1Threev8h_POST, "st1", ".8h", 0, 48 },
+ { ARM64::ST1Threev4s_POST, "st1", ".4s", 0, 48 },
+ { ARM64::ST1Threev2d_POST, "st1", ".2d", 0, 48 },
+ { ARM64::ST1Threev8b_POST, "st1", ".8b", 0, 24 },
+ { ARM64::ST1Threev4h_POST, "st1", ".4h", 0, 24 },
+ { ARM64::ST1Threev2s_POST, "st1", ".2s", 0, 24 },
+ { ARM64::ST1Threev1d_POST, "st1", ".1d", 0, 24 },
+ { ARM64::ST1Fourv16b, "st1", ".16b", 0, 0 },
+ { ARM64::ST1Fourv8h, "st1", ".8h", 0, 0 },
+ { ARM64::ST1Fourv4s, "st1", ".4s", 0, 0 },
+ { ARM64::ST1Fourv2d, "st1", ".2d", 0, 0 },
+ { ARM64::ST1Fourv8b, "st1", ".8b", 0, 0 },
+ { ARM64::ST1Fourv4h, "st1", ".4h", 0, 0 },
+ { ARM64::ST1Fourv2s, "st1", ".2s", 0, 0 },
+ { ARM64::ST1Fourv1d, "st1", ".1d", 0, 0 },
+ { ARM64::ST1Fourv16b_POST, "st1", ".16b", 0, 64 },
+ { ARM64::ST1Fourv8h_POST, "st1", ".8h", 0, 64 },
+ { ARM64::ST1Fourv4s_POST, "st1", ".4s", 0, 64 },
+ { ARM64::ST1Fourv2d_POST, "st1", ".2d", 0, 64 },
+ { ARM64::ST1Fourv8b_POST, "st1", ".8b", 0, 32 },
+ { ARM64::ST1Fourv4h_POST, "st1", ".4h", 0, 32 },
+ { ARM64::ST1Fourv2s_POST, "st1", ".2s", 0, 32 },
+ { ARM64::ST1Fourv1d_POST, "st1", ".1d", 0, 32 },
+ { ARM64::ST2i8, "st2", ".b", 1, 0 },
+ { ARM64::ST2i16, "st2", ".h", 1, 0 },
+ { ARM64::ST2i32, "st2", ".s", 1, 0 },
+ { ARM64::ST2i64, "st2", ".d", 1, 0 },
+ { ARM64::ST2i8_POST, "st2", ".b", 1, 2 },
+ { ARM64::ST2i16_POST, "st2", ".h", 1, 4 },
+ { ARM64::ST2i32_POST, "st2", ".s", 1, 8 },
+ { ARM64::ST2i64_POST, "st2", ".d", 1, 16 },
+ { ARM64::ST2Twov16b, "st2", ".16b", 0, 0 },
+ { ARM64::ST2Twov8h, "st2", ".8h", 0, 0 },
+ { ARM64::ST2Twov4s, "st2", ".4s", 0, 0 },
+ { ARM64::ST2Twov2d, "st2", ".2d", 0, 0 },
+ { ARM64::ST2Twov8b, "st2", ".8b", 0, 0 },
+ { ARM64::ST2Twov4h, "st2", ".4h", 0, 0 },
+ { ARM64::ST2Twov2s, "st2", ".2s", 0, 0 },
+ { ARM64::ST2Twov16b_POST, "st2", ".16b", 0, 32 },
+ { ARM64::ST2Twov8h_POST, "st2", ".8h", 0, 32 },
+ { ARM64::ST2Twov4s_POST, "st2", ".4s", 0, 32 },
+ { ARM64::ST2Twov2d_POST, "st2", ".2d", 0, 32 },
+ { ARM64::ST2Twov8b_POST, "st2", ".8b", 0, 16 },
+ { ARM64::ST2Twov4h_POST, "st2", ".4h", 0, 16 },
+ { ARM64::ST2Twov2s_POST, "st2", ".2s", 0, 16 },
+ { ARM64::ST3i8, "st3", ".b", 1, 0 },
+ { ARM64::ST3i16, "st3", ".h", 1, 0 },
+ { ARM64::ST3i32, "st3", ".s", 1, 0 },
+ { ARM64::ST3i64, "st3", ".d", 1, 0 },
+ { ARM64::ST3i8_POST, "st3", ".b", 1, 3 },
+ { ARM64::ST3i16_POST, "st3", ".h", 1, 6 },
+ { ARM64::ST3i32_POST, "st3", ".s", 1, 12 },
+ { ARM64::ST3i64_POST, "st3", ".d", 1, 24 },
+ { ARM64::ST3Threev16b, "st3", ".16b", 0, 0 },
+ { ARM64::ST3Threev8h, "st3", ".8h", 0, 0 },
+ { ARM64::ST3Threev4s, "st3", ".4s", 0, 0 },
+ { ARM64::ST3Threev2d, "st3", ".2d", 0, 0 },
+ { ARM64::ST3Threev8b, "st3", ".8b", 0, 0 },
+ { ARM64::ST3Threev4h, "st3", ".4h", 0, 0 },
+ { ARM64::ST3Threev2s, "st3", ".2s", 0, 0 },
+ { ARM64::ST3Threev16b_POST, "st3", ".16b", 0, 48 },
+ { ARM64::ST3Threev8h_POST, "st3", ".8h", 0, 48 },
+ { ARM64::ST3Threev4s_POST, "st3", ".4s", 0, 48 },
+ { ARM64::ST3Threev2d_POST, "st3", ".2d", 0, 48 },
+ { ARM64::ST3Threev8b_POST, "st3", ".8b", 0, 24 },
+ { ARM64::ST3Threev4h_POST, "st3", ".4h", 0, 24 },
+ { ARM64::ST3Threev2s_POST, "st3", ".2s", 0, 24 },
+ { ARM64::ST4i8, "st4", ".b", 1, 0 },
+ { ARM64::ST4i16, "st4", ".h", 1, 0 },
+ { ARM64::ST4i32, "st4", ".s", 1, 0 },
+ { ARM64::ST4i64, "st4", ".d", 1, 0 },
+ { ARM64::ST4i8_POST, "st4", ".b", 1, 4 },
+ { ARM64::ST4i16_POST, "st4", ".h", 1, 8 },
+ { ARM64::ST4i32_POST, "st4", ".s", 1, 16 },
+ { ARM64::ST4i64_POST, "st4", ".d", 1, 32 },
+ { ARM64::ST4Fourv16b, "st4", ".16b", 0, 0 },
+ { ARM64::ST4Fourv8h, "st4", ".8h", 0, 0 },
+ { ARM64::ST4Fourv4s, "st4", ".4s", 0, 0 },
+ { ARM64::ST4Fourv2d, "st4", ".2d", 0, 0 },
+ { ARM64::ST4Fourv8b, "st4", ".8b", 0, 0 },
+ { ARM64::ST4Fourv4h, "st4", ".4h", 0, 0 },
+ { ARM64::ST4Fourv2s, "st4", ".2s", 0, 0 },
+ { ARM64::ST4Fourv16b_POST, "st4", ".16b", 0, 64 },
+ { ARM64::ST4Fourv8h_POST, "st4", ".8h", 0, 64 },
+ { ARM64::ST4Fourv4s_POST, "st4", ".4s", 0, 64 },
+ { ARM64::ST4Fourv2d_POST, "st4", ".2d", 0, 64 },
+ { ARM64::ST4Fourv8b_POST, "st4", ".8b", 0, 32 },
+ { ARM64::ST4Fourv4h_POST, "st4", ".4h", 0, 32 },
+ { ARM64::ST4Fourv2s_POST, "st4", ".2s", 0, 32 },
+};
+
+static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) {
+ unsigned Idx;
+ for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx)
+ if (LdStNInstInfo[Idx].Opcode == Opcode)
+ return &LdStNInstInfo[Idx];
+
+ return 0;
+}
+
+void ARM64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot) {
+ unsigned Opcode = MI->getOpcode();
+ StringRef Layout, Mnemonic;
+
+ bool IsTbx;
+ if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) {
+ O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t'
+ << getRegisterName(MI->getOperand(0).getReg(), ARM64::vreg) << ", ";
+
+ unsigned ListOpNum = IsTbx ? 2 : 1;
+ printVectorList(MI, ListOpNum, O, "");
+
+ O << ", "
+ << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), ARM64::vreg);
+ printAnnotation(O, Annot);
+ return;
+ }
+
+ if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) {
+ O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t';
+
+ // Now onto the operands: first a vector list with possible lane
+ // specifier. E.g. { v0 }[2]
+ printVectorList(MI, 0, O, "");
+
+ if (LdStDesc->LaneOperand != 0)
+ O << '[' << MI->getOperand(LdStDesc->LaneOperand).getImm() << ']';
+
+ // Next the address: [xN]
+ unsigned AddrOpNum = LdStDesc->LaneOperand + 1;
+ unsigned AddrReg = MI->getOperand(AddrOpNum).getReg();
+ O << ", [" << getRegisterName(AddrReg) << ']';
+
+ // Finally, there might be a post-indexed offset.
+ if (LdStDesc->NaturalOffset != 0) {
+ unsigned Reg = MI->getOperand(AddrOpNum + 1).getReg();
+ if (Reg != ARM64::XZR)
+ O << ", " << getRegisterName(Reg);
+ else {
+ assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?");
+ O << ", #" << LdStDesc->NaturalOffset;
+ }
+ }
+
+ printAnnotation(O, Annot);
+ return;
+ }
+
+ ARM64InstPrinter::printInst(MI, O, Annot);
+}
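+
+// Illustrative sketch of the table-driven path above: for an opcode such as
+// ARM64::LD2Twov16b_POST the descriptor supplies mnemonic "ld2", layout
+// ".16b", no lane operand, and a natural offset of 32, so with XZR as the
+// offset register the output is roughly "ld2.16b\t{ v0, v1 }, [x0], #32";
+// a real offset register prints as ", x2" in place of the immediate.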
+
+bool ARM64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) {
+#ifndef NDEBUG
+ unsigned Opcode = MI->getOpcode();
+ assert((Opcode == ARM64::SYS || Opcode == ARM64::SYSxt) &&
+ "Invalid opcode for SYS alias!");
+#endif
+
+ const char *Asm = 0;
+ const MCOperand &Op1 = MI->getOperand(0);
+ const MCOperand &Cn = MI->getOperand(1);
+ const MCOperand &Cm = MI->getOperand(2);
+ const MCOperand &Op2 = MI->getOperand(3);
+
+ unsigned Op1Val = Op1.getImm();
+ unsigned CnVal = Cn.getImm();
+ unsigned CmVal = Cm.getImm();
+ unsigned Op2Val = Op2.getImm();
+
+ if (CnVal == 7) {
+ switch (CmVal) {
+ default:
+ break;
+
+ // IC aliases
+ case 1:
+ if (Op1Val == 0 && Op2Val == 0)
+ Asm = "ic\tialluis";
+ break;
+ case 5:
+ if (Op1Val == 0 && Op2Val == 0)
+ Asm = "ic\tiallu";
+ else if (Op1Val == 3 && Op2Val == 1)
+ Asm = "ic\tivau";
+ break;
+
+ // DC aliases
+ case 4:
+ if (Op1Val == 3 && Op2Val == 1)
+ Asm = "dc\tzva";
+ break;
+ case 6:
+ if (Op1Val == 0 && Op2Val == 1)
+ Asm = "dc\tivac";
+ if (Op1Val == 0 && Op2Val == 2)
+ Asm = "dc\tisw";
+ break;
+ case 10:
+ if (Op1Val == 3 && Op2Val == 1)
+ Asm = "dc\tcvac";
+ else if (Op1Val == 0 && Op2Val == 2)
+ Asm = "dc\tcsw";
+ break;
+ case 11:
+ if (Op1Val == 3 && Op2Val == 1)
+ Asm = "dc\tcvau";
+ break;
+ case 14:
+ if (Op1Val == 3 && Op2Val == 1)
+ Asm = "dc\tcivac";
+ else if (Op1Val == 0 && Op2Val == 2)
+ Asm = "dc\tcisw";
+ break;
+
+ // AT aliases
+ case 8:
+ switch (Op1Val) {
+ default:
+ break;
+ case 0:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "at\ts1e1r"; break;
+ case 1: Asm = "at\ts1e1w"; break;
+ case 2: Asm = "at\ts1e0r"; break;
+ case 3: Asm = "at\ts1e0w"; break;
+ }
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "at\ts1e2r"; break;
+ case 1: Asm = "at\ts1e2w"; break;
+ case 4: Asm = "at\ts12e1r"; break;
+ case 5: Asm = "at\ts12e1w"; break;
+ case 6: Asm = "at\ts12e0r"; break;
+ case 7: Asm = "at\ts12e0w"; break;
+ }
+ break;
+ case 6:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "at\ts1e3r"; break;
+ case 1: Asm = "at\ts1e3w"; break;
+ }
+ break;
+ }
+ break;
+ }
+ } else if (CnVal == 8) {
+ // TLBI aliases
+ switch (CmVal) {
+ default:
+ break;
+ case 3:
+ switch (Op1Val) {
+ default:
+ break;
+ case 0:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\tvmalle1is"; break;
+ case 1: Asm = "tlbi\tvae1is"; break;
+ case 2: Asm = "tlbi\taside1is"; break;
+ case 3: Asm = "tlbi\tvaae1is"; break;
+ case 5: Asm = "tlbi\tvale1is"; break;
+ case 7: Asm = "tlbi\tvaale1is"; break;
+ }
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\talle2is"; break;
+ case 1: Asm = "tlbi\tvae2is"; break;
+ case 4: Asm = "tlbi\talle1is"; break;
+ case 5: Asm = "tlbi\tvale2is"; break;
+ case 6: Asm = "tlbi\tvmalls12e1is"; break;
+ }
+ break;
+ case 6:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\talle3is"; break;
+ case 1: Asm = "tlbi\tvae3is"; break;
+ case 5: Asm = "tlbi\tvale3is"; break;
+ }
+ break;
+ }
+ break;
+ case 4:
+ switch (Op1Val) {
+ default:
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 1: Asm = "tlbi\tipas2e1"; break;
+ case 5: Asm = "tlbi\tipas2le1"; break;
+ }
+ break;
+ }
+ break;
+ case 7:
+ switch (Op1Val) {
+ default:
+ break;
+ case 0:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\tvmalle1"; break;
+ case 1: Asm = "tlbi\tvae1"; break;
+ case 2: Asm = "tlbi\taside1"; break;
+ case 3: Asm = "tlbi\tvaae1"; break;
+ case 5: Asm = "tlbi\tvale1"; break;
+ case 7: Asm = "tlbi\tvaale1"; break;
+ }
+ break;
+ case 4:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\talle2"; break;
+ case 1: Asm = "tlbi\tvae2"; break;
+ case 4: Asm = "tlbi\talle1"; break;
+ case 5: Asm = "tlbi\tvale2"; break;
+ case 6: Asm = "tlbi\tvmalls12e1"; break;
+ }
+ break;
+ case 6:
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "tlbi\talle3"; break;
+ case 1: Asm = "tlbi\tvae3"; break;
+ case 5: Asm = "tlbi\tvale3"; break;
+ }
+ break;
+ }
+ break;
+ }
+ }
+
+ if (Asm) {
+ O << '\t' << Asm;
+ if (MI->getNumOperands() == 5)
+ O << ", " << getRegisterName(MI->getOperand(4).getReg());
+ }
+
+ return Asm != 0;
+}
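+
+// Illustrative examples of the alias table above: "sys #0, c7, c5, #0"
+// (Op1=0, Cn=7, Cm=5, Op2=0) prints as "ic\tiallu", and "sys #3, c7, c4, #1"
+// prints as "dc\tzva"; combinations without an entry fall back to the
+// generic SYS form because Asm stays null.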
+
+void ARM64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ unsigned Reg = Op.getReg();
+ O << getRegisterName(Reg);
+ } else if (Op.isImm()) {
+ O << '#' << Op.getImm();
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << *Op.getExpr();
+ }
+}
+
+void ARM64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo,
+ unsigned Imm, raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ unsigned Reg = Op.getReg();
+ if (Reg == ARM64::XZR)
+ O << "#" << Imm;
+ else
+ O << getRegisterName(Reg);
+ } else
+    assert(0 && "unknown operand kind in printPostIncOperand");
+}
+
+void ARM64InstPrinter::printPostIncOperand1(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, 1, O);
+}
+
+void ARM64InstPrinter::printPostIncOperand2(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, 2, O);
+}
+
+void ARM64InstPrinter::printPostIncOperand3(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, 3, O);
+}
+
+void ARM64InstPrinter::printPostIncOperand4(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, 4, O);
+}
+
+void ARM64InstPrinter::printPostIncOperand6(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, 6, O);
+}
+
+void ARM64InstPrinter::printPostIncOperand8(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, 8, O);
+}
+
+void ARM64InstPrinter::printPostIncOperand12(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, 12, O);
+}
+
+void ARM64InstPrinter::printPostIncOperand16(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, 16, O);
+}
+
+void ARM64InstPrinter::printPostIncOperand24(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, 24, O);
+}
+
+void ARM64InstPrinter::printPostIncOperand32(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, 32, O);
+}
+
+void ARM64InstPrinter::printPostIncOperand48(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, 48, O);
+}
+
+void ARM64InstPrinter::printPostIncOperand64(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printPostIncOperand(MI, OpNo, 64, O);
+}
+
+void ARM64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ assert(Op.isReg() && "Non-register vreg operand!");
+ unsigned Reg = Op.getReg();
+ O << getRegisterName(Reg, ARM64::vreg);
+}
+
+void ARM64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ assert(Op.isImm() && "System instruction C[nm] operands must be immediates!");
+ O << "c" << Op.getImm();
+}
+
+void ARM64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ if (MO.isImm()) {
+ unsigned Val = (MO.getImm() & 0xfff);
+ assert(Val == MO.getImm() && "Add/sub immediate out of range!");
+ unsigned Shift =
+ ARM64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm());
+ O << '#' << (Val << Shift);
+ // Distinguish "0, lsl #12" from "0, lsl #0".
+ if (Val == 0 && Shift != 0)
+ printShifter(MI, OpNum + 1, O);
+ } else {
+ assert(MO.isExpr() && "Unexpected operand type!");
+ O << *MO.getExpr();
+ printShifter(MI, OpNum + 1, O);
+ }
+}
+
+void ARM64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ uint64_t Val = MI->getOperand(OpNum).getImm();
+ O << "#0x";
+ O.write_hex(ARM64_AM::decodeLogicalImmediate(Val, 32));
+}
+
+void ARM64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ uint64_t Val = MI->getOperand(OpNum).getImm();
+ O << "#0x";
+ O.write_hex(ARM64_AM::decodeLogicalImmediate(Val, 64));
+}
+
+void ARM64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNum).getImm();
+ // LSL #0 should not be printed.
+ if (ARM64_AM::getShiftType(Val) == ARM64_AM::LSL &&
+ ARM64_AM::getShiftValue(Val) == 0)
+ return;
+ O << ", " << ARM64_AM::getShiftName(ARM64_AM::getShiftType(Val)) << " #"
+ << ARM64_AM::getShiftValue(Val);
+}
+
+void ARM64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << getRegisterName(MI->getOperand(OpNum).getReg());
+ printShifter(MI, OpNum + 1, O);
+}
+
+void ARM64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << getRegisterName(MI->getOperand(OpNum).getReg());
+ printExtend(MI, OpNum + 1, O);
+}
+
+void ARM64InstPrinter::printExtend(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNum).getImm();
+ ARM64_AM::ExtendType ExtType = ARM64_AM::getArithExtendType(Val);
+ unsigned ShiftVal = ARM64_AM::getArithShiftValue(Val);
+
+ // If the destination or first source register operand is [W]SP, print
+ // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at
+ // all.
+ if (ExtType == ARM64_AM::UXTW || ExtType == ARM64_AM::UXTX) {
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src1 = MI->getOperand(1).getReg();
+ if (Dest == ARM64::SP || Dest == ARM64::WSP || Src1 == ARM64::SP ||
+ Src1 == ARM64::WSP) {
+ if (ShiftVal != 0)
+ O << ", lsl #" << ShiftVal;
+ return;
+ }
+ }
+ O << ", " << ARM64_AM::getExtendName(ExtType);
+ if (ShiftVal != 0)
+ O << " #" << ShiftVal;
+}
+
+void ARM64InstPrinter::printDotCondCode(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(OpNum).getImm();
+ if (CC != ARM64CC::AL)
+ O << '.' << ARM64CC::getCondCodeName(CC);
+}
+
+void ARM64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(OpNum).getImm();
+ O << ARM64CC::getCondCodeName(CC);
+}
+
+void ARM64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']';
+}
+
+void ARM64InstPrinter::printImmScale4(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << '#' << 4 * MI->getOperand(OpNum).getImm();
+}
+
+void ARM64InstPrinter::printImmScale8(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << '#' << 8 * MI->getOperand(OpNum).getImm();
+}
+
+void ARM64InstPrinter::printImmScale16(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << '#' << 16 * MI->getOperand(OpNum).getImm();
+}
+
+void ARM64InstPrinter::printAMIndexed(const MCInst *MI, unsigned OpNum,
+ unsigned Scale, raw_ostream &O) {
+ const MCOperand MO1 = MI->getOperand(OpNum + 1);
+ O << '[' << getRegisterName(MI->getOperand(OpNum).getReg());
+ if (MO1.isImm()) {
+ if (MO1.getImm() != 0)
+ O << ", #" << (MO1.getImm() * Scale);
+ } else {
+ assert(MO1.isExpr() && "Unexpected operand type!");
+ O << ", " << *MO1.getExpr();
+ }
+ O << ']';
+}
+
+void ARM64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ unsigned prfop = MI->getOperand(OpNum).getImm();
+ if (ARM64_AM::isNamedPrefetchOp(prfop))
+ O << ARM64_AM::getPrefetchOpName((ARM64_AM::PrefetchOp)prfop);
+ else
+ O << '#' << prfop;
+}
+
+void ARM64InstPrinter::printMemoryPostIndexed32(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #"
+ << 4 * MI->getOperand(OpNum + 1).getImm();
+}
+
+void ARM64InstPrinter::printMemoryPostIndexed64(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #"
+ << 8 * MI->getOperand(OpNum + 1).getImm();
+}
+
+void ARM64InstPrinter::printMemoryPostIndexed128(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #"
+ << 16 * MI->getOperand(OpNum + 1).getImm();
+}
+
+void ARM64InstPrinter::printMemoryPostIndexed(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #"
+ << MI->getOperand(OpNum + 1).getImm();
+}
+
+void ARM64InstPrinter::printMemoryRegOffset(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O, int LegalShiftAmt) {
+ O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ", "
+ << getRegisterName(MI->getOperand(OpNum + 1).getReg());
+
+ unsigned Val = MI->getOperand(OpNum + 2).getImm();
+ ARM64_AM::ExtendType ExtType = ARM64_AM::getMemExtendType(Val);
+ bool DoShift = ARM64_AM::getMemDoShift(Val);
+
+ if (ExtType == ARM64_AM::UXTX) {
+ if (DoShift)
+ O << ", lsl";
+ } else
+ O << ", " << ARM64_AM::getExtendName(ExtType);
+
+ if (DoShift)
+ O << " #" << LegalShiftAmt;
+
+ O << "]";
+}
+
+void ARM64InstPrinter::printMemoryRegOffset8(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ printMemoryRegOffset(MI, OpNum, O, 0);
+}
+
+void ARM64InstPrinter::printMemoryRegOffset16(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ printMemoryRegOffset(MI, OpNum, O, 1);
+}
+
+void ARM64InstPrinter::printMemoryRegOffset32(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ printMemoryRegOffset(MI, OpNum, O, 2);
+}
+
+void ARM64InstPrinter::printMemoryRegOffset64(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ printMemoryRegOffset(MI, OpNum, O, 3);
+}
+
+void ARM64InstPrinter::printMemoryRegOffset128(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ printMemoryRegOffset(MI, OpNum, O, 4);
+}
+
+void ARM64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ O << '#';
+ if (MO.isFPImm())
+ // FIXME: Should this ever happen?
+ O << MO.getFPImm();
+ else
+ O << ARM64_AM::getFPImmFloat(MO.getImm());
+}
+
+static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) {
+ while (Stride--) {
+ switch (Reg) {
+ default:
+ assert(0 && "Vector register expected!");
+ case ARM64::Q0: Reg = ARM64::Q1; break;
+ case ARM64::Q1: Reg = ARM64::Q2; break;
+ case ARM64::Q2: Reg = ARM64::Q3; break;
+ case ARM64::Q3: Reg = ARM64::Q4; break;
+ case ARM64::Q4: Reg = ARM64::Q5; break;
+ case ARM64::Q5: Reg = ARM64::Q6; break;
+ case ARM64::Q6: Reg = ARM64::Q7; break;
+ case ARM64::Q7: Reg = ARM64::Q8; break;
+ case ARM64::Q8: Reg = ARM64::Q9; break;
+ case ARM64::Q9: Reg = ARM64::Q10; break;
+ case ARM64::Q10: Reg = ARM64::Q11; break;
+ case ARM64::Q11: Reg = ARM64::Q12; break;
+ case ARM64::Q12: Reg = ARM64::Q13; break;
+ case ARM64::Q13: Reg = ARM64::Q14; break;
+ case ARM64::Q14: Reg = ARM64::Q15; break;
+ case ARM64::Q15: Reg = ARM64::Q16; break;
+ case ARM64::Q16: Reg = ARM64::Q17; break;
+ case ARM64::Q17: Reg = ARM64::Q18; break;
+ case ARM64::Q18: Reg = ARM64::Q19; break;
+ case ARM64::Q19: Reg = ARM64::Q20; break;
+ case ARM64::Q20: Reg = ARM64::Q21; break;
+ case ARM64::Q21: Reg = ARM64::Q22; break;
+ case ARM64::Q22: Reg = ARM64::Q23; break;
+ case ARM64::Q23: Reg = ARM64::Q24; break;
+ case ARM64::Q24: Reg = ARM64::Q25; break;
+ case ARM64::Q25: Reg = ARM64::Q26; break;
+ case ARM64::Q26: Reg = ARM64::Q27; break;
+ case ARM64::Q27: Reg = ARM64::Q28; break;
+ case ARM64::Q28: Reg = ARM64::Q29; break;
+ case ARM64::Q29: Reg = ARM64::Q30; break;
+ case ARM64::Q30: Reg = ARM64::Q31; break;
+ // Vector lists can wrap around.
+ case ARM64::Q31:
+ Reg = ARM64::Q0;
+ break;
+ }
+ }
+ return Reg;
+}
+
+void ARM64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O, StringRef LayoutSuffix) {
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+
+ O << "{ ";
+
+ // Work out how many registers there are in the list (if there is an actual
+ // list).
+ unsigned NumRegs = 1;
+ if (MRI.getRegClass(ARM64::DDRegClassID).contains(Reg) ||
+ MRI.getRegClass(ARM64::QQRegClassID).contains(Reg))
+ NumRegs = 2;
+ else if (MRI.getRegClass(ARM64::DDDRegClassID).contains(Reg) ||
+ MRI.getRegClass(ARM64::QQQRegClassID).contains(Reg))
+ NumRegs = 3;
+ else if (MRI.getRegClass(ARM64::DDDDRegClassID).contains(Reg) ||
+ MRI.getRegClass(ARM64::QQQQRegClassID).contains(Reg))
+ NumRegs = 4;
+
+ // Now forget about the list and find out what the first register is.
+ if (unsigned FirstReg = MRI.getSubReg(Reg, ARM64::dsub0))
+ Reg = FirstReg;
+ else if (unsigned FirstReg = MRI.getSubReg(Reg, ARM64::qsub0))
+ Reg = FirstReg;
+
+ // If it's a D-reg, we need to promote it to the equivalent Q-reg before
+ // printing (otherwise getRegisterName fails).
+ if (MRI.getRegClass(ARM64::FPR64RegClassID).contains(Reg)) {
+ const MCRegisterClass &FPR128RC = MRI.getRegClass(ARM64::FPR128RegClassID);
+ Reg = MRI.getMatchingSuperReg(Reg, ARM64::dsub, &FPR128RC);
+ }
+
+ for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) {
+ O << getRegisterName(Reg, ARM64::vreg) << LayoutSuffix;
+ if (i + 1 != NumRegs)
+ O << ", ";
+ }
+
+ O << " }";
+}
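+
+// Illustrative example: a two-register Q list that starts at v31 wraps
+// around, so with a ".4s" layout suffix it prints as "{ v31.4s, v0.4s }".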
+
+void ARM64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ printVectorList(MI, OpNum, O, "");
+}
+
+template <unsigned NumLanes, char LaneKind>
+void ARM64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ std::string Suffix(".");
+ if (NumLanes)
+ Suffix += itostr(NumLanes) + LaneKind;
+ else
+ Suffix += LaneKind;
+
+ printVectorList(MI, OpNum, O, Suffix);
+}
+
+void ARM64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
+
+void ARM64InstPrinter::printAlignedBranchTarget(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNum);
+
+ // If the label has already been resolved to an immediate offset (say, when
+ // we're running the disassembler), just print the immediate.
+ if (Op.isImm()) {
+ O << "#" << (Op.getImm() << 2);
+ return;
+ }
+
+ // If the branch target is simply an address then print it in hex.
+ const MCConstantExpr *BranchTarget =
+ dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+ O << "0x";
+ O.write_hex(Address);
+ } else {
+ // Otherwise, just print the expression.
+ O << *MI->getOperand(OpNum).getExpr();
+ }
+}
+
+void ARM64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNum);
+
+ // If the label has already been resolved to an immediate offset (say, when
+ // we're running the disassembler), just print the immediate.
+ if (Op.isImm()) {
+ O << "#" << (Op.getImm() << 12);
+ return;
+ }
+
+ // Otherwise, just print the expression.
+ O << *MI->getOperand(OpNum).getExpr();
+}
+
+void ARM64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNo).getImm();
+ const char *Name = ARM64SYS::getBarrierOptName((ARM64SYS::BarrierOption)Val);
+ if (Name)
+ O << Name;
+ else
+ O << "#" << Val;
+}
+
+void ARM64InstPrinter::printSystemRegister(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNo).getImm();
+ const char *Name =
+ ARM64SYS::getSystemRegisterName((ARM64SYS::SystemRegister)Val);
+ if (Name) {
+ O << Name;
+ return;
+ }
+
+ unsigned Op0 = 2 | ((Val >> 14) & 1);
+ unsigned Op1 = (Val >> 11) & 7;
+ unsigned CRn = (Val >> 7) & 0xf;
+ unsigned CRm = (Val >> 3) & 0xf;
+ unsigned Op2 = Val & 7;
+
+ O << 'S' << Op0 << '_' << Op1 << "_C" << CRn << "_C" << CRm << '_' << Op2;
+}
+
+void ARM64InstPrinter::printSystemCPSRField(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Val = MI->getOperand(OpNo).getImm();
+ const char *Name = ARM64SYS::getCPSRFieldName((ARM64SYS::CPSRField)Val);
+ O << Name;
+}
+
+void ARM64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned RawVal = MI->getOperand(OpNo).getImm();
+ uint64_t Val = ARM64_AM::decodeAdvSIMDModImmType10(RawVal);
+ O << format("#%#016llx", Val);
+}
diff --git a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h
new file mode 100644
index 0000000..ff66ff0
--- /dev/null
+++ b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h
@@ -0,0 +1,157 @@
+//===-- ARM64InstPrinter.h - Convert ARM64 MCInst to assembly syntax ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM64 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64INSTPRINTER_H
+#define ARM64INSTPRINTER_H
+
+#include "MCTargetDesc/ARM64MCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class ARM64InstPrinter : public MCInstPrinter {
+public:
+ ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
+
+ virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+ virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+
+ // Autogenerated by tblgen.
+ virtual void printInstruction(const MCInst *MI, raw_ostream &O);
+ virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+ virtual StringRef getRegName(unsigned RegNo) const {
+ return getRegisterName(RegNo);
+ }
+ static const char *getRegisterName(unsigned RegNo,
+ unsigned AltIdx = ARM64::NoRegAltName);
+
+protected:
+ bool printSysAlias(const MCInst *MI, raw_ostream &O);
+ // Operand printers
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
+ raw_ostream &O);
+ void printPostIncOperand1(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand2(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand3(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand4(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand6(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand8(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand12(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand16(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand24(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand32(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand48(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPostIncOperand64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printDotCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printAlignedBranchTarget(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
+ void printAMIndexed(const MCInst *MI, unsigned OpNum, unsigned Scale,
+ raw_ostream &O);
+ void printAMIndexed128(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+ printAMIndexed(MI, OpNum, 16, O);
+ }
+
+ void printAMIndexed64(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+ printAMIndexed(MI, OpNum, 8, O);
+ }
+
+ void printAMIndexed32(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+ printAMIndexed(MI, OpNum, 4, O);
+ }
+
+ void printAMIndexed16(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+ printAMIndexed(MI, OpNum, 2, O);
+ }
+
+ void printAMIndexed8(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+ printAMIndexed(MI, OpNum, 1, O);
+ }
+ void printAMUnscaled(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+ printAMIndexed(MI, OpNum, 1, O);
+ }
+ void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printImmScale4(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printImmScale8(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printImmScale16(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printMemoryPostIndexed(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printMemoryPostIndexed32(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
+ void printMemoryPostIndexed64(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
+ void printMemoryPostIndexed128(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
+ void printMemoryRegOffset(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+ int LegalShiftAmt);
+ void printMemoryRegOffset8(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printMemoryRegOffset16(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printMemoryRegOffset32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printMemoryRegOffset64(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printMemoryRegOffset128(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
+
+ void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+ void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+ StringRef LayoutSuffix);
+
+ /// Print a list of vector registers where the type suffix is implicit
+ /// (i.e. attached to the instruction rather than the registers).
+ void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
+
+ template <unsigned NumLanes, char LaneKind>
+ void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+ void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printSystemCPSRField(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+};
+
+class ARM64AppleInstPrinter : public ARM64InstPrinter {
+public:
+ ARM64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
+
+ virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+
+ virtual void printInstruction(const MCInst *MI, raw_ostream &O);
+ virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+ virtual StringRef getRegName(unsigned RegNo) const {
+ return getRegisterName(RegNo);
+ }
+ static const char *getRegisterName(unsigned RegNo,
+ unsigned AltIdx = ARM64::NoRegAltName);
+};
+}
+
+#endif
diff --git a/lib/Target/ARM64/InstPrinter/CMakeLists.txt b/lib/Target/ARM64/InstPrinter/CMakeLists.txt
new file mode 100644
index 0000000..b8ee12c
--- /dev/null
+++ b/lib/Target/ARM64/InstPrinter/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARM64AsmPrinter
+ ARM64InstPrinter.cpp
+ )
+
+add_dependencies(LLVMARM64AsmPrinter ARM64CommonTableGen)
diff --git a/lib/Target/ARM64/InstPrinter/LLVMBuild.txt b/lib/Target/ARM64/InstPrinter/LLVMBuild.txt
new file mode 100644
index 0000000..2ec83d2
--- /dev/null
+++ b/lib/Target/ARM64/InstPrinter/LLVMBuild.txt
@@ -0,0 +1,24 @@
+;===- ./lib/Target/ARM64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ARM64AsmPrinter
+parent = ARM64
+required_libraries = MC Support
+add_to_library_groups = ARM64
+
diff --git a/lib/Target/ARM64/InstPrinter/Makefile b/lib/Target/ARM64/InstPrinter/Makefile
new file mode 100644
index 0000000..a59efb0
--- /dev/null
+++ b/lib/Target/ARM64/InstPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM64/InstPrinter/Makefile ---------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARM64AsmPrinter
+
+# Hack: we need to include 'main' arm target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM64/LLVMBuild.txt b/lib/Target/ARM64/LLVMBuild.txt
new file mode 100644
index 0000000..45b0628
--- /dev/null
+++ b/lib/Target/ARM64/LLVMBuild.txt
@@ -0,0 +1,36 @@
+;===- ./lib/Target/ARM64/LLVMBuild.txt -------------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[common]
+subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo
+
+[component_0]
+type = TargetGroup
+name = ARM64
+parent = Target
+has_asmparser = 1
+has_asmprinter = 1
+has_disassembler = 1
+has_jit = 1
+
+[component_1]
+type = Library
+name = ARM64CodeGen
+parent = ARM64
+required_libraries = ARM64AsmPrinter ARM64Desc ARM64Info Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target
+add_to_library_groups = ARM64
+
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h b/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h
new file mode 100644
index 0000000..7717743
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h
@@ -0,0 +1,758 @@
+//===- ARM64AddressingModes.h - ARM64 Addressing Modes ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM64 addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H
+#define LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+namespace llvm {
+
+/// ARM64_AM - ARM64 Addressing Mode Stuff
+namespace ARM64_AM {
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//
+
+enum ShiftType {
+ InvalidShift = -1,
+ LSL = 0,
+ LSR = 1,
+ ASR = 2,
+ ROR = 3,
+ MSL = 4
+};
+
+/// getShiftName - Get the string encoding for the shift type.
+static inline const char *getShiftName(ARM64_AM::ShiftType ST) {
+ switch (ST) {
+ default: assert(false && "unhandled shift type!");
+ case ARM64_AM::LSL: return "lsl";
+ case ARM64_AM::LSR: return "lsr";
+ case ARM64_AM::ASR: return "asr";
+ case ARM64_AM::ROR: return "ror";
+ case ARM64_AM::MSL: return "msl";
+ }
+ return 0;
+}
+
+/// getShiftType - Extract the shift type.
+static inline ARM64_AM::ShiftType getShiftType(unsigned Imm) {
+ return ARM64_AM::ShiftType((Imm >> 6) & 0x7);
+}
+
+/// getShiftValue - Extract the shift value.
+static inline unsigned getShiftValue(unsigned Imm) {
+ return Imm & 0x3f;
+}
+
+/// getShifterImm - Encode the shift type and amount:
+/// imm: 6-bit shift amount
+/// shifter: 000 ==> lsl
+/// 001 ==> lsr
+/// 010 ==> asr
+/// 011 ==> ror
+/// 100 ==> msl
+/// {8-6} = shifter
+/// {5-0} = imm
+static inline unsigned getShifterImm(ARM64_AM::ShiftType ST, unsigned Imm) {
+  assert((Imm & 0x3f) == Imm && "Illegal shifted immediate value!");
+ return (unsigned(ST) << 6) | (Imm & 0x3f);
+}
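+
+// Worked example (illustrative): "asr #3" packs as
+//   getShifterImm(ARM64_AM::ASR, 3) == (2 << 6) | 3 == 0x83,
+// and unpacks via getShiftType(0x83) == ASR, getShiftValue(0x83) == 3.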
+
+//===----------------------------------------------------------------------===//
+// Extends
+//
+
+enum ExtendType {
+ InvalidExtend = -1,
+ UXTB = 0,
+ UXTH = 1,
+ UXTW = 2,
+ UXTX = 3,
+ SXTB = 4,
+ SXTH = 5,
+ SXTW = 6,
+ SXTX = 7
+};
+
+/// getExtendName - Get the string encoding for the extend type.
+static inline const char *getExtendName(ARM64_AM::ExtendType ET) {
+ switch (ET) {
+ default: assert(false && "unhandled extend type!");
+ case ARM64_AM::UXTB: return "uxtb";
+ case ARM64_AM::UXTH: return "uxth";
+ case ARM64_AM::UXTW: return "uxtw";
+ case ARM64_AM::UXTX: return "uxtx";
+ case ARM64_AM::SXTB: return "sxtb";
+ case ARM64_AM::SXTH: return "sxth";
+ case ARM64_AM::SXTW: return "sxtw";
+ case ARM64_AM::SXTX: return "sxtx";
+ }
+ return 0;
+}
+
+/// getArithShiftValue - get the arithmetic shift value.
+static inline unsigned getArithShiftValue(unsigned Imm) {
+ return Imm & 0x7;
+}
+
+/// getExtendType - Extract the extend type for operands of arithmetic ops.
+static inline ARM64_AM::ExtendType getArithExtendType(unsigned Imm) {
+ return ARM64_AM::ExtendType((Imm >> 3) & 0x7);
+}
+
+/// getArithExtendImm - Encode the extend type and shift amount for an
+/// arithmetic instruction:
+/// imm: 3-bit extend amount
+/// shifter: 000 ==> uxtb
+/// 001 ==> uxth
+/// 010 ==> uxtw
+/// 011 ==> uxtx
+/// 100 ==> sxtb
+/// 101 ==> sxth
+/// 110 ==> sxtw
+/// 111 ==> sxtx
+/// {5-3} = shifter
+/// {2-0} = imm3
+static inline unsigned getArithExtendImm(ARM64_AM::ExtendType ET,
+ unsigned Imm) {
+  assert((Imm & 0x7) == Imm && "Illegal shifted immediate value!");
+ return (unsigned(ET) << 3) | (Imm & 0x7);
+}
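+
+// Worked example (illustrative): "sxtw #2" packs as
+//   getArithExtendImm(ARM64_AM::SXTW, 2) == (6 << 3) | 2 == 0x32,
+// with getArithExtendType(0x32) == SXTW and getArithShiftValue(0x32) == 2.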
+
+/// getMemDoShift - Extract the "do shift" flag value for load/store
+/// instructions.
+static inline bool getMemDoShift(unsigned Imm) {
+ return (Imm & 0x1) != 0;
+}
+
+/// getExtendType - Extract the extend type for the offset operand of
+/// loads/stores.
+static inline ARM64_AM::ExtendType getMemExtendType(unsigned Imm) {
+ return ARM64_AM::ExtendType((Imm >> 1) & 0x7);
+}
+
+/// getExtendImm - Encode the extend type and amount for a load/store inst:
+/// doshift: should the offset be scaled by the access size
+/// shifter: 000 ==> uxtb
+/// 001 ==> uxth
+/// 010 ==> uxtw
+/// 011 ==> uxtx
+/// 100 ==> sxtb
+/// 101 ==> sxth
+/// 110 ==> sxtw
+/// 111 ==> sxtx
+/// {3-1} = shifter
+/// {0} = doshift
+static inline unsigned getMemExtendImm(ARM64_AM::ExtendType ET, bool DoShift) {
+ return (unsigned(ET) << 1) | unsigned(DoShift);
+}
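+
+// Worked example (illustrative): a scaled "uxtw" offset packs as
+//   getMemExtendImm(ARM64_AM::UXTW, true) == (2 << 1) | 1 == 5,
+// so getMemExtendType(5) == UXTW and getMemDoShift(5) is true.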
+
+//===----------------------------------------------------------------------===//
+// Prefetch
+//
+
+/// Pre-fetch operator names.
+/// The enum values match the encoding values:
+/// prfop<4:3> 00=preload data, 10=prepare for store
+/// prfop<2:1> 00=target L1 cache, 01=target L2 cache, 10=target L3 cache,
+/// prfop<0> 0=non-streaming (temporal), 1=streaming (non-temporal)
+enum PrefetchOp {
+ InvalidPrefetchOp = -1,
+ PLDL1KEEP = 0x00,
+ PLDL1STRM = 0x01,
+ PLDL2KEEP = 0x02,
+ PLDL2STRM = 0x03,
+ PLDL3KEEP = 0x04,
+ PLDL3STRM = 0x05,
+ PSTL1KEEP = 0x10,
+ PSTL1STRM = 0x11,
+ PSTL2KEEP = 0x12,
+ PSTL2STRM = 0x13,
+ PSTL3KEEP = 0x14,
+ PSTL3STRM = 0x15
+};
+
+/// isNamedPrefetchOp - Check if the prefetch-op 5-bit value has a name.
+static inline bool isNamedPrefetchOp(unsigned prfop) {
+ switch (prfop) {
+ default: return false;
+ case ARM64_AM::PLDL1KEEP: case ARM64_AM::PLDL1STRM: case ARM64_AM::PLDL2KEEP:
+ case ARM64_AM::PLDL2STRM: case ARM64_AM::PLDL3KEEP: case ARM64_AM::PLDL3STRM:
+ case ARM64_AM::PSTL1KEEP: case ARM64_AM::PSTL1STRM: case ARM64_AM::PSTL2KEEP:
+ case ARM64_AM::PSTL2STRM: case ARM64_AM::PSTL3KEEP: case ARM64_AM::PSTL3STRM:
+ return true;
+ }
+}
+
+
+/// getPrefetchOpName - Get the string encoding for the prefetch operator.
+static inline const char *getPrefetchOpName(ARM64_AM::PrefetchOp prfop) {
+ switch (prfop) {
+ default: assert(false && "unhandled prefetch-op type!");
+ case ARM64_AM::PLDL1KEEP: return "pldl1keep";
+ case ARM64_AM::PLDL1STRM: return "pldl1strm";
+ case ARM64_AM::PLDL2KEEP: return "pldl2keep";
+ case ARM64_AM::PLDL2STRM: return "pldl2strm";
+ case ARM64_AM::PLDL3KEEP: return "pldl3keep";
+ case ARM64_AM::PLDL3STRM: return "pldl3strm";
+ case ARM64_AM::PSTL1KEEP: return "pstl1keep";
+ case ARM64_AM::PSTL1STRM: return "pstl1strm";
+ case ARM64_AM::PSTL2KEEP: return "pstl2keep";
+ case ARM64_AM::PSTL2STRM: return "pstl2strm";
+ case ARM64_AM::PSTL3KEEP: return "pstl3keep";
+ case ARM64_AM::PSTL3STRM: return "pstl3strm";
+ }
+ return 0;
+}
+
+static inline uint64_t ror(uint64_t elt, unsigned size) {
+ return ((elt & 1) << (size-1)) | (elt >> 1);
+}
+
+/// processLogicalImmediate - Determine if an immediate value can be encoded
+/// as the immediate operand of a logical instruction for the given register
+/// size. If so, return true with "encoding" set to the encoded value in
+/// the form N:immr:imms.
+static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize,
+ uint64_t &encoding) {
+ if (imm == 0ULL || imm == ~0ULL ||
+ (regSize != 64 && (imm >> regSize != 0 || imm == ~0U)))
+ return false;
+
+ unsigned size = 2;
+ uint64_t eltVal = imm;
+
+ // First, determine the element size.
+ while (size < regSize) {
+ unsigned numElts = regSize / size;
+ unsigned mask = (1ULL << size) - 1;
+ uint64_t lowestEltVal = imm & mask;
+
+ bool allMatched = true;
+ for (unsigned i = 1; i < numElts; ++i) {
+ uint64_t currEltVal = (imm >> (i*size)) & mask;
+ if (currEltVal != lowestEltVal) {
+ allMatched = false;
+ break;
+ }
+ }
+
+ if (allMatched) {
+ eltVal = lowestEltVal;
+ break;
+ }
+
+ size *= 2;
+ }
+
+ // Second, determine the rotation to make the element be: 0^m 1^n.
+ for (unsigned i = 0; i < size; ++i) {
+ eltVal = ror(eltVal, size);
+ uint32_t clz = countLeadingZeros(eltVal) - (64 - size);
+ uint32_t cto = CountTrailingOnes_64(eltVal);
+
+ if (clz + cto == size) {
+ // Encode in immr the number of RORs it would take to get *from* this
+ // element value to our target value, where i+1 is the number of RORs
+ // to go the opposite direction.
+ unsigned immr = size - (i + 1);
+
+ // If size has a 1 in the n'th bit, create a value that has zeroes in
+ // bits [0, n] and ones above that.
+ uint64_t nimms = ~(size-1) << 1;
+
+      // Or the CTO value into the low bits, which must be below the Nth bit
+      // mentioned above.
+ nimms |= (cto-1);
+
+ // Extract the seventh bit and toggle it to create the N field.
+ unsigned N = ((nimms >> 6) & 1) ^ 1;
+
+ encoding = (N << 12) | (immr << 6) | (nimms & 0x3f);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// isLogicalImmediate - Return true if the immediate is valid for a logical
+/// immediate instruction of the given register size. Return false otherwise.
+static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) {
+ uint64_t encoding;
+ return processLogicalImmediate(imm, regSize, encoding);
+}
+
+/// encodeLogicalImmediate - Return the encoded immediate value for a logical
+/// immediate instruction of the given register size.
+static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) {
+ uint64_t encoding = 0;
+ bool res = processLogicalImmediate(imm, regSize, encoding);
+ assert(res && "invalid logical immediate");
+ (void)res;
+ return encoding;
+}
+
+/// decodeLogicalImmediate - Decode a logical immediate value in the form
+/// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the
+/// integer value it represents with regSize bits.
+static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) {
+ // Extract the N, imms, and immr fields.
+ unsigned N = (val >> 12) & 1;
+ unsigned immr = (val >> 6) & 0x3f;
+ unsigned imms = val & 0x3f;
+
+ assert((regSize == 64 || N == 0) && "undefined logical immediate encoding");
+ int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f));
+ assert(len >= 0 && "undefined logical immediate encoding");
+ unsigned size = (1 << len);
+ unsigned R = immr & (size - 1);
+ unsigned S = imms & (size - 1);
+ assert(S != size - 1 && "undefined logical immediate encoding");
+ uint64_t pattern = (1ULL << (S + 1)) - 1;
+ for (unsigned i = 0; i < R; ++i)
+ pattern = ror(pattern, size);
+
+ // Replicate the pattern to fill the regSize.
+ while (size != regSize) {
+ pattern |= (pattern << size);
+ size *= 2;
+ }
+ return pattern;
+}
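+
+// Worked example of the N:immr:imms scheme (illustrative): the 64-bit value
+// 0x5555555555555555 replicates the 2-bit element 0b01, so
+//   encodeLogicalImmediate(0x5555555555555555ULL, 64) == 0x03c
+// (N=0, immr=0, imms=0b111100), and decodeLogicalImmediate(0x03c, 64)
+// rebuilds the original pattern.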
+
+/// isValidDecodeLogicalImmediate - Check to see if the logical immediate value
+/// in the form "N:immr:imms" (where the immr and imms fields are each 6 bits)
+/// is a valid encoding for an integer value with regSize bits.
+static inline bool isValidDecodeLogicalImmediate(uint64_t val,
+ unsigned regSize) {
+ // Extract the N and imms fields needed for checking.
+ unsigned N = (val >> 12) & 1;
+ unsigned imms = val & 0x3f;
+
+ if (regSize == 32 && N != 0) // undefined logical immediate encoding
+ return false;
+ int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f));
+ if (len < 0) // undefined logical immediate encoding
+ return false;
+ unsigned size = (1 << len);
+ unsigned S = imms & (size - 1);
+ if (S == size - 1) // undefined logical immediate encoding
+ return false;
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-point Immediates
+//
+static inline float getFPImmFloat(unsigned Imm) {
+ // We expect an 8-bit binary encoding of a floating-point number here.
+ union {
+ uint32_t I;
+ float F;
+ } FPUnion;
+
+ uint8_t Sign = (Imm >> 7) & 0x1;
+ uint8_t Exp = (Imm >> 4) & 0x7;
+ uint8_t Mantissa = Imm & 0xf;
+
+  // 8-bit FP IEEE Float Encoding
+ // abcd efgh aBbbbbbc defgh000 00000000 00000000
+ //
+ // where B = NOT(b);
+
+ FPUnion.I = 0;
+ FPUnion.I |= Sign << 31;
+ FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
+ FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
+ FPUnion.I |= (Exp & 0x3) << 23;
+ FPUnion.I |= Mantissa << 19;
+ return FPUnion.F;
+}
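+
+// Worked example (illustrative): imm8 == 0x70 has Sign=0, Exp=0b111,
+// Mantissa=0 and expands to the word 0x3f800000, i.e. 1.0f; getFP32Imm()
+// below maps the bit pattern of 1.0f back to 0x70.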
+
+/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
+/// floating-point value. If the value cannot be represented as an 8-bit
+/// floating-point value, then return -1.
+static inline int getFP32Imm(const APInt &Imm) {
+ uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
+ int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127
+ int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0x7ffff)
+ return -1;
+ Mantissa >>= 19;
+ if ((Mantissa & 0xf) != Mantissa)
+ return -1;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4;
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+}
+
+static inline int getFP32Imm(const APFloat &FPImm) {
+ return getFP32Imm(FPImm.bitcastToAPInt());
+}
+
+/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
+/// floating-point value. If the value cannot be represented as an 8-bit
+/// floating-point value, then return -1.
+static inline int getFP64Imm(const APInt &Imm) {
+ uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
+ int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
+ uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0xffffffffffffULL)
+ return -1;
+ Mantissa >>= 48;
+ if ((Mantissa & 0xf) != Mantissa)
+ return -1;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4;
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+}
+
+static inline int getFP64Imm(const APFloat &FPImm) {
+ return getFP64Imm(FPImm.bitcastToAPInt());
+}
+
+//===--------------------------------------------------------------------===//
+// AdvSIMD Modified Immediates
+//===--------------------------------------------------------------------===//
+
+// 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh
+static inline bool isAdvSIMDModImmType1(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm & 0xffffff00ffffff00ULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) {
+ return (Imm & 0xffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 32) | EncVal;
+}
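+
+// Worked example for Type1 (illustrative): 0x000000ab000000abULL matches the
+// byte pattern above; encodeAdvSIMDModImmType1() extracts 0xab and
+// decodeAdvSIMDModImmType1(0xab) rebuilds 0x000000ab000000abULL.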
+
+// 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00
+static inline bool isAdvSIMDModImmType2(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm & 0xffff00ffffff00ffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) {
+ return (Imm & 0xff00ULL) >> 8;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 40) | (EncVal << 8);
+}
+
+// 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00
+static inline bool isAdvSIMDModImmType3(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm & 0xff00ffffff00ffffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) {
+ return (Imm & 0xff0000ULL) >> 16;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 48) | (EncVal << 16);
+}
+
+// abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00
+static inline bool isAdvSIMDModImmType4(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm & 0x00ffffff00ffffffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) {
+ return (Imm & 0xff000000ULL) >> 24;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 56) | (EncVal << 24);
+}
+
+// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh
+static inline bool isAdvSIMDModImmType5(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ (((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) &&
+ ((Imm & 0xff00ff00ff00ff00ULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) {
+ return (Imm & 0xffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal;
+}
+
+// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00
+static inline bool isAdvSIMDModImmType6(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ (((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) &&
+ ((Imm & 0x00ff00ff00ff00ffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) {
+ return (Imm & 0xff00ULL) >> 8;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8);
+}
+
+// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF
+static inline bool isAdvSIMDModImmType7(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) {
+ return (Imm & 0xff00ULL) >> 8;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL;
+}
+
+// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF
+static inline bool isAdvSIMDModImmType8(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL;
+}
+
+static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) {
+ return (Imm & 0x00ff0000ULL) >> 16;
+}
+
+// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh
+static inline bool isAdvSIMDModImmType9(uint64_t Imm) {
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ ((Imm >> 48) == (Imm & 0x0000ffffULL)) &&
+ ((Imm >> 56) == (Imm & 0x000000ffULL));
+}
+
+static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) {
+ return (Imm & 0xffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) {
+ uint64_t EncVal = Imm;
+ EncVal |= (EncVal << 8);
+ EncVal |= (EncVal << 16);
+ EncVal |= (EncVal << 32);
+ return EncVal;
+}
+
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// cmode: 1110, op: 1
+static inline bool isAdvSIMDModImmType10(uint64_t Imm) {
+ uint64_t ByteA = Imm & 0xff00000000000000ULL;
+ uint64_t ByteB = Imm & 0x00ff000000000000ULL;
+ uint64_t ByteC = Imm & 0x0000ff0000000000ULL;
+ uint64_t ByteD = Imm & 0x000000ff00000000ULL;
+ uint64_t ByteE = Imm & 0x00000000ff000000ULL;
+ uint64_t ByteF = Imm & 0x0000000000ff0000ULL;
+ uint64_t ByteG = Imm & 0x000000000000ff00ULL;
+ uint64_t ByteH = Imm & 0x00000000000000ffULL;
+
+ return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) &&
+ (ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) &&
+ (ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) &&
+ (ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) &&
+ (ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) &&
+ (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) &&
+ (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) &&
+ (ByteH == 0ULL || ByteH == 0x00000000000000ffULL);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) {
+ uint8_t BitA = (Imm & 0xff00000000000000ULL) != 0;
+ uint8_t BitB = (Imm & 0x00ff000000000000ULL) != 0;
+ uint8_t BitC = (Imm & 0x0000ff0000000000ULL) != 0;
+ uint8_t BitD = (Imm & 0x000000ff00000000ULL) != 0;
+ uint8_t BitE = (Imm & 0x00000000ff000000ULL) != 0;
+ uint8_t BitF = (Imm & 0x0000000000ff0000ULL) != 0;
+ uint8_t BitG = (Imm & 0x000000000000ff00ULL) != 0;
+ uint8_t BitH = (Imm & 0x00000000000000ffULL) != 0;
+
+ uint8_t EncVal = BitA;
+ EncVal <<= 1;
+ EncVal |= BitB;
+ EncVal <<= 1;
+ EncVal |= BitC;
+ EncVal <<= 1;
+ EncVal |= BitD;
+ EncVal <<= 1;
+ EncVal |= BitE;
+ EncVal <<= 1;
+ EncVal |= BitF;
+ EncVal <<= 1;
+ EncVal |= BitG;
+ EncVal <<= 1;
+ EncVal |= BitH;
+ return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) {
+ uint64_t EncVal = 0;
+ if (Imm & 0x80) EncVal |= 0xff00000000000000ULL;
+ if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL;
+ if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL;
+ if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL;
+ if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL;
+ if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL;
+ if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL;
+ if (Imm & 0x01) EncVal |= 0x00000000000000ffULL;
+ return EncVal;
+}
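+// For example, the per-byte mask 0xff00ff0000ff00ff encodes to 0xa5 (bits
+// a, c, f and h set), and decodeAdvSIMDModImmType10(0xa5) expands each set
+// bit back into an 0xff byte.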
+
+// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00
+static inline bool isAdvSIMDModImmType11(uint64_t Imm) {
+ uint64_t BString = (Imm & 0x7E000000ULL) >> 25;
+ return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+ (BString == 0x1f || BString == 0x20) &&
+ ((Imm & 0x0007ffff0007ffffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) {
+ uint8_t BitA = (Imm & 0x80000000ULL) != 0;
+ uint8_t BitB = (Imm & 0x20000000ULL) != 0;
+ uint8_t BitC = (Imm & 0x01000000ULL) != 0;
+ uint8_t BitD = (Imm & 0x00800000ULL) != 0;
+ uint8_t BitE = (Imm & 0x00400000ULL) != 0;
+ uint8_t BitF = (Imm & 0x00200000ULL) != 0;
+ uint8_t BitG = (Imm & 0x00100000ULL) != 0;
+ uint8_t BitH = (Imm & 0x00080000ULL) != 0;
+
+ uint8_t EncVal = BitA;
+ EncVal <<= 1;
+ EncVal |= BitB;
+ EncVal <<= 1;
+ EncVal |= BitC;
+ EncVal <<= 1;
+ EncVal |= BitD;
+ EncVal <<= 1;
+ EncVal |= BitE;
+ EncVal <<= 1;
+ EncVal |= BitF;
+ EncVal <<= 1;
+ EncVal |= BitG;
+ EncVal <<= 1;
+ EncVal |= BitH;
+ return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) {
+ uint64_t EncVal = 0;
+ if (Imm & 0x80) EncVal |= 0x80000000ULL;
+ if (Imm & 0x40) EncVal |= 0x3e000000ULL;
+ else EncVal |= 0x40000000ULL;
+ if (Imm & 0x20) EncVal |= 0x01000000ULL;
+ if (Imm & 0x10) EncVal |= 0x00800000ULL;
+ if (Imm & 0x08) EncVal |= 0x00400000ULL;
+ if (Imm & 0x04) EncVal |= 0x00200000ULL;
+ if (Imm & 0x02) EncVal |= 0x00100000ULL;
+ if (Imm & 0x01) EncVal |= 0x00080000ULL;
+ return (EncVal << 32) | EncVal;
+}
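+// Type 11 corresponds to the per-lane 32-bit floating-point immediate: for a
+// splat of 1.0f, encodeAdvSIMDModImmType11(0x3f8000003f800000) returns 0x70
+// and decodeAdvSIMDModImmType11(0x70) restores 0x3f8000003f800000.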
+
+// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00
+static inline bool isAdvSIMDModImmType12(uint64_t Imm) {
+ uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54;
+ return ((BString == 0xff || BString == 0x100) &&
+ ((Imm & 0x0000ffffffffffffULL) == 0));
+}
+
+static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) {
+ uint8_t BitA = (Imm & 0x8000000000000000ULL) != 0;
+ uint8_t BitB = (Imm & 0x0040000000000000ULL) != 0;
+ uint8_t BitC = (Imm & 0x0020000000000000ULL) != 0;
+ uint8_t BitD = (Imm & 0x0010000000000000ULL) != 0;
+ uint8_t BitE = (Imm & 0x0008000000000000ULL) != 0;
+ uint8_t BitF = (Imm & 0x0004000000000000ULL) != 0;
+ uint8_t BitG = (Imm & 0x0002000000000000ULL) != 0;
+ uint8_t BitH = (Imm & 0x0001000000000000ULL) != 0;
+
+ uint8_t EncVal = BitA;
+ EncVal <<= 1;
+ EncVal |= BitB;
+ EncVal <<= 1;
+ EncVal |= BitC;
+ EncVal <<= 1;
+ EncVal |= BitD;
+ EncVal <<= 1;
+ EncVal |= BitE;
+ EncVal <<= 1;
+ EncVal |= BitF;
+ EncVal <<= 1;
+ EncVal |= BitG;
+ EncVal <<= 1;
+ EncVal |= BitH;
+ return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) {
+ uint64_t EncVal = 0;
+ if (Imm & 0x80) EncVal |= 0x8000000000000000ULL;
+ if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL;
+ else EncVal |= 0x4000000000000000ULL;
+ if (Imm & 0x20) EncVal |= 0x0020000000000000ULL;
+ if (Imm & 0x10) EncVal |= 0x0010000000000000ULL;
+ if (Imm & 0x08) EncVal |= 0x0008000000000000ULL;
+ if (Imm & 0x04) EncVal |= 0x0004000000000000ULL;
+ if (Imm & 0x02) EncVal |= 0x0002000000000000ULL;
+ if (Imm & 0x01) EncVal |= 0x0001000000000000ULL;
+ return (EncVal << 32) | EncVal;
+}
+
+} // end namespace ARM64_AM
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp
new file mode 100644
index 0000000..26813e2
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp
@@ -0,0 +1,533 @@
+//===-- ARM64AsmBackend.cpp - ARM64 Assembler Backend ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM64.h"
+#include "ARM64RegisterInfo.h"
+#include "MCTargetDesc/ARM64FixupKinds.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+
+class ARM64AsmBackend : public MCAsmBackend {
+ static const unsigned PCRelFlagVal =
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel;
+
+public:
+ ARM64AsmBackend(const Target &T) : MCAsmBackend() {}
+
+ unsigned getNumFixupKinds() const { return ARM64::NumTargetFixupKinds; }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+ const static MCFixupKindInfo Infos[ARM64::NumTargetFixupKinds] = {
+ // This table *must* be in the order that the fixup_* kinds are defined in
+ // ARM64FixupKinds.h.
+ //
+ // Name Offset (bits) Size (bits) Flags
+ { "fixup_arm64_pcrel_adr_imm21", 0, 32, PCRelFlagVal },
+ { "fixup_arm64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal },
+ { "fixup_arm64_add_imm12", 10, 12, 0 },
+ { "fixup_arm64_ldst_imm12_scale1", 10, 12, 0 },
+ { "fixup_arm64_ldst_imm12_scale2", 10, 12, 0 },
+ { "fixup_arm64_ldst_imm12_scale4", 10, 12, 0 },
+ { "fixup_arm64_ldst_imm12_scale8", 10, 12, 0 },
+ { "fixup_arm64_ldst_imm12_scale16", 10, 12, 0 },
+ { "fixup_arm64_movw", 5, 16, 0 },
+ { "fixup_arm64_pcrel_branch14", 5, 14, PCRelFlagVal },
+ { "fixup_arm64_pcrel_imm19", 5, 19, PCRelFlagVal },
+ { "fixup_arm64_pcrel_branch26", 0, 26, PCRelFlagVal },
+ { "fixup_arm64_pcrel_call26", 0, 26, PCRelFlagVal },
+ { "fixup_arm64_tlsdesc_call", 0, 0, 0 }
+ };
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+ }
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const;
+
+ bool mayNeedRelaxation(const MCInst &Inst) const;
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const;
+ void relaxInstruction(const MCInst &Inst, MCInst &Res) const;
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
+
+ void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
+
+ unsigned getPointerSize() const { return 8; }
+};
+
+} // end anonymous namespace
+
+/// \brief The number of bytes the fixup may change.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+ switch (Kind) {
+ default:
+ assert(0 && "Unknown fixup kind!");
+
+ case ARM64::fixup_arm64_tlsdesc_call:
+ return 0;
+
+ case FK_Data_1:
+ return 1;
+
+ case FK_Data_2:
+ case ARM64::fixup_arm64_movw:
+ return 2;
+
+ case ARM64::fixup_arm64_pcrel_branch14:
+ case ARM64::fixup_arm64_add_imm12:
+ case ARM64::fixup_arm64_ldst_imm12_scale1:
+ case ARM64::fixup_arm64_ldst_imm12_scale2:
+ case ARM64::fixup_arm64_ldst_imm12_scale4:
+ case ARM64::fixup_arm64_ldst_imm12_scale8:
+ case ARM64::fixup_arm64_ldst_imm12_scale16:
+ case ARM64::fixup_arm64_pcrel_imm19:
+ return 3;
+
+ case ARM64::fixup_arm64_pcrel_adr_imm21:
+ case ARM64::fixup_arm64_pcrel_adrp_imm21:
+ case ARM64::fixup_arm64_pcrel_branch26:
+ case ARM64::fixup_arm64_pcrel_call26:
+ case FK_Data_4:
+ return 4;
+
+ case FK_Data_8:
+ return 8;
+ }
+}
+
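+// AdrImmBits splits a 21-bit ADR/ADRP immediate into the instruction's immlo
+// (bits 29-30) and immhi (bits 5-23) fields; e.g. AdrImmBits(0x12345) yields
+// 0x20091a20 (immlo = 1, immhi = 0x48d1).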
+static unsigned AdrImmBits(unsigned Value) {
+ unsigned lo2 = Value & 0x3;
+ unsigned hi19 = (Value & 0x1ffffc) >> 2;
+ return (hi19 << 5) | (lo2 << 29);
+}
+
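+// Check the fixup value against the constraints of its kind and shift it into
+// the bit positions the instruction expects. For example, a
+// fixup_arm64_ldst_imm12_scale8 value of 80 becomes 80 >> 3 == 10, and a
+// fixup_arm64_pcrel_imm19 value of 0x1004 drops its low two bits to give 0x401.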
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+ int64_t SignedValue = static_cast<int64_t>(Value);
+ switch (Kind) {
+ default:
+ assert(false && "Unknown fixup kind!");
+ case ARM64::fixup_arm64_pcrel_adr_imm21:
+ if (SignedValue > 2097151 || SignedValue < -2097152)
+ report_fatal_error("fixup value out of range");
+ return AdrImmBits(Value & 0x1fffffULL);
+ case ARM64::fixup_arm64_pcrel_adrp_imm21:
+ return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
+ case ARM64::fixup_arm64_pcrel_imm19:
+ // Signed 21-bit immediate
+ if (SignedValue > 2097151 || SignedValue < -2097152)
+ report_fatal_error("fixup value out of range");
+ // Low two bits are not encoded.
+ return (Value >> 2) & 0x7ffff;
+ case ARM64::fixup_arm64_add_imm12:
+ case ARM64::fixup_arm64_ldst_imm12_scale1:
+ // Unsigned 12-bit immediate
+ if (Value >= 0x1000)
+ report_fatal_error("invalid imm12 fixup value");
+ return Value;
+ case ARM64::fixup_arm64_ldst_imm12_scale2:
+ // Unsigned 12-bit immediate which gets multiplied by 2
+ if (Value & 1 || Value >= 0x2000)
+ report_fatal_error("invalid imm12 fixup value");
+ return Value >> 1;
+ case ARM64::fixup_arm64_ldst_imm12_scale4:
+ // Unsigned 12-bit immediate which gets multiplied by 4
+ if (Value & 3 || Value >= 0x4000)
+ report_fatal_error("invalid imm12 fixup value");
+ return Value >> 2;
+ case ARM64::fixup_arm64_ldst_imm12_scale8:
+ // Unsigned 12-bit immediate which gets multiplied by 8
+ if (Value & 7 || Value >= 0x8000)
+ report_fatal_error("invalid imm12 fixup value");
+ return Value >> 3;
+ case ARM64::fixup_arm64_ldst_imm12_scale16:
+ // Unsigned 12-bit immediate which gets multiplied by 16
+ if (Value & 15 || Value >= 0x10000)
+ report_fatal_error("invalid imm12 fixup value");
+ return Value >> 4;
+ case ARM64::fixup_arm64_movw:
+ report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet");
+ return Value;
+ case ARM64::fixup_arm64_pcrel_branch14:
+ // Signed 16-bit immediate
+ if (SignedValue > 32767 || SignedValue < -32768)
+ report_fatal_error("fixup value out of range");
+ // Low two bits are not encoded (4-byte alignment assumed).
+ if (Value & 0x3)
+ report_fatal_error("fixup not sufficiently aligned");
+ return (Value >> 2) & 0x3fff;
+ case ARM64::fixup_arm64_pcrel_branch26:
+ case ARM64::fixup_arm64_pcrel_call26:
+ // Signed 28-bit immediate
+ if (SignedValue > 134217727 || SignedValue < -134217728)
+ report_fatal_error("fixup value out of range");
+ // Low two bits are not encoded (4-byte alignment assumed).
+ if (Value & 0x3)
+ report_fatal_error("fixup not sufficiently aligned");
+ return (Value >> 2) & 0x3ffffff;
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_Data_8:
+ return Value;
+ }
+}
+
+void ARM64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+ unsigned DataSize, uint64_t Value,
+ bool IsPCRel) const {
+ unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+ if (!Value)
+ return; // Doesn't change encoding.
+ MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+ // Apply any target-specific value adjustments.
+ Value = adjustFixupValue(Fixup.getKind(), Value);
+
+ // Shift the value into position.
+ Value <<= Info.TargetOffset;
+
+ unsigned Offset = Fixup.getOffset();
+ assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+ // For each byte of the fragment that the fixup touches, mask in the
+ // bits from the fixup value.
+ for (unsigned i = 0; i != NumBytes; ++i)
+ Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+bool ARM64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+ return false;
+}
+
+bool ARM64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const {
+ // FIXME: This isn't correct for ARM64. Just moving the "generic" logic
+ // into the targets for now.
+ //
+ // Relax if the value is too big for a (signed) i8.
+ return int64_t(Value) != int64_t(int8_t(Value));
+}
+
+void ARM64AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+ assert(false && "ARM64AsmBackend::relaxInstruction() unimplemented");
+}
+
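+// Write any non-multiple-of-4 remainder as zero bytes and the rest as NOP
+// words; e.g. a request for 7 bytes of padding emits three zero bytes followed
+// by one NOP (0xd503201f).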
+bool ARM64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ // If the count is not 4-byte aligned, we must be writing data into the text
+ // section (otherwise we have unaligned instructions, and thus have far
+ // bigger problems), so just write zeros instead.
+ if ((Count & 3) != 0) {
+ for (uint64_t i = 0, e = (Count & 3); i != e; ++i)
+ OW->Write8(0);
+ }
+
+ // We are properly aligned, so write NOPs as requested.
+ Count /= 4;
+ for (uint64_t i = 0; i != Count; ++i)
+ OW->Write32(0xd503201f);
+ return true;
+}
+
+namespace {
+
+namespace CU {
+
+/// \brief Compact unwind encoding values.
+enum CompactUnwindEncodings {
+ /// \brief A "frameless" leaf function, where no non-volatile registers are
+ /// saved. The return address remains in LR throughout the function.
+ UNWIND_ARM64_MODE_FRAMELESS = 0x02000000,
+
+ /// \brief No compact unwind encoding available. Instead the low 23 bits of
+ /// the compact unwind encoding are the offset of the DWARF FDE in the
+ /// __eh_frame section. This mode is never used in object files. It is only
+ /// generated by the linker in final linked images, which have only DWARF info
+ /// for a function.
+ UNWIND_ARM64_MODE_DWARF = 0x03000000,
+
+ /// \brief This is a standard arm64 prologue where FP/LR are immediately
+ /// pushed on the stack, then SP is copied to FP. If there are any
+ /// non-volatile registers saved, they are copied into the stack frame in pairs
+ /// in a contiguous range right below the saved FP/LR pair. Any subset of the
+ /// five X pairs and four D pairs can be saved, but the memory layout must be
+ /// in register number order.
+ UNWIND_ARM64_MODE_FRAME = 0x04000000,
+
+ /// \brief Frame register pair encodings.
+ UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001,
+ UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002,
+ UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004,
+ UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008,
+ UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010,
+ UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100,
+ UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200,
+ UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400,
+ UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800
+};
+
+} // end CU namespace
+
+// FIXME: This should be in a separate file.
+class DarwinARM64AsmBackend : public ARM64AsmBackend {
+ const MCRegisterInfo &MRI;
+
+ /// \brief Encode compact unwind stack adjustment for frameless functions.
+ /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
+ /// The stack size always needs to be 16-byte aligned.
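+ /// For example, a 48-byte stack is encoded as (48 / 16) << 12 == 0x3000.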
+ uint32_t encodeStackAdjustment(uint32_t StackSize) const {
+ return (StackSize / 16) << 12;
+ }
+
+public:
+ DarwinARM64AsmBackend(const Target &T, const MCRegisterInfo &MRI)
+ : ARM64AsmBackend(T), MRI(MRI) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return createARM64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
+ MachO::CPU_SUBTYPE_ARM64_ALL);
+ }
+
+ virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+ // Any section for which the linker breaks things into atoms needs to
+ // preserve symbols, including assembler local symbols, to identify
+ // those atoms. These sections are:
+ // Sections of type:
+ //
+ // S_CSTRING_LITERALS (e.g. __cstring)
+ // S_LITERAL_POINTERS (e.g. objc selector pointers)
+ // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS
+ //
+ // Sections named:
+ //
+ // __TEXT,__eh_frame
+ // __TEXT,__ustring
+ // __DATA,__cfstring
+ // __DATA,__objc_classrefs
+ // __DATA,__objc_catlist
+ //
+ // FIXME: It would be better if the compiler used actual linker local
+ // symbols for each of these sections rather than preserving what
+ // are ostensibly assembler local symbols.
+ const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section);
+ return (SMO.getType() == MachO::S_CSTRING_LITERALS ||
+ SMO.getType() == MachO::S_4BYTE_LITERALS ||
+ SMO.getType() == MachO::S_8BYTE_LITERALS ||
+ SMO.getType() == MachO::S_16BYTE_LITERALS ||
+ SMO.getType() == MachO::S_LITERAL_POINTERS ||
+ (SMO.getSegmentName() == "__TEXT" &&
+ (SMO.getSectionName() == "__eh_frame" ||
+ SMO.getSectionName() == "__ustring")) ||
+ (SMO.getSegmentName() == "__DATA" &&
+ (SMO.getSectionName() == "__cfstring" ||
+ SMO.getSectionName() == "__objc_classrefs" ||
+ SMO.getSectionName() == "__objc_catlist")));
+ }
+
+ /// \brief Generate the compact unwind encoding from the CFI directives.
+ virtual uint32_t
+ generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const
+ override {
+ if (Instrs.empty())
+ return CU::UNWIND_ARM64_MODE_FRAMELESS;
+
+ bool HasFP = false;
+ unsigned StackSize = 0;
+
+ uint32_t CompactUnwindEncoding = 0;
+ for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
+ const MCCFIInstruction &Inst = Instrs[i];
+
+ switch (Inst.getOperation()) {
+ default:
+ // Cannot handle this directive: bail out.
+ return CU::UNWIND_ARM64_MODE_DWARF;
+ case MCCFIInstruction::OpDefCfa: {
+ // Defines a frame pointer.
+ assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) ==
+ ARM64::FP &&
+ "Invalid frame pointer!");
+ assert(i + 2 < e && "Insufficient CFI instructions to define a frame!");
+
+ const MCCFIInstruction &LRPush = Instrs[++i];
+ assert(LRPush.getOperation() == MCCFIInstruction::OpOffset &&
+ "Link register not pushed!");
+ const MCCFIInstruction &FPPush = Instrs[++i];
+ assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
+ "Frame pointer not pushed!");
+
+ unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true);
+ unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true);
+
+ LRReg = getXRegFromWReg(LRReg);
+ FPReg = getXRegFromWReg(FPReg);
+
+ assert(LRReg == ARM64::LR && FPReg == ARM64::FP &&
+ "Pushing invalid registers for frame!");
+
+ // Indicate that the function has a frame.
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME;
+ HasFP = true;
+ break;
+ }
+ case MCCFIInstruction::OpDefCfaOffset: {
+ assert(StackSize == 0 && "We already have the CFA offset!");
+ StackSize = std::abs(Inst.getOffset());
+ break;
+ }
+ case MCCFIInstruction::OpOffset: {
+ // Registers are saved in pairs. We expect there to be two consecutive
+ // `.cfi_offset' instructions with the appropriate registers specified.
+ unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ if (i + 1 == e)
+ return CU::UNWIND_ARM64_MODE_DWARF;
+
+ const MCCFIInstruction &Inst2 = Instrs[++i];
+ if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
+ return CU::UNWIND_ARM64_MODE_DWARF;
+ unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true);
+
+ // N.B. The encodings must be in register number order, and the X
+ // registers before the D registers.
+
+ // X19/X20 pair = 0x00000001,
+ // X21/X22 pair = 0x00000002,
+ // X23/X24 pair = 0x00000004,
+ // X25/X26 pair = 0x00000008,
+ // X27/X28 pair = 0x00000010
+ Reg1 = getXRegFromWReg(Reg1);
+ Reg2 = getXRegFromWReg(Reg2);
+
+ if (Reg1 == ARM64::X19 && Reg2 == ARM64::X20 &&
+ (CompactUnwindEncoding & 0xF1E) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR;
+ else if (Reg1 == ARM64::X21 && Reg2 == ARM64::X22 &&
+ (CompactUnwindEncoding & 0xF1C) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR;
+ else if (Reg1 == ARM64::X23 && Reg2 == ARM64::X24 &&
+ (CompactUnwindEncoding & 0xF18) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR;
+ else if (Reg1 == ARM64::X25 && Reg2 == ARM64::X26 &&
+ (CompactUnwindEncoding & 0xF10) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR;
+ else if (Reg1 == ARM64::X27 && Reg2 == ARM64::X28 &&
+ (CompactUnwindEncoding & 0xF00) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR;
+ else {
+ Reg1 = getDRegFromBReg(Reg1);
+ Reg2 = getDRegFromBReg(Reg2);
+
+ // D8/D9 pair = 0x00000100,
+ // D10/D11 pair = 0x00000200,
+ // D12/D13 pair = 0x00000400,
+ // D14/D15 pair = 0x00000800
+ if (Reg1 == ARM64::D8 && Reg2 == ARM64::D9 &&
+ (CompactUnwindEncoding & 0xE00) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR;
+ else if (Reg1 == ARM64::D10 && Reg2 == ARM64::D11 &&
+ (CompactUnwindEncoding & 0xC00) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR;
+ else if (Reg1 == ARM64::D12 && Reg2 == ARM64::D13 &&
+ (CompactUnwindEncoding & 0x800) == 0)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR;
+ else if (Reg1 == ARM64::D14 && Reg2 == ARM64::D15)
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR;
+ else
+ // A pair was pushed which we cannot handle.
+ return CU::UNWIND_ARM64_MODE_DWARF;
+ }
+
+ break;
+ }
+ }
+ }
+
+ if (!HasFP) {
+ // With compact unwind info we can only represent stack adjustments of up
+ // to 65520 bytes.
+ if (StackSize > 65520)
+ return CU::UNWIND_ARM64_MODE_DWARF;
+
+ CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS;
+ CompactUnwindEncoding |= encodeStackAdjustment(StackSize);
+ }
+
+ return CompactUnwindEncoding;
+ }
+};
+
+} // end anonymous namespace
+
+namespace {
+
+class ELFARM64AsmBackend : public ARM64AsmBackend {
+public:
+ uint8_t OSABI;
+
+ ELFARM64AsmBackend(const Target &T, uint8_t OSABI)
+ : ARM64AsmBackend(T), OSABI(OSABI) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return createARM64ELFObjectWriter(OS, OSABI);
+ }
+
+ void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override;
+};
+
+void ELFARM64AsmBackend::processFixupValue(const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFixup &Fixup,
+ const MCFragment *DF,
+ const MCValue &Target,
+ uint64_t &Value, bool &IsResolved) {
+ // The ADRP instruction adds some multiple of 0x1000 to the current PC &
+ // ~0xfff. This means that the required offset to reach a symbol can vary by
+ // up to one step depending on where the ADRP is in memory. For example:
+ //
+ // ADRP x0, there
+ // there:
+ //
+ // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and
+ // we'll need that as an offset. At any other address "there" will be in the
+ // same page as the ADRP and the instruction should encode 0x0. Assuming the
+ // section isn't 0x1000-aligned, we therefore need to delegate this decision
+ // to the linker -- a relocation!
+ if ((uint32_t)Fixup.getKind() == ARM64::fixup_arm64_pcrel_adrp_imm21)
+ IsResolved = false;
+}
+}
+
+MCAsmBackend *llvm::createARM64AsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU) {
+ Triple TheTriple(TT);
+
+ if (TheTriple.isOSDarwin())
+ return new DarwinARM64AsmBackend(T, MRI);
+
+ assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
+ return new ELFARM64AsmBackend(T, TheTriple.getOS());
+}
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h b/lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h
new file mode 100644
index 0000000..d3c2cf7
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h
@@ -0,0 +1,998 @@
+//===-- ARM64BaseInfo.h - Top level definitions for ARM64 -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the ARM64 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64BASEINFO_H
+#define ARM64BASEINFO_H
+
+#include "ARM64MCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+inline static unsigned getWRegFromXReg(unsigned Reg) {
+ switch (Reg) {
+ case ARM64::X0: return ARM64::W0;
+ case ARM64::X1: return ARM64::W1;
+ case ARM64::X2: return ARM64::W2;
+ case ARM64::X3: return ARM64::W3;
+ case ARM64::X4: return ARM64::W4;
+ case ARM64::X5: return ARM64::W5;
+ case ARM64::X6: return ARM64::W6;
+ case ARM64::X7: return ARM64::W7;
+ case ARM64::X8: return ARM64::W8;
+ case ARM64::X9: return ARM64::W9;
+ case ARM64::X10: return ARM64::W10;
+ case ARM64::X11: return ARM64::W11;
+ case ARM64::X12: return ARM64::W12;
+ case ARM64::X13: return ARM64::W13;
+ case ARM64::X14: return ARM64::W14;
+ case ARM64::X15: return ARM64::W15;
+ case ARM64::X16: return ARM64::W16;
+ case ARM64::X17: return ARM64::W17;
+ case ARM64::X18: return ARM64::W18;
+ case ARM64::X19: return ARM64::W19;
+ case ARM64::X20: return ARM64::W20;
+ case ARM64::X21: return ARM64::W21;
+ case ARM64::X22: return ARM64::W22;
+ case ARM64::X23: return ARM64::W23;
+ case ARM64::X24: return ARM64::W24;
+ case ARM64::X25: return ARM64::W25;
+ case ARM64::X26: return ARM64::W26;
+ case ARM64::X27: return ARM64::W27;
+ case ARM64::X28: return ARM64::W28;
+ case ARM64::FP: return ARM64::W29;
+ case ARM64::LR: return ARM64::W30;
+ case ARM64::SP: return ARM64::WSP;
+ case ARM64::XZR: return ARM64::WZR;
+ }
+ // For anything else, return it unchanged.
+ return Reg;
+}
+
+inline static unsigned getXRegFromWReg(unsigned Reg) {
+ switch (Reg) {
+ case ARM64::W0: return ARM64::X0;
+ case ARM64::W1: return ARM64::X1;
+ case ARM64::W2: return ARM64::X2;
+ case ARM64::W3: return ARM64::X3;
+ case ARM64::W4: return ARM64::X4;
+ case ARM64::W5: return ARM64::X5;
+ case ARM64::W6: return ARM64::X6;
+ case ARM64::W7: return ARM64::X7;
+ case ARM64::W8: return ARM64::X8;
+ case ARM64::W9: return ARM64::X9;
+ case ARM64::W10: return ARM64::X10;
+ case ARM64::W11: return ARM64::X11;
+ case ARM64::W12: return ARM64::X12;
+ case ARM64::W13: return ARM64::X13;
+ case ARM64::W14: return ARM64::X14;
+ case ARM64::W15: return ARM64::X15;
+ case ARM64::W16: return ARM64::X16;
+ case ARM64::W17: return ARM64::X17;
+ case ARM64::W18: return ARM64::X18;
+ case ARM64::W19: return ARM64::X19;
+ case ARM64::W20: return ARM64::X20;
+ case ARM64::W21: return ARM64::X21;
+ case ARM64::W22: return ARM64::X22;
+ case ARM64::W23: return ARM64::X23;
+ case ARM64::W24: return ARM64::X24;
+ case ARM64::W25: return ARM64::X25;
+ case ARM64::W26: return ARM64::X26;
+ case ARM64::W27: return ARM64::X27;
+ case ARM64::W28: return ARM64::X28;
+ case ARM64::W29: return ARM64::FP;
+ case ARM64::W30: return ARM64::LR;
+ case ARM64::WSP: return ARM64::SP;
+ case ARM64::WZR: return ARM64::XZR;
+ }
+ // For anything else, return it unchanged.
+ return Reg;
+}
+
+static inline unsigned getBRegFromDReg(unsigned Reg) {
+ switch (Reg) {
+ case ARM64::D0: return ARM64::B0;
+ case ARM64::D1: return ARM64::B1;
+ case ARM64::D2: return ARM64::B2;
+ case ARM64::D3: return ARM64::B3;
+ case ARM64::D4: return ARM64::B4;
+ case ARM64::D5: return ARM64::B5;
+ case ARM64::D6: return ARM64::B6;
+ case ARM64::D7: return ARM64::B7;
+ case ARM64::D8: return ARM64::B8;
+ case ARM64::D9: return ARM64::B9;
+ case ARM64::D10: return ARM64::B10;
+ case ARM64::D11: return ARM64::B11;
+ case ARM64::D12: return ARM64::B12;
+ case ARM64::D13: return ARM64::B13;
+ case ARM64::D14: return ARM64::B14;
+ case ARM64::D15: return ARM64::B15;
+ case ARM64::D16: return ARM64::B16;
+ case ARM64::D17: return ARM64::B17;
+ case ARM64::D18: return ARM64::B18;
+ case ARM64::D19: return ARM64::B19;
+ case ARM64::D20: return ARM64::B20;
+ case ARM64::D21: return ARM64::B21;
+ case ARM64::D22: return ARM64::B22;
+ case ARM64::D23: return ARM64::B23;
+ case ARM64::D24: return ARM64::B24;
+ case ARM64::D25: return ARM64::B25;
+ case ARM64::D26: return ARM64::B26;
+ case ARM64::D27: return ARM64::B27;
+ case ARM64::D28: return ARM64::B28;
+ case ARM64::D29: return ARM64::B29;
+ case ARM64::D30: return ARM64::B30;
+ case ARM64::D31: return ARM64::B31;
+ }
+ // For anything else, return it unchanged.
+ return Reg;
+}
+
+
+static inline unsigned getDRegFromBReg(unsigned Reg) {
+ switch (Reg) {
+ case ARM64::B0: return ARM64::D0;
+ case ARM64::B1: return ARM64::D1;
+ case ARM64::B2: return ARM64::D2;
+ case ARM64::B3: return ARM64::D3;
+ case ARM64::B4: return ARM64::D4;
+ case ARM64::B5: return ARM64::D5;
+ case ARM64::B6: return ARM64::D6;
+ case ARM64::B7: return ARM64::D7;
+ case ARM64::B8: return ARM64::D8;
+ case ARM64::B9: return ARM64::D9;
+ case ARM64::B10: return ARM64::D10;
+ case ARM64::B11: return ARM64::D11;
+ case ARM64::B12: return ARM64::D12;
+ case ARM64::B13: return ARM64::D13;
+ case ARM64::B14: return ARM64::D14;
+ case ARM64::B15: return ARM64::D15;
+ case ARM64::B16: return ARM64::D16;
+ case ARM64::B17: return ARM64::D17;
+ case ARM64::B18: return ARM64::D18;
+ case ARM64::B19: return ARM64::D19;
+ case ARM64::B20: return ARM64::D20;
+ case ARM64::B21: return ARM64::D21;
+ case ARM64::B22: return ARM64::D22;
+ case ARM64::B23: return ARM64::D23;
+ case ARM64::B24: return ARM64::D24;
+ case ARM64::B25: return ARM64::D25;
+ case ARM64::B26: return ARM64::D26;
+ case ARM64::B27: return ARM64::D27;
+ case ARM64::B28: return ARM64::D28;
+ case ARM64::B29: return ARM64::D29;
+ case ARM64::B30: return ARM64::D30;
+ case ARM64::B31: return ARM64::D31;
+ }
+ // For anything else, return it unchanged.
+ return Reg;
+}
+
+namespace ARM64CC {
+
+// The CondCodes constants map directly to the 4-bit encoding of the condition
+// field for predicated instructions.
+enum CondCode { // Meaning (integer) Meaning (floating-point)
+ EQ = 0x0, // Equal Equal
+ NE = 0x1, // Not equal Not equal, or unordered
+ CS = 0x2, // Carry set >, ==, or unordered
+ CC = 0x3, // Carry clear Less than
+ MI = 0x4, // Minus, negative Less than
+ PL = 0x5, // Plus, positive or zero >, ==, or unordered
+ VS = 0x6, // Overflow Unordered
+ VC = 0x7, // No overflow Not unordered
+ HI = 0x8, // Unsigned higher Greater than, or unordered
+ LS = 0x9, // Unsigned lower or same Less than or equal
+ GE = 0xa, // Greater than or equal Greater than or equal
+ LT = 0xb, // Less than Less than, or unordered
+ GT = 0xc, // Greater than Greater than
+ LE = 0xd, // Less than or equal <, ==, or unordered
+ AL = 0xe // Always (unconditional) Always (unconditional)
+};
+
+inline static const char *getCondCodeName(CondCode Code) {
+ // cond<0> is ignored when cond<3:1> = 111, where 1110 is 0xe (aka AL).
+ if ((Code & AL) == AL)
+ Code = AL;
+ switch (Code) {
+ case EQ: return "eq";
+ case NE: return "ne";
+ case CS: return "cs";
+ case CC: return "cc";
+ case MI: return "mi";
+ case PL: return "pl";
+ case VS: return "vs";
+ case VC: return "vc";
+ case HI: return "hi";
+ case LS: return "ls";
+ case GE: return "ge";
+ case LT: return "lt";
+ case GT: return "gt";
+ case LE: return "le";
+ case AL: return "al";
+ }
+ llvm_unreachable("Unknown condition code");
+}
+
+inline static CondCode getInvertedCondCode(CondCode Code) {
+ switch (Code) {
+ default: llvm_unreachable("Unknown condition code");
+ case EQ: return NE;
+ case NE: return EQ;
+ case CS: return CC;
+ case CC: return CS;
+ case MI: return PL;
+ case PL: return MI;
+ case VS: return VC;
+ case VC: return VS;
+ case HI: return LS;
+ case LS: return HI;
+ case GE: return LT;
+ case LT: return GE;
+ case GT: return LE;
+ case LE: return GT;
+ }
+}
+
+/// Given a condition code, return NZCV flags that would satisfy that condition.
+/// The flag bits are in the format expected by the ccmp instructions.
+/// Note that many different flag settings can satisfy a given condition code;
+/// this function just returns one of them.
+inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
+ // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7.
+ enum { N = 8, Z = 4, C = 2, V = 1 };
+ switch (Code) {
+ default: llvm_unreachable("Unknown condition code");
+ case EQ: return Z; // Z == 1
+ case NE: return 0; // Z == 0
+ case CS: return C; // C == 1
+ case CC: return 0; // C == 0
+ case MI: return N; // N == 1
+ case PL: return 0; // N == 0
+ case VS: return V; // V == 1
+ case VC: return 0; // V == 0
+ case HI: return C; // C == 1 && Z == 0
+ case LS: return 0; // C == 0 || Z == 1
+ case GE: return 0; // N == V
+ case LT: return N; // N != V
+ case GT: return 0; // Z == 0 && N == V
+ case LE: return Z; // Z == 1 || N != V
+ }
+}
+} // end namespace ARM64CC
+
+namespace ARM64SYS {
+enum BarrierOption {
+ InvalidBarrier = 0xff,
+ OSHLD = 0x1,
+ OSHST = 0x2,
+ OSH = 0x3,
+ NSHLD = 0x5,
+ NSHST = 0x6,
+ NSH = 0x7,
+ ISHLD = 0x9,
+ ISHST = 0xa,
+ ISH = 0xb,
+ LD = 0xd,
+ ST = 0xe,
+ SY = 0xf
+};
+
+inline static const char *getBarrierOptName(BarrierOption Opt) {
+ switch (Opt) {
+ default: return NULL;
+ case 0x1: return "oshld";
+ case 0x2: return "oshst";
+ case 0x3: return "osh";
+ case 0x5: return "nshld";
+ case 0x6: return "nshst";
+ case 0x7: return "nsh";
+ case 0x9: return "ishld";
+ case 0xa: return "ishst";
+ case 0xb: return "ish";
+ case 0xd: return "ld";
+ case 0xe: return "st";
+ case 0xf: return "sy";
+ }
+}
+
+#define A64_SYSREG_ENC(op0,CRn,op2,CRm,op1) ((op0) << 14 | (op1) << 11 | \
+ (CRn) << 7 | (CRm) << 3 | (op2))
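+// For example, MDCCSR_EL0 below packs (op0=2, CRn=0, op2=0, CRm=1, op1=3) as
+// (2 << 14) | (3 << 11) | (0 << 7) | (1 << 3) | 0 == 0x9808.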
+enum SystemRegister {
+ InvalidSystemReg = 0,
+ // Table in section 3.10.3
+ SPSR_EL1 = 0xc200,
+ SPSR_svc = SPSR_EL1,
+ ELR_EL1 = 0xc201,
+ SP_EL0 = 0xc208,
+ SPSel = 0xc210,
+ CurrentEL = 0xc212,
+ DAIF = 0xda11,
+ NZCV = 0xda10,
+ FPCR = 0xda20,
+ FPSR = 0xda21,
+ DSPSR = 0xda28,
+ DLR = 0xda29,
+ SPSR_EL2 = 0xe200,
+ SPSR_hyp = SPSR_EL2,
+ ELR_EL2 = 0xe201,
+ SP_EL1 = 0xe208,
+ SPSR_irq = 0xe218,
+ SPSR_abt = 0xe219,
+ SPSR_und = 0xe21a,
+ SPSR_fiq = 0xe21b,
+ SPSR_EL3 = 0xf200,
+ ELR_EL3 = 0xf201,
+ SP_EL2 = 0xf208,
+
+
+ // Table in section 3.10.8
+ MIDR_EL1 = 0xc000,
+ CTR_EL0 = 0xd801,
+ MPIDR_EL1 = 0xc005,
+ ECOIDR_EL1 = 0xc006,
+ DCZID_EL0 = 0xd807,
+ MVFR0_EL1 = 0xc018,
+ MVFR1_EL1 = 0xc019,
+ ID_AA64PFR0_EL1 = 0xc020,
+ ID_AA64PFR1_EL1 = 0xc021,
+ ID_AA64DFR0_EL1 = 0xc028,
+ ID_AA64DFR1_EL1 = 0xc029,
+ ID_AA64ISAR0_EL1 = 0xc030,
+ ID_AA64ISAR1_EL1 = 0xc031,
+ ID_AA64MMFR0_EL1 = 0xc038,
+ ID_AA64MMFR1_EL1 = 0xc039,
+ CCSIDR_EL1 = 0xc800,
+ CLIDR_EL1 = 0xc801,
+ AIDR_EL1 = 0xc807,
+ CSSELR_EL1 = 0xd000,
+ VPIDR_EL2 = 0xe000,
+ VMPIDR_EL2 = 0xe005,
+ SCTLR_EL1 = 0xc080,
+ SCTLR_EL2 = 0xe080,
+ SCTLR_EL3 = 0xf080,
+ ACTLR_EL1 = 0xc081,
+ ACTLR_EL2 = 0xe081,
+ ACTLR_EL3 = 0xf081,
+ CPACR_EL1 = 0xc082,
+ CPTR_EL2 = 0xe08a,
+ CPTR_EL3 = 0xf08a,
+ SCR_EL3 = 0xf088,
+ HCR_EL2 = 0xe088,
+ MDCR_EL2 = 0xe089,
+ MDCR_EL3 = 0xf099,
+ HSTR_EL2 = 0xe08b,
+ HACR_EL2 = 0xe08f,
+ TTBR0_EL1 = 0xc100,
+ TTBR1_EL1 = 0xc101,
+ TTBR0_EL2 = 0xe100,
+ TTBR0_EL3 = 0xf100,
+ VTTBR_EL2 = 0xe108,
+ TCR_EL1 = 0xc102,
+ TCR_EL2 = 0xe102,
+ TCR_EL3 = 0xf102,
+ VTCR_EL2 = 0xe10a,
+ ADFSR_EL1 = 0xc288,
+ AIFSR_EL1 = 0xc289,
+ ADFSR_EL2 = 0xe288,
+ AIFSR_EL2 = 0xe289,
+ ADFSR_EL3 = 0xf288,
+ AIFSR_EL3 = 0xf289,
+ ESR_EL1 = 0xc290,
+ ESR_EL2 = 0xe290,
+ ESR_EL3 = 0xf290,
+ FAR_EL1 = 0xc300,
+ FAR_EL2 = 0xe300,
+ FAR_EL3 = 0xf300,
+ HPFAR_EL2 = 0xe304,
+ PAR_EL1 = 0xc3a0,
+ MAIR_EL1 = 0xc510,
+ MAIR_EL2 = 0xe510,
+ MAIR_EL3 = 0xf510,
+ AMAIR_EL1 = 0xc518,
+ AMAIR_EL2 = 0xe518,
+ AMAIR_EL3 = 0xf518,
+ VBAR_EL1 = 0xc600,
+ VBAR_EL2 = 0xe600,
+ VBAR_EL3 = 0xf600,
+ RVBAR_EL1 = 0xc601,
+ RVBAR_EL2 = 0xe601,
+ RVBAR_EL3 = 0xf601,
+ ISR_EL1 = 0xc608,
+ CONTEXTIDR_EL1 = 0xc681,
+ TPIDR_EL0 = 0xde82,
+ TPIDRRO_EL0 = 0xde83,
+ TPIDR_EL1 = 0xc684,
+ TPIDR_EL2 = 0xe682,
+ TPIDR_EL3 = 0xf682,
+ TEECR32_EL1 = 0x9000,
+ CNTFRQ_EL0 = 0xdf00,
+ CNTPCT_EL0 = 0xdf01,
+ CNTVCT_EL0 = 0xdf02,
+ CNTVOFF_EL2 = 0xe703,
+ CNTKCTL_EL1 = 0xc708,
+ CNTHCTL_EL2 = 0xe708,
+ CNTP_TVAL_EL0 = 0xdf10,
+ CNTP_CTL_EL0 = 0xdf11,
+ CNTP_CVAL_EL0 = 0xdf12,
+ CNTV_TVAL_EL0 = 0xdf18,
+ CNTV_CTL_EL0 = 0xdf19,
+ CNTV_CVAL_EL0 = 0xdf1a,
+ CNTHP_TVAL_EL2 = 0xe710,
+ CNTHP_CTL_EL2 = 0xe711,
+ CNTHP_CVAL_EL2 = 0xe712,
+ CNTPS_TVAL_EL1 = 0xff10,
+ CNTPS_CTL_EL1 = 0xff11,
+ CNTPS_CVAL_EL1= 0xff12,
+
+ PMEVCNTR0_EL0 = 0xdf40,
+ PMEVCNTR1_EL0 = 0xdf41,
+ PMEVCNTR2_EL0 = 0xdf42,
+ PMEVCNTR3_EL0 = 0xdf43,
+ PMEVCNTR4_EL0 = 0xdf44,
+ PMEVCNTR5_EL0 = 0xdf45,
+ PMEVCNTR6_EL0 = 0xdf46,
+ PMEVCNTR7_EL0 = 0xdf47,
+ PMEVCNTR8_EL0 = 0xdf48,
+ PMEVCNTR9_EL0 = 0xdf49,
+ PMEVCNTR10_EL0 = 0xdf4a,
+ PMEVCNTR11_EL0 = 0xdf4b,
+ PMEVCNTR12_EL0 = 0xdf4c,
+ PMEVCNTR13_EL0 = 0xdf4d,
+ PMEVCNTR14_EL0 = 0xdf4e,
+ PMEVCNTR15_EL0 = 0xdf4f,
+ PMEVCNTR16_EL0 = 0xdf50,
+ PMEVCNTR17_EL0 = 0xdf51,
+ PMEVCNTR18_EL0 = 0xdf52,
+ PMEVCNTR19_EL0 = 0xdf53,
+ PMEVCNTR20_EL0 = 0xdf54,
+ PMEVCNTR21_EL0 = 0xdf55,
+ PMEVCNTR22_EL0 = 0xdf56,
+ PMEVCNTR23_EL0 = 0xdf57,
+ PMEVCNTR24_EL0 = 0xdf58,
+ PMEVCNTR25_EL0 = 0xdf59,
+ PMEVCNTR26_EL0 = 0xdf5a,
+ PMEVCNTR27_EL0 = 0xdf5b,
+ PMEVCNTR28_EL0 = 0xdf5c,
+ PMEVCNTR29_EL0 = 0xdf5d,
+ PMEVCNTR30_EL0 = 0xdf5e,
+
+ PMEVTYPER0_EL0 = 0xdf60,
+ PMEVTYPER1_EL0 = 0xdf61,
+ PMEVTYPER2_EL0 = 0xdf62,
+ PMEVTYPER3_EL0 = 0xdf63,
+ PMEVTYPER4_EL0 = 0xdf64,
+ PMEVTYPER5_EL0 = 0xdf65,
+ PMEVTYPER6_EL0 = 0xdf66,
+ PMEVTYPER7_EL0 = 0xdf67,
+ PMEVTYPER8_EL0 = 0xdf68,
+ PMEVTYPER9_EL0 = 0xdf69,
+ PMEVTYPER10_EL0 = 0xdf6a,
+ PMEVTYPER11_EL0 = 0xdf6b,
+ PMEVTYPER12_EL0 = 0xdf6c,
+ PMEVTYPER13_EL0 = 0xdf6d,
+ PMEVTYPER14_EL0 = 0xdf6e,
+ PMEVTYPER15_EL0 = 0xdf6f,
+ PMEVTYPER16_EL0 = 0xdf70,
+ PMEVTYPER17_EL0 = 0xdf71,
+ PMEVTYPER18_EL0 = 0xdf72,
+ PMEVTYPER19_EL0 = 0xdf73,
+ PMEVTYPER20_EL0 = 0xdf74,
+ PMEVTYPER21_EL0 = 0xdf75,
+ PMEVTYPER22_EL0 = 0xdf76,
+ PMEVTYPER23_EL0 = 0xdf77,
+ PMEVTYPER24_EL0 = 0xdf78,
+ PMEVTYPER25_EL0 = 0xdf79,
+ PMEVTYPER26_EL0 = 0xdf7a,
+ PMEVTYPER27_EL0 = 0xdf7b,
+ PMEVTYPER28_EL0 = 0xdf7c,
+ PMEVTYPER29_EL0 = 0xdf7d,
+ PMEVTYPER30_EL0 = 0xdf7e,
+
+ PMCCFILTR_EL0 = 0xdf7f,
+
+ RMR_EL3 = 0xf602,
+ RMR_EL2 = 0xd602,
+ RMR_EL1 = 0xce02,
+
+ // Debug Architecture 5.3, Table 17.
+ MDCCSR_EL0 = A64_SYSREG_ENC(2, 0, 0, 1, 3),
+ MDCCINT_EL1 = A64_SYSREG_ENC(2, 0, 0, 2, 0),
+ DBGDTR_EL0 = A64_SYSREG_ENC(2, 0, 0, 4, 3),
+ DBGDTRRX_EL0 = A64_SYSREG_ENC(2, 0, 0, 5, 3),
+ DBGDTRTX_EL0 = DBGDTRRX_EL0,
+ DBGVCR32_EL2 = A64_SYSREG_ENC(2, 0, 0, 7, 4),
+ OSDTRRX_EL1 = A64_SYSREG_ENC(2, 0, 2, 0, 0),
+ MDSCR_EL1 = A64_SYSREG_ENC(2, 0, 2, 2, 0),
+ OSDTRTX_EL1 = A64_SYSREG_ENC(2, 0, 2, 3, 0),
+ OSECCR_EL11 = A64_SYSREG_ENC(2, 0, 2, 6, 0),
+
+ DBGBVR0_EL1 = A64_SYSREG_ENC(2, 0, 4, 0, 0),
+ DBGBVR1_EL1 = A64_SYSREG_ENC(2, 0, 4, 1, 0),
+ DBGBVR2_EL1 = A64_SYSREG_ENC(2, 0, 4, 2, 0),
+ DBGBVR3_EL1 = A64_SYSREG_ENC(2, 0, 4, 3, 0),
+ DBGBVR4_EL1 = A64_SYSREG_ENC(2, 0, 4, 4, 0),
+ DBGBVR5_EL1 = A64_SYSREG_ENC(2, 0, 4, 5, 0),
+ DBGBVR6_EL1 = A64_SYSREG_ENC(2, 0, 4, 6, 0),
+ DBGBVR7_EL1 = A64_SYSREG_ENC(2, 0, 4, 7, 0),
+ DBGBVR8_EL1 = A64_SYSREG_ENC(2, 0, 4, 8, 0),
+ DBGBVR9_EL1 = A64_SYSREG_ENC(2, 0, 4, 9, 0),
+ DBGBVR10_EL1 = A64_SYSREG_ENC(2, 0, 4, 10, 0),
+ DBGBVR11_EL1 = A64_SYSREG_ENC(2, 0, 4, 11, 0),
+ DBGBVR12_EL1 = A64_SYSREG_ENC(2, 0, 4, 12, 0),
+ DBGBVR13_EL1 = A64_SYSREG_ENC(2, 0, 4, 13, 0),
+ DBGBVR14_EL1 = A64_SYSREG_ENC(2, 0, 4, 14, 0),
+ DBGBVR15_EL1 = A64_SYSREG_ENC(2, 0, 4, 15, 0),
+
+ DBGBCR0_EL1 = A64_SYSREG_ENC(2, 0, 5, 0, 0),
+ DBGBCR1_EL1 = A64_SYSREG_ENC(2, 0, 5, 1, 0),
+ DBGBCR2_EL1 = A64_SYSREG_ENC(2, 0, 5, 2, 0),
+ DBGBCR3_EL1 = A64_SYSREG_ENC(2, 0, 5, 3, 0),
+ DBGBCR4_EL1 = A64_SYSREG_ENC(2, 0, 5, 4, 0),
+ DBGBCR5_EL1 = A64_SYSREG_ENC(2, 0, 5, 5, 0),
+ DBGBCR6_EL1 = A64_SYSREG_ENC(2, 0, 5, 6, 0),
+ DBGBCR7_EL1 = A64_SYSREG_ENC(2, 0, 5, 7, 0),
+ DBGBCR8_EL1 = A64_SYSREG_ENC(2, 0, 5, 8, 0),
+ DBGBCR9_EL1 = A64_SYSREG_ENC(2, 0, 5, 9, 0),
+ DBGBCR10_EL1 = A64_SYSREG_ENC(2, 0, 5, 10, 0),
+ DBGBCR11_EL1 = A64_SYSREG_ENC(2, 0, 5, 11, 0),
+ DBGBCR12_EL1 = A64_SYSREG_ENC(2, 0, 5, 12, 0),
+ DBGBCR13_EL1 = A64_SYSREG_ENC(2, 0, 5, 13, 0),
+ DBGBCR14_EL1 = A64_SYSREG_ENC(2, 0, 5, 14, 0),
+ DBGBCR15_EL1 = A64_SYSREG_ENC(2, 0, 5, 15, 0),
+
+ DBGWVR0_EL1 = A64_SYSREG_ENC(2, 0, 6, 0, 0),
+ DBGWVR1_EL1 = A64_SYSREG_ENC(2, 0, 6, 1, 0),
+ DBGWVR2_EL1 = A64_SYSREG_ENC(2, 0, 6, 2, 0),
+ DBGWVR3_EL1 = A64_SYSREG_ENC(2, 0, 6, 3, 0),
+ DBGWVR4_EL1 = A64_SYSREG_ENC(2, 0, 6, 4, 0),
+ DBGWVR5_EL1 = A64_SYSREG_ENC(2, 0, 6, 5, 0),
+ DBGWVR6_EL1 = A64_SYSREG_ENC(2, 0, 6, 6, 0),
+ DBGWVR7_EL1 = A64_SYSREG_ENC(2, 0, 6, 7, 0),
+ DBGWVR8_EL1 = A64_SYSREG_ENC(2, 0, 6, 8, 0),
+ DBGWVR9_EL1 = A64_SYSREG_ENC(2, 0, 6, 9, 0),
+ DBGWVR10_EL1 = A64_SYSREG_ENC(2, 0, 6, 10, 0),
+ DBGWVR11_EL1 = A64_SYSREG_ENC(2, 0, 6, 11, 0),
+ DBGWVR12_EL1 = A64_SYSREG_ENC(2, 0, 6, 12, 0),
+ DBGWVR13_EL1 = A64_SYSREG_ENC(2, 0, 6, 13, 0),
+ DBGWVR14_EL1 = A64_SYSREG_ENC(2, 0, 6, 14, 0),
+ DBGWVR15_EL1 = A64_SYSREG_ENC(2, 0, 6, 15, 0),
+
+ DBGWCR0_EL1 = A64_SYSREG_ENC(2, 0, 7, 0, 0),
+ DBGWCR1_EL1 = A64_SYSREG_ENC(2, 0, 7, 1, 0),
+ DBGWCR2_EL1 = A64_SYSREG_ENC(2, 0, 7, 2, 0),
+ DBGWCR3_EL1 = A64_SYSREG_ENC(2, 0, 7, 3, 0),
+ DBGWCR4_EL1 = A64_SYSREG_ENC(2, 0, 7, 4, 0),
+ DBGWCR5_EL1 = A64_SYSREG_ENC(2, 0, 7, 5, 0),
+ DBGWCR6_EL1 = A64_SYSREG_ENC(2, 0, 7, 6, 0),
+ DBGWCR7_EL1 = A64_SYSREG_ENC(2, 0, 7, 7, 0),
+ DBGWCR8_EL1 = A64_SYSREG_ENC(2, 0, 7, 8, 0),
+ DBGWCR9_EL1 = A64_SYSREG_ENC(2, 0, 7, 9, 0),
+ DBGWCR10_EL1 = A64_SYSREG_ENC(2, 0, 7, 10, 0),
+ DBGWCR11_EL1 = A64_SYSREG_ENC(2, 0, 7, 11, 0),
+ DBGWCR12_EL1 = A64_SYSREG_ENC(2, 0, 7, 12, 0),
+ DBGWCR13_EL1 = A64_SYSREG_ENC(2, 0, 7, 13, 0),
+ DBGWCR14_EL1 = A64_SYSREG_ENC(2, 0, 7, 14, 0),
+ DBGWCR15_EL1 = A64_SYSREG_ENC(2, 0, 7, 15, 0),
+
+ MDRAR_EL1 = A64_SYSREG_ENC(2, 1, 0, 0, 0),
+ OSLAR_EL1 = A64_SYSREG_ENC(2, 1, 4, 0, 0),
+ OSLSR_EL1 = A64_SYSREG_ENC(2, 1, 4, 1, 0),
+ OSDLR_EL1 = A64_SYSREG_ENC(2, 1, 4, 3, 0),
+ DBGPRCR_EL1 = A64_SYSREG_ENC(2, 1, 4, 4, 0),
+
+ DBGCLAIMSET_EL1 = A64_SYSREG_ENC(2, 7, 6, 8, 0),
+ DBGCLAIMCLR_EL1 = A64_SYSREG_ENC(2, 7, 6, 9, 0),
+ DBGAUTHSTATUS_EL1 = A64_SYSREG_ENC(2, 7, 6, 14, 0),
+
+ DBGDEVID2 = A64_SYSREG_ENC(2, 7, 7, 0, 0),
+ DBGDEVID1 = A64_SYSREG_ENC(2, 7, 7, 1, 0),
+ DBGDEVID0 = A64_SYSREG_ENC(2, 7, 7, 2, 0),
+
+ // The following registers are defined to allow access from AArch64 to
+ // registers which are only used in the AArch32 architecture.
+ DACR32_EL2 = 0xe180,
+ IFSR32_EL2 = 0xe281,
+ TEEHBR32_EL1 = 0x9080,
+ SDER32_EL3 = 0xf089,
+ FPEXC32_EL2 = 0xe298,
+
+ // Cyclone specific system registers
+ CPM_IOACC_CTL_EL3 = 0xff90,
+
+ // Architectural system registers
+ ID_PFR0_EL1 = 0xc008,
+ ID_PFR1_EL1 = 0xc009,
+ ID_DFR0_EL1 = 0xc00a,
+ ID_AFR0_EL1 = 0xc00b,
+ ID_ISAR0_EL1 = 0xc010,
+ ID_ISAR1_EL1 = 0xc011,
+ ID_ISAR2_EL1 = 0xc012,
+ ID_ISAR3_EL1 = 0xc013,
+ ID_ISAR4_EL1 = 0xc014,
+ ID_ISAR5_EL1 = 0xc015,
+ AFSR1_EL1 = 0xc289, // note same as old AIFSR_EL1
+ AFSR0_EL1 = 0xc288, // note same as old ADFSR_EL1
+ REVIDR_EL1 = 0xc006 // note same as old ECOIDR_EL1
+
+};
+#undef A64_SYSREG_ENC
+
+static inline const char *getSystemRegisterName(SystemRegister Reg) {
+ switch(Reg) {
+ default: return NULL; // Caller is responsible for handling invalid value.
+ case SPSR_EL1: return "SPSR_EL1";
+ case ELR_EL1: return "ELR_EL1";
+ case SP_EL0: return "SP_EL0";
+ case SPSel: return "SPSel";
+ case DAIF: return "DAIF";
+ case CurrentEL: return "CurrentEL";
+ case NZCV: return "NZCV";
+ case FPCR: return "FPCR";
+ case FPSR: return "FPSR";
+ case DSPSR: return "DSPSR";
+ case DLR: return "DLR";
+ case SPSR_EL2: return "SPSR_EL2";
+ case ELR_EL2: return "ELR_EL2";
+ case SP_EL1: return "SP_EL1";
+ case SPSR_irq: return "SPSR_irq";
+ case SPSR_abt: return "SPSR_abt";
+ case SPSR_und: return "SPSR_und";
+ case SPSR_fiq: return "SPSR_fiq";
+ case SPSR_EL3: return "SPSR_EL3";
+ case ELR_EL3: return "ELR_EL3";
+ case SP_EL2: return "SP_EL2";
+ case MIDR_EL1: return "MIDR_EL1";
+ case CTR_EL0: return "CTR_EL0";
+ case MPIDR_EL1: return "MPIDR_EL1";
+ case DCZID_EL0: return "DCZID_EL0";
+ case MVFR0_EL1: return "MVFR0_EL1";
+ case MVFR1_EL1: return "MVFR1_EL1";
+ case ID_AA64PFR0_EL1: return "ID_AA64PFR0_EL1";
+ case ID_AA64PFR1_EL1: return "ID_AA64PFR1_EL1";
+ case ID_AA64DFR0_EL1: return "ID_AA64DFR0_EL1";
+ case ID_AA64DFR1_EL1: return "ID_AA64DFR1_EL1";
+ case ID_AA64ISAR0_EL1: return "ID_AA64ISAR0_EL1";
+ case ID_AA64ISAR1_EL1: return "ID_AA64ISAR1_EL1";
+ case ID_AA64MMFR0_EL1: return "ID_AA64MMFR0_EL1";
+ case ID_AA64MMFR1_EL1: return "ID_AA64MMFR1_EL1";
+ case CCSIDR_EL1: return "CCSIDR_EL1";
+ case CLIDR_EL1: return "CLIDR_EL1";
+ case AIDR_EL1: return "AIDR_EL1";
+ case CSSELR_EL1: return "CSSELR_EL1";
+ case VPIDR_EL2: return "VPIDR_EL2";
+ case VMPIDR_EL2: return "VMPIDR_EL2";
+ case SCTLR_EL1: return "SCTLR_EL1";
+ case SCTLR_EL2: return "SCTLR_EL2";
+ case SCTLR_EL3: return "SCTLR_EL3";
+ case ACTLR_EL1: return "ACTLR_EL1";
+ case ACTLR_EL2: return "ACTLR_EL2";
+ case ACTLR_EL3: return "ACTLR_EL3";
+ case CPACR_EL1: return "CPACR_EL1";
+ case CPTR_EL2: return "CPTR_EL2";
+ case CPTR_EL3: return "CPTR_EL3";
+ case SCR_EL3: return "SCR_EL3";
+ case HCR_EL2: return "HCR_EL2";
+ case MDCR_EL2: return "MDCR_EL2";
+ case MDCR_EL3: return "MDCR_EL3";
+ case HSTR_EL2: return "HSTR_EL2";
+ case HACR_EL2: return "HACR_EL2";
+ case TTBR0_EL1: return "TTBR0_EL1";
+ case TTBR1_EL1: return "TTBR1_EL1";
+ case TTBR0_EL2: return "TTBR0_EL2";
+ case TTBR0_EL3: return "TTBR0_EL3";
+ case VTTBR_EL2: return "VTTBR_EL2";
+ case TCR_EL1: return "TCR_EL1";
+ case TCR_EL2: return "TCR_EL2";
+ case TCR_EL3: return "TCR_EL3";
+ case VTCR_EL2: return "VTCR_EL2";
+ case ADFSR_EL2: return "ADFSR_EL2";
+ case AIFSR_EL2: return "AIFSR_EL2";
+ case ADFSR_EL3: return "ADFSR_EL3";
+ case AIFSR_EL3: return "AIFSR_EL3";
+ case ESR_EL1: return "ESR_EL1";
+ case ESR_EL2: return "ESR_EL2";
+ case ESR_EL3: return "ESR_EL3";
+ case FAR_EL1: return "FAR_EL1";
+ case FAR_EL2: return "FAR_EL2";
+ case FAR_EL3: return "FAR_EL3";
+ case HPFAR_EL2: return "HPFAR_EL2";
+ case PAR_EL1: return "PAR_EL1";
+ case MAIR_EL1: return "MAIR_EL1";
+ case MAIR_EL2: return "MAIR_EL2";
+ case MAIR_EL3: return "MAIR_EL3";
+ case AMAIR_EL1: return "AMAIR_EL1";
+ case AMAIR_EL2: return "AMAIR_EL2";
+ case AMAIR_EL3: return "AMAIR_EL3";
+ case VBAR_EL1: return "VBAR_EL1";
+ case VBAR_EL2: return "VBAR_EL2";
+ case VBAR_EL3: return "VBAR_EL3";
+ case RVBAR_EL1: return "RVBAR_EL1";
+ case RVBAR_EL2: return "RVBAR_EL2";
+ case RVBAR_EL3: return "RVBAR_EL3";
+ case ISR_EL1: return "ISR_EL1";
+ case CONTEXTIDR_EL1: return "CONTEXTIDR_EL1";
+ case TPIDR_EL0: return "TPIDR_EL0";
+ case TPIDRRO_EL0: return "TPIDRRO_EL0";
+ case TPIDR_EL1: return "TPIDR_EL1";
+ case TPIDR_EL2: return "TPIDR_EL2";
+ case TPIDR_EL3: return "TPIDR_EL3";
+ case TEECR32_EL1: return "TEECR32_EL1";
+ case CNTFRQ_EL0: return "CNTFRQ_EL0";
+ case CNTPCT_EL0: return "CNTPCT_EL0";
+ case CNTVCT_EL0: return "CNTVCT_EL0";
+ case CNTVOFF_EL2: return "CNTVOFF_EL2";
+ case CNTKCTL_EL1: return "CNTKCTL_EL1";
+ case CNTHCTL_EL2: return "CNTHCTL_EL2";
+ case CNTP_TVAL_EL0: return "CNTP_TVAL_EL0";
+ case CNTP_CTL_EL0: return "CNTP_CTL_EL0";
+ case CNTP_CVAL_EL0: return "CNTP_CVAL_EL0";
+ case CNTV_TVAL_EL0: return "CNTV_TVAL_EL0";
+ case CNTV_CTL_EL0: return "CNTV_CTL_EL0";
+ case CNTV_CVAL_EL0: return "CNTV_CVAL_EL0";
+ case CNTHP_TVAL_EL2: return "CNTHP_TVAL_EL2";
+ case CNTHP_CTL_EL2: return "CNTHP_CTL_EL2";
+ case CNTHP_CVAL_EL2: return "CNTHP_CVAL_EL2";
+ case CNTPS_TVAL_EL1: return "CNTPS_TVAL_EL1";
+ case CNTPS_CTL_EL1: return "CNTPS_CTL_EL1";
+ case CNTPS_CVAL_EL1: return "CNTPS_CVAL_EL1";
+ case DACR32_EL2: return "DACR32_EL2";
+ case IFSR32_EL2: return "IFSR32_EL2";
+ case TEEHBR32_EL1: return "TEEHBR32_EL1";
+ case SDER32_EL3: return "SDER32_EL3";
+ case FPEXC32_EL2: return "FPEXC32_EL2";
+ case PMEVCNTR0_EL0: return "PMEVCNTR0_EL0";
+ case PMEVCNTR1_EL0: return "PMEVCNTR1_EL0";
+ case PMEVCNTR2_EL0: return "PMEVCNTR2_EL0";
+ case PMEVCNTR3_EL0: return "PMEVCNTR3_EL0";
+ case PMEVCNTR4_EL0: return "PMEVCNTR4_EL0";
+ case PMEVCNTR5_EL0: return "PMEVCNTR5_EL0";
+ case PMEVCNTR6_EL0: return "PMEVCNTR6_EL0";
+ case PMEVCNTR7_EL0: return "PMEVCNTR7_EL0";
+ case PMEVCNTR8_EL0: return "PMEVCNTR8_EL0";
+ case PMEVCNTR9_EL0: return "PMEVCNTR9_EL0";
+ case PMEVCNTR10_EL0: return "PMEVCNTR10_EL0";
+ case PMEVCNTR11_EL0: return "PMEVCNTR11_EL0";
+ case PMEVCNTR12_EL0: return "PMEVCNTR12_EL0";
+ case PMEVCNTR13_EL0: return "PMEVCNTR13_EL0";
+ case PMEVCNTR14_EL0: return "PMEVCNTR14_EL0";
+ case PMEVCNTR15_EL0: return "PMEVCNTR15_EL0";
+ case PMEVCNTR16_EL0: return "PMEVCNTR16_EL0";
+ case PMEVCNTR17_EL0: return "PMEVCNTR17_EL0";
+ case PMEVCNTR18_EL0: return "PMEVCNTR18_EL0";
+ case PMEVCNTR19_EL0: return "PMEVCNTR19_EL0";
+ case PMEVCNTR20_EL0: return "PMEVCNTR20_EL0";
+ case PMEVCNTR21_EL0: return "PMEVCNTR21_EL0";
+ case PMEVCNTR22_EL0: return "PMEVCNTR22_EL0";
+ case PMEVCNTR23_EL0: return "PMEVCNTR23_EL0";
+ case PMEVCNTR24_EL0: return "PMEVCNTR24_EL0";
+ case PMEVCNTR25_EL0: return "PMEVCNTR25_EL0";
+ case PMEVCNTR26_EL0: return "PMEVCNTR26_EL0";
+ case PMEVCNTR27_EL0: return "PMEVCNTR27_EL0";
+ case PMEVCNTR28_EL0: return "PMEVCNTR28_EL0";
+ case PMEVCNTR29_EL0: return "PMEVCNTR29_EL0";
+ case PMEVCNTR30_EL0: return "PMEVCNTR30_EL0";
+ case PMEVTYPER0_EL0: return "PMEVTYPER0_EL0";
+ case PMEVTYPER1_EL0: return "PMEVTYPER1_EL0";
+ case PMEVTYPER2_EL0: return "PMEVTYPER2_EL0";
+ case PMEVTYPER3_EL0: return "PMEVTYPER3_EL0";
+ case PMEVTYPER4_EL0: return "PMEVTYPER4_EL0";
+ case PMEVTYPER5_EL0: return "PMEVTYPER5_EL0";
+ case PMEVTYPER6_EL0: return "PMEVTYPER6_EL0";
+ case PMEVTYPER7_EL0: return "PMEVTYPER7_EL0";
+ case PMEVTYPER8_EL0: return "PMEVTYPER8_EL0";
+ case PMEVTYPER9_EL0: return "PMEVTYPER9_EL0";
+ case PMEVTYPER10_EL0: return "PMEVTYPER10_EL0";
+ case PMEVTYPER11_EL0: return "PMEVTYPER11_EL0";
+ case PMEVTYPER12_EL0: return "PMEVTYPER12_EL0";
+ case PMEVTYPER13_EL0: return "PMEVTYPER13_EL0";
+ case PMEVTYPER14_EL0: return "PMEVTYPER14_EL0";
+ case PMEVTYPER15_EL0: return "PMEVTYPER15_EL0";
+ case PMEVTYPER16_EL0: return "PMEVTYPER16_EL0";
+ case PMEVTYPER17_EL0: return "PMEVTYPER17_EL0";
+ case PMEVTYPER18_EL0: return "PMEVTYPER18_EL0";
+ case PMEVTYPER19_EL0: return "PMEVTYPER19_EL0";
+ case PMEVTYPER20_EL0: return "PMEVTYPER20_EL0";
+ case PMEVTYPER21_EL0: return "PMEVTYPER21_EL0";
+ case PMEVTYPER22_EL0: return "PMEVTYPER22_EL0";
+ case PMEVTYPER23_EL0: return "PMEVTYPER23_EL0";
+ case PMEVTYPER24_EL0: return "PMEVTYPER24_EL0";
+ case PMEVTYPER25_EL0: return "PMEVTYPER25_EL0";
+ case PMEVTYPER26_EL0: return "PMEVTYPER26_EL0";
+ case PMEVTYPER27_EL0: return "PMEVTYPER27_EL0";
+ case PMEVTYPER28_EL0: return "PMEVTYPER28_EL0";
+ case PMEVTYPER29_EL0: return "PMEVTYPER29_EL0";
+ case PMEVTYPER30_EL0: return "PMEVTYPER30_EL0";
+ case PMCCFILTR_EL0: return "PMCCFILTR_EL0";
+ case RMR_EL3: return "RMR_EL3";
+ case RMR_EL2: return "RMR_EL2";
+ case RMR_EL1: return "RMR_EL1";
+ case CPM_IOACC_CTL_EL3: return "CPM_IOACC_CTL_EL3";
+ case MDCCSR_EL0: return "MDCCSR_EL0";
+ case MDCCINT_EL1: return "MDCCINT_EL1";
+ case DBGDTR_EL0: return "DBGDTR_EL0";
+ case DBGDTRRX_EL0: return "DBGDTRRX_EL0";
+ case DBGVCR32_EL2: return "DBGVCR32_EL2";
+ case OSDTRRX_EL1: return "OSDTRRX_EL1";
+ case MDSCR_EL1: return "MDSCR_EL1";
+ case OSDTRTX_EL1: return "OSDTRTX_EL1";
+ case OSECCR_EL11: return "OSECCR_EL11";
+ case DBGBVR0_EL1: return "DBGBVR0_EL1";
+ case DBGBVR1_EL1: return "DBGBVR1_EL1";
+ case DBGBVR2_EL1: return "DBGBVR2_EL1";
+ case DBGBVR3_EL1: return "DBGBVR3_EL1";
+ case DBGBVR4_EL1: return "DBGBVR4_EL1";
+ case DBGBVR5_EL1: return "DBGBVR5_EL1";
+ case DBGBVR6_EL1: return "DBGBVR6_EL1";
+ case DBGBVR7_EL1: return "DBGBVR7_EL1";
+ case DBGBVR8_EL1: return "DBGBVR8_EL1";
+ case DBGBVR9_EL1: return "DBGBVR9_EL1";
+ case DBGBVR10_EL1: return "DBGBVR10_EL1";
+ case DBGBVR11_EL1: return "DBGBVR11_EL1";
+ case DBGBVR12_EL1: return "DBGBVR12_EL1";
+ case DBGBVR13_EL1: return "DBGBVR13_EL1";
+ case DBGBVR14_EL1: return "DBGBVR14_EL1";
+ case DBGBVR15_EL1: return "DBGBVR15_EL1";
+ case DBGBCR0_EL1: return "DBGBCR0_EL1";
+ case DBGBCR1_EL1: return "DBGBCR1_EL1";
+ case DBGBCR2_EL1: return "DBGBCR2_EL1";
+ case DBGBCR3_EL1: return "DBGBCR3_EL1";
+ case DBGBCR4_EL1: return "DBGBCR4_EL1";
+ case DBGBCR5_EL1: return "DBGBCR5_EL1";
+ case DBGBCR6_EL1: return "DBGBCR6_EL1";
+ case DBGBCR7_EL1: return "DBGBCR7_EL1";
+ case DBGBCR8_EL1: return "DBGBCR8_EL1";
+ case DBGBCR9_EL1: return "DBGBCR9_EL1";
+ case DBGBCR10_EL1: return "DBGBCR10_EL1";
+ case DBGBCR11_EL1: return "DBGBCR11_EL1";
+ case DBGBCR12_EL1: return "DBGBCR12_EL1";
+ case DBGBCR13_EL1: return "DBGBCR13_EL1";
+ case DBGBCR14_EL1: return "DBGBCR14_EL1";
+ case DBGBCR15_EL1: return "DBGBCR15_EL1";
+ case DBGWVR0_EL1: return "DBGWVR0_EL1";
+ case DBGWVR1_EL1: return "DBGWVR1_EL1";
+ case DBGWVR2_EL1: return "DBGWVR2_EL1";
+ case DBGWVR3_EL1: return "DBGWVR3_EL1";
+ case DBGWVR4_EL1: return "DBGWVR4_EL1";
+ case DBGWVR5_EL1: return "DBGWVR5_EL1";
+ case DBGWVR6_EL1: return "DBGWVR6_EL1";
+ case DBGWVR7_EL1: return "DBGWVR7_EL1";
+ case DBGWVR8_EL1: return "DBGWVR8_EL1";
+ case DBGWVR9_EL1: return "DBGWVR9_EL1";
+ case DBGWVR10_EL1: return "DBGWVR10_EL1";
+ case DBGWVR11_EL1: return "DBGWVR11_EL1";
+ case DBGWVR12_EL1: return "DBGWVR12_EL1";
+ case DBGWVR13_EL1: return "DBGWVR13_EL1";
+ case DBGWVR14_EL1: return "DBGWVR14_EL1";
+ case DBGWVR15_EL1: return "DBGWVR15_EL1";
+ case DBGWCR0_EL1: return "DBGWCR0_EL1";
+ case DBGWCR1_EL1: return "DBGWCR1_EL1";
+ case DBGWCR2_EL1: return "DBGWCR2_EL1";
+ case DBGWCR3_EL1: return "DBGWCR3_EL1";
+ case DBGWCR4_EL1: return "DBGWCR4_EL1";
+ case DBGWCR5_EL1: return "DBGWCR5_EL1";
+ case DBGWCR6_EL1: return "DBGWCR6_EL1";
+ case DBGWCR7_EL1: return "DBGWCR7_EL1";
+ case DBGWCR8_EL1: return "DBGWCR8_EL1";
+ case DBGWCR9_EL1: return "DBGWCR9_EL1";
+ case DBGWCR10_EL1: return "DBGWCR10_EL1";
+ case DBGWCR11_EL1: return "DBGWCR11_EL1";
+ case DBGWCR12_EL1: return "DBGWCR12_EL1";
+ case DBGWCR13_EL1: return "DBGWCR13_EL1";
+ case DBGWCR14_EL1: return "DBGWCR14_EL1";
+ case DBGWCR15_EL1: return "DBGWCR15_EL1";
+ case MDRAR_EL1: return "MDRAR_EL1";
+ case OSLAR_EL1: return "OSLAR_EL1";
+ case OSLSR_EL1: return "OSLSR_EL1";
+ case OSDLR_EL1: return "OSDLR_EL1";
+ case DBGPRCR_EL1: return "DBGPRCR_EL1";
+ case DBGCLAIMSET_EL1: return "DBGCLAIMSET_EL1";
+ case DBGCLAIMCLR_EL1: return "DBGCLAIMCLR_EL1";
+ case DBGAUTHSTATUS_EL1: return "DBGAUTHSTATUS_EL1";
+ case DBGDEVID2: return "DBGDEVID2";
+ case DBGDEVID1: return "DBGDEVID1";
+ case DBGDEVID0: return "DBGDEVID0";
+ case ID_PFR0_EL1: return "ID_PFR0_EL1";
+ case ID_PFR1_EL1: return "ID_PFR1_EL1";
+ case ID_DFR0_EL1: return "ID_DFR0_EL1";
+ case ID_AFR0_EL1: return "ID_AFR0_EL1";
+ case ID_ISAR0_EL1: return "ID_ISAR0_EL1";
+ case ID_ISAR1_EL1: return "ID_ISAR1_EL1";
+ case ID_ISAR2_EL1: return "ID_ISAR2_EL1";
+ case ID_ISAR3_EL1: return "ID_ISAR3_EL1";
+ case ID_ISAR4_EL1: return "ID_ISAR4_EL1";
+ case ID_ISAR5_EL1: return "ID_ISAR5_EL1";
+ case AFSR1_EL1: return "AFSR1_EL1";
+ case AFSR0_EL1: return "AFSR0_EL1";
+ case REVIDR_EL1: return "REVIDR_EL1";
+ }
+}
+
+enum CPSRField {
+ InvalidCPSRField = 0xff,
+ cpsr_SPSel = 0x5,
+ cpsr_DAIFSet = 0x1e,
+ cpsr_DAIFClr = 0x1f
+};
+
+static inline const char *getCPSRFieldName(CPSRField Val) {
+ switch(Val) {
+ default: assert(0 && "Invalid CPSR field value!");
+ case cpsr_SPSel: return "SPSel";
+ case cpsr_DAIFSet: return "DAIFSet";
+ case cpsr_DAIFClr: return "DAIFClr";
+ }
+}
+
+} // end namespace ARM64SYS
+
+namespace ARM64II {
+ /// Target Operand Flag enum.
+ enum TOF {
+ //===------------------------------------------------------------------===//
+ // ARM64 Specific MachineOperand flags.
+
+ MO_NO_FLAG,
+
+ MO_FRAGMENT = 0x7,
+
+ /// MO_PAGE - A symbol operand with this flag represents the pc-relative
+ /// offset of the 4K page containing the symbol. This is used with the
+ /// ADRP instruction.
+ MO_PAGE = 1,
+
+ /// MO_PAGEOFF - A symbol operand with this flag represents the offset of
+ /// that symbol within a 4K page. This offset is added to the page address
+ /// to produce the complete address.
+ MO_PAGEOFF = 2,
+
+ /// MO_G3 - A symbol operand with this flag (granule 3) represents the high
+ /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction
+ MO_G3 = 3,
+
+ /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits
+ /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction
+ MO_G2 = 4,
+
+ /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits
+ /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction
+ MO_G1 = 5,
+
+ /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits
+ /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction
+ MO_G0 = 6,
+
+ /// MO_GOT - This flag indicates that a symbol operand represents the
+ /// address of the GOT entry for the symbol, rather than the address of
+ /// the symbol itself.
+ MO_GOT = 8,
+
+ /// MO_NC - Indicates whether the linker is expected to check the symbol
+ /// reference for overflow. For example in an ADRP/ADD pair of relocations
+ /// the ADRP usually does check, but not the ADD.
+ MO_NC = 0x10,
+
+ /// MO_TLS - Indicates that the operand being accessed is some kind of
+ /// thread-local symbol. On Darwin, only one type of thread-local access
+ /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
+ /// referee will affect interpretation.
+ MO_TLS = 0x20
+ };
+} // end namespace ARM64II
+
+} // end namespace llvm
+
+#endif
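The target-operand flags in ARM64II above are laid out so that the low three bits (MO_FRAGMENT) select which piece of an address a MOVZ/MOVK or ADRP/ADD sequence materialises, while MO_GOT, MO_NC and MO_TLS are independent bits that can be OR'd on top. A minimal sketch of decomposing such a flag word, assuming only the enum declared above (the helper name is illustrative, not part of this header):

// Hypothetical helper showing how a combined flag such as
// (ARM64II::MO_PAGEOFF | ARM64II::MO_NC) decomposes.
static void describeTargetFlags(unsigned TF) {
  unsigned Frag = TF & ARM64II::MO_FRAGMENT;  // MO_PAGE, MO_PAGEOFF, MO_G0..MO_G3
  bool ViaGOT  = (TF & ARM64II::MO_GOT) != 0; // address of the GOT entry
  bool NoCheck = (TF & ARM64II::MO_NC) != 0;  // linker overflow check suppressed
  bool IsTLS   = (TF & ARM64II::MO_TLS) != 0; // thread-local symbol access
  (void)Frag; (void)ViaGOT; (void)NoCheck; (void)IsTLS;
}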
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp
new file mode 100644
index 0000000..1a132a1
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp
@@ -0,0 +1,237 @@
+//===-- ARM64ELFObjectWriter.cpp - ARM64 ELF Writer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file handles ELF-specific object emission, converting LLVM's internal
+// fixups into the appropriate relocations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARM64FixupKinds.h"
+#include "MCTargetDesc/ARM64MCExpr.h"
+#include "MCTargetDesc/ARM64MCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class ARM64ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ ARM64ELFObjectWriter(uint8_t OSABI);
+
+ virtual ~ARM64ELFObjectWriter();
+
+protected:
+ unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel) const override;
+
+private:
+};
+}
+
+ARM64ELFObjectWriter::ARM64ELFObjectWriter(uint8_t OSABI)
+ : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
+ /*HasRelocationAddend*/ true) {}
+
+ARM64ELFObjectWriter::~ARM64ELFObjectWriter() {}
+
+unsigned ARM64ELFObjectWriter::GetRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ ARM64MCExpr::VariantKind RefKind =
+ static_cast<ARM64MCExpr::VariantKind>(Target.getRefKind());
+ ARM64MCExpr::VariantKind SymLoc = ARM64MCExpr::getSymbolLoc(RefKind);
+ bool IsNC = ARM64MCExpr::isNotChecked(RefKind);
+
+ assert((!Target.getSymA() ||
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) &&
+ "Should only be expression-level modifiers here");
+
+ assert((!Target.getSymB() ||
+ Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) &&
+ "Should only be expression-level modifiers here");
+
+ if (IsPCRel) {
+ switch ((unsigned)Fixup.getKind()) {
+ case FK_Data_2:
+ return ELF::R_AARCH64_PREL16;
+ case FK_Data_4:
+ return ELF::R_AARCH64_PREL32;
+ case FK_Data_8:
+ return ELF::R_AARCH64_PREL64;
+ case ARM64::fixup_arm64_pcrel_adr_imm21:
+ llvm_unreachable("No ELF relocations supported for ADR at the moment");
+ case ARM64::fixup_arm64_pcrel_adrp_imm21:
+ if (SymLoc == ARM64MCExpr::VK_ABS && !IsNC)
+ return ELF::R_AARCH64_ADR_PREL_PG_HI21;
+ if (SymLoc == ARM64MCExpr::VK_GOT && !IsNC)
+ return ELF::R_AARCH64_ADR_GOT_PAGE;
+ if (SymLoc == ARM64MCExpr::VK_GOTTPREL && !IsNC)
+ return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
+ if (SymLoc == ARM64MCExpr::VK_TLSDESC && !IsNC)
+ return ELF::R_AARCH64_TLSDESC_ADR_PAGE;
+ llvm_unreachable("invalid symbol kind for ADRP relocation");
+ case ARM64::fixup_arm64_pcrel_branch26:
+ return ELF::R_AARCH64_JUMP26;
+ case ARM64::fixup_arm64_pcrel_call26:
+ return ELF::R_AARCH64_CALL26;
+ case ARM64::fixup_arm64_pcrel_imm19:
+ return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19;
+ default:
+ llvm_unreachable("Unsupported pc-relative fixup kind");
+ }
+ } else {
+ switch ((unsigned)Fixup.getKind()) {
+ case FK_Data_2:
+ return ELF::R_AARCH64_ABS16;
+ case FK_Data_4:
+ return ELF::R_AARCH64_ABS32;
+ case FK_Data_8:
+ return ELF::R_AARCH64_ABS64;
+ case ARM64::fixup_arm64_add_imm12:
+ if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
+ return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12;
+ if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
+ return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12;
+ if (SymLoc == ARM64MCExpr::VK_TLSDESC && IsNC)
+ return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
+ return ELF::R_AARCH64_ADD_ABS_LO12_NC;
+
+ report_fatal_error("invalid fixup for add (uimm12) instruction");
+ return 0;
+ case ARM64::fixup_arm64_ldst_imm12_scale1:
+ if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
+ return ELF::R_AARCH64_LDST8_ABS_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12;
+ if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12;
+ if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC;
+
+ report_fatal_error("invalid fixup for 8-bit load/store instruction");
+ return 0;
+ case ARM64::fixup_arm64_ldst_imm12_scale2:
+ if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
+ return ELF::R_AARCH64_LDST16_ABS_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12;
+ if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12;
+ if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC;
+
+ report_fatal_error("invalid fixup for 16-bit load/store instruction");
+ return 0;
+ case ARM64::fixup_arm64_ldst_imm12_scale4:
+ if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
+ return ELF::R_AARCH64_LDST32_ABS_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12;
+ if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12;
+ if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC;
+
+ report_fatal_error("invalid fixup for 32-bit load/store instruction");
+ return 0;
+ case ARM64::fixup_arm64_ldst_imm12_scale8:
+ if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
+ return ELF::R_AARCH64_LDST64_ABS_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_GOT && IsNC)
+ return ELF::R_AARCH64_LD64_GOT_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12;
+ if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
+ return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12;
+ if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
+ return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_GOTTPREL && IsNC)
+ return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
+ if (SymLoc == ARM64MCExpr::VK_TLSDESC && IsNC)
+ return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
+
+ report_fatal_error("invalid fixup for 64-bit load/store instruction");
+ return 0;
+ case ARM64::fixup_arm64_ldst_imm12_scale16:
+ if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
+ return ELF::R_AARCH64_LDST128_ABS_LO12_NC;
+
+ report_fatal_error("invalid fixup for 128-bit load/store instruction");
+ return 0;
+ case ARM64::fixup_arm64_movw:
+ if (RefKind == ARM64MCExpr::VK_ABS_G3)
+ return ELF::R_AARCH64_MOVW_UABS_G3;
+ if (RefKind == ARM64MCExpr::VK_ABS_G2)
+ return ELF::R_AARCH64_MOVW_UABS_G2;
+ if (RefKind == ARM64MCExpr::VK_ABS_G2_NC)
+ return ELF::R_AARCH64_MOVW_UABS_G2_NC;
+ if (RefKind == ARM64MCExpr::VK_ABS_G1)
+ return ELF::R_AARCH64_MOVW_UABS_G1;
+ if (RefKind == ARM64MCExpr::VK_ABS_G1_NC)
+ return ELF::R_AARCH64_MOVW_UABS_G1_NC;
+ if (RefKind == ARM64MCExpr::VK_ABS_G0)
+ return ELF::R_AARCH64_MOVW_UABS_G0;
+ if (RefKind == ARM64MCExpr::VK_ABS_G0_NC)
+ return ELF::R_AARCH64_MOVW_UABS_G0_NC;
+ if (RefKind == ARM64MCExpr::VK_DTPREL_G2)
+ return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
+ if (RefKind == ARM64MCExpr::VK_DTPREL_G1)
+ return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1;
+ if (RefKind == ARM64MCExpr::VK_DTPREL_G1_NC)
+ return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC;
+ if (RefKind == ARM64MCExpr::VK_DTPREL_G0)
+ return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0;
+ if (RefKind == ARM64MCExpr::VK_DTPREL_G0_NC)
+ return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC;
+ if (RefKind == ARM64MCExpr::VK_TPREL_G2)
+ return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2;
+ if (RefKind == ARM64MCExpr::VK_TPREL_G1)
+ return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1;
+ if (RefKind == ARM64MCExpr::VK_TPREL_G1_NC)
+ return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC;
+ if (RefKind == ARM64MCExpr::VK_TPREL_G0)
+ return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0;
+ if (RefKind == ARM64MCExpr::VK_TPREL_G0_NC)
+ return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC;
+ if (RefKind == ARM64MCExpr::VK_GOTTPREL_G1)
+ return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
+ if (RefKind == ARM64MCExpr::VK_GOTTPREL_G0_NC)
+ return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
+ report_fatal_error("invalid fixup for movz/movk instruction");
+ return 0;
+ case ARM64::fixup_arm64_tlsdesc_call:
+ return ELF::R_AARCH64_TLSDESC_CALL;
+ default:
+ llvm_unreachable("Unknown ELF relocation type");
+ }
+ }
+
+ llvm_unreachable("Unimplemented fixup -> relocation");
+}
+
+MCObjectWriter *llvm::createARM64ELFObjectWriter(raw_ostream &OS,
+ uint8_t OSABI) {
+ MCELFObjectTargetWriter *MOTW = new ARM64ELFObjectWriter(OSABI);
+ return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
+}
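Relocation selection in GetRelocType above keys on three things in order: whether the fixup is pc-relative, the fixup kind itself, and finally the symbol-location and not-checked bits carried by the ARM64MCExpr variant. A minimal sketch of that last step for the ":lo12:" add-immediate case, using only declarations from this patch (the helper name is illustrative):

// An "add x0, x0, :lo12:sym" carries VK_ABS plus the not-checked bit, which
// the fixup_arm64_add_imm12 case above maps to R_AARCH64_ADD_ABS_LO12_NC.
static unsigned relocForAddLo12(ARM64MCExpr::VariantKind RefKind) {
  ARM64MCExpr::VariantKind SymLoc = ARM64MCExpr::getSymbolLoc(RefKind);
  bool IsNC = ARM64MCExpr::isNotChecked(RefKind);
  if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
    return ELF::R_AARCH64_ADD_ABS_LO12_NC;
  report_fatal_error("combination not covered by this sketch");
}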
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp
new file mode 100644
index 0000000..97a3493
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp
@@ -0,0 +1,158 @@
+//===- ARM64ELFStreamer.cpp - ELF Object Output for ARM64 -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits AArch64 ELF .o object files. It
+// differs from the generic ELF streamer in that it emits mapping symbols
+// ($x and $d) to delimit regions of data and code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
+/// the appropriate points in the object files. These symbols are defined in the
+/// AArch64 ELF ABI:
+/// infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf
+///
+/// In brief: $x or $d should be emitted at the start of each contiguous region
+/// of A64 code or data in a section. In practice, this emission does not rely
+/// on explicit assembler directives but on inherent properties of the
+/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an
+/// instruction).
+///
+/// As a result this system is orthogonal to the DataRegion infrastructure used
+/// by MachO. Beware!
+class ARM64ELFStreamer : public MCELFStreamer {
+public:
+ ARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
+ MCCodeEmitter *Emitter)
+ : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0),
+ LastEMS(EMS_None) {}
+
+ ~ARM64ELFStreamer() {}
+
+ virtual void ChangeSection(const MCSection *Section,
+ const MCExpr *Subsection) {
+ // We have to keep track of the mapping symbol state of any sections we
+ // use. Each one should start off as EMS_None, which is provided as the
+ // default constructor by DenseMap::lookup.
+ LastMappingSymbols[getPreviousSection().first] = LastEMS;
+ LastEMS = LastMappingSymbols.lookup(Section);
+
+ MCELFStreamer::ChangeSection(Section, Subsection);
+ }
+
+ /// This function is the one used to emit instruction data into the ELF
+ /// streamer. We override it to add the appropriate mapping symbol if
+ /// necessary.
+ virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) {
+ EmitA64MappingSymbol();
+ MCELFStreamer::EmitInstruction(Inst, STI);
+ }
+
+ /// This is one of the functions used to emit data into an ELF section, so the
+ /// ARM64 streamer overrides it to add the appropriate mapping symbol ($d)
+ /// if necessary.
+ virtual void EmitBytes(StringRef Data) {
+ EmitDataMappingSymbol();
+ MCELFStreamer::EmitBytes(Data);
+ }
+
+ /// This is one of the functions used to emit data into an ELF section, so the
+ /// ARM64 streamer overrides it to add the appropriate mapping symbol ($d)
+ /// if necessary.
+ virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {
+ EmitDataMappingSymbol();
+ MCELFStreamer::EmitValueImpl(Value, Size);
+ }
+
+private:
+ enum ElfMappingSymbol {
+ EMS_None,
+ EMS_A64,
+ EMS_Data
+ };
+
+ void EmitDataMappingSymbol() {
+ if (LastEMS == EMS_Data)
+ return;
+ EmitMappingSymbol("$d");
+ LastEMS = EMS_Data;
+ }
+
+ void EmitA64MappingSymbol() {
+ if (LastEMS == EMS_A64)
+ return;
+ EmitMappingSymbol("$x");
+ LastEMS = EMS_A64;
+ }
+
+ void EmitMappingSymbol(StringRef Name) {
+ MCSymbol *Start = getContext().CreateTempSymbol();
+ EmitLabel(Start);
+
+ MCSymbol *Symbol = getContext().GetOrCreateSymbol(
+ Name + "." + Twine(MappingSymbolCounter++));
+
+ MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+ MCELF::SetType(SD, ELF::STT_NOTYPE);
+ MCELF::SetBinding(SD, ELF::STB_LOCAL);
+ SD.setExternal(false);
+ Symbol->setSection(*getCurrentSection().first);
+
+ const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
+ Symbol->setVariableValue(Value);
+ }
+
+ int64_t MappingSymbolCounter;
+
+ DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
+ ElfMappingSymbol LastEMS;
+
+ /// @}
+};
+}
+
+namespace llvm {
+MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_ostream &OS, MCCodeEmitter *Emitter,
+ bool RelaxAll, bool NoExecStack) {
+ ARM64ELFStreamer *S = new ARM64ELFStreamer(Context, TAB, OS, Emitter);
+ if (RelaxAll)
+ S->getAssembler().setRelaxAll(true);
+ if (NoExecStack)
+ S->getAssembler().setNoExecStack(true);
+ return S;
+}
+}
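The mapping-symbol handling above is a small state machine: a $x or $d is emitted only when the kind of region actually changes, and the per-section state is saved and restored in ChangeSection. A self-contained restatement of the transition rule (names are illustrative):

enum EMS { EMS_None, EMS_A64, EMS_Data };
// Returns the mapping symbol to emit before the next chunk, or nullptr when
// the region kind is unchanged; callers pass only EMS_A64 or EMS_Data, which
// mirrors EmitA64MappingSymbol and EmitDataMappingSymbol above.
static const char *mappingSymbolFor(EMS &Last, EMS Next) {
  if (Last == Next)
    return nullptr;
  Last = Next;
  return Next == EMS_A64 ? "$x" : "$d";
}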
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h
new file mode 100644
index 0000000..72dadbc
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h
@@ -0,0 +1,26 @@
+//===-- ARM64ELFStreamer.h - ELF Streamer for ARM64 -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF streamer information for the ARM64 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64_ELF_STREAMER_H
+#define LLVM_AARCH64_ELF_STREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+
+MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_ostream &OS, MCCodeEmitter *Emitter,
+ bool RelaxAll, bool NoExecStack);
+}
+
+#endif // LLVM_AARCH64_ELF_STREAMER_H
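For orientation, a sketch of how this factory might be called; the concrete caller (assumed to be the object-streamer hook registered with the TargetRegistry elsewhere in this backend) supplies the context, asm backend, output stream and code emitter:

// Sketch only: Ctx, TAB, OS and Emitter are assumed to exist already.
MCELFStreamer *S = createARM64ELFStreamer(Ctx, TAB, OS, Emitter,
                                          /*RelaxAll=*/false,
                                          /*NoExecStack=*/false);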
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h b/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h
new file mode 100644
index 0000000..02eb91f
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h
@@ -0,0 +1,72 @@
+//===-- ARM64FixupKinds.h - ARM64 Specific Fixup Entries --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARM64FIXUPKINDS_H
+#define LLVM_ARM64FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace ARM64 {
+
+enum Fixups {
+ // fixup_arm64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into
+ // an ADR instruction.
+ fixup_arm64_pcrel_adr_imm21 = FirstTargetFixupKind,
+
+ // fixup_arm64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into
+ // an ADRP instruction.
+ fixup_arm64_pcrel_adrp_imm21,
+
+ // fixup_arm64_add_imm12 - 12-bit fixup for add/sub instructions.
+ // No alignment adjustment. All value bits are encoded.
+ fixup_arm64_add_imm12,
+
+ // fixup_arm64_ldst_imm12_* - unsigned 12-bit fixups for load and
+ // store instructions.
+ fixup_arm64_ldst_imm12_scale1,
+ fixup_arm64_ldst_imm12_scale2,
+ fixup_arm64_ldst_imm12_scale4,
+ fixup_arm64_ldst_imm12_scale8,
+ fixup_arm64_ldst_imm12_scale16,
+
+ // FIXME: comment
+ fixup_arm64_movw,
+
+ // fixup_arm64_pcrel_branch14 - The high 14 bits of the 16-bit pc-relative
+ // immediate used by test-and-branch (TBZ/TBNZ) instructions.
+ fixup_arm64_pcrel_branch14,
+
+ // fixup_arm64_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative
+ // immediate. This is not used as part of a lo/hi pair and thus generates
+ // relocations directly when necessary (e.g. for conditional branch
+ // targets).
+ fixup_arm64_pcrel_imm19,
+
+ // fixup_arm64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative
+ // immediate.
+ fixup_arm64_pcrel_branch26,
+
+ // fixup_arm64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative
+ // immediate. Distinguished from branch26 only on ELF.
+ fixup_arm64_pcrel_call26,
+
+ // fixup_arm64_tlsdesc_call - zero-space placeholder for the ELF
+ // R_AARCH64_TLSDESC_CALL relocation.
+ fixup_arm64_tlsdesc_call,
+
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+
+} // end namespace ARM64
+} // end namespace llvm
+
+#endif
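Each enumerator above sits at or after FirstTargetFixupKind, so it can be cast directly to an MCFixupKind and attached to an MCFixup, which is exactly what the code emitter later in this patch does. A minimal sketch, where Fixups, Expr and Loc stand for the fixup list, an expression operand and its source location:

// Sketch: record a 26-bit pc-relative call fixup against Expr.
MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_call26);
Fixups.push_back(MCFixup::Create(/*Offset=*/0, Expr, Kind, Loc));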
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp
new file mode 100644
index 0000000..97e0d3c
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp
@@ -0,0 +1,92 @@
+//===-- ARM64MCAsmInfo.cpp - ARM64 asm properties -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the ARM64MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM64MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+enum AsmWriterVariantTy {
+ Default = -1,
+ Generic = 0,
+ Apple = 1
+};
+
+static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
+ "arm64-neon-syntax", cl::init(Default),
+ cl::desc("Choose style of NEON code to emit from ARM64 backend:"),
+ cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"),
+ clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"),
+ clEnumValEnd));
+
+ARM64MCAsmInfoDarwin::ARM64MCAsmInfoDarwin() {
+ // We prefer NEON instructions to be printed in the short form.
+ AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
+
+ PrivateGlobalPrefix = "L";
+ SeparatorString = "%%";
+ CommentString = ";";
+ PointerSize = CalleeSaveStackSlotSize = 8;
+
+ AlignmentIsInBytes = false;
+ UsesELFSectionDirectiveForBSS = true;
+ SupportsDebugInformation = true;
+ UseDataRegionDirectives = true;
+
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+const MCExpr *ARM64MCAsmInfoDarwin::getExprForPersonalitySymbol(
+ const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const {
+ // On Darwin, we can reference dwarf symbols with foo@GOT-., which
+ // is an indirect pc-relative reference. The default implementation
+ // won't reference using the GOT, so we need this target-specific
+ // version.
+ MCContext &Context = Streamer.getContext();
+ const MCExpr *Res =
+ MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context);
+ MCSymbol *PCSym = Context.CreateTempSymbol();
+ Streamer.EmitLabel(PCSym);
+ const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context);
+ return MCBinaryExpr::CreateSub(Res, PC, Context);
+}
+
+ARM64MCAsmInfoELF::ARM64MCAsmInfoELF() {
+ // We prefer NEON instructions to be printed in the short form.
+ AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
+
+ PointerSize = 8;
+
+ // ".comm align is in bytes but .align is pow-2."
+ AlignmentIsInBytes = false;
+
+ CommentString = "//";
+ PrivateGlobalPrefix = ".L";
+ Code32Directive = ".code\t32";
+
+ Data16bitsDirective = "\t.hword\t";
+ Data32bitsDirective = "\t.word\t";
+ Data64bitsDirective = "\t.xword\t";
+
+ UseDataRegionDirectives = false;
+
+ WeakRefDirective = "\t.weak\t";
+
+ HasLEB128 = true;
+ SupportsDebugInformation = true;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+}
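A practical consequence of the two constructors above: with '-arm64-neon-syntax' left at its default, Darwin output uses the Apple short-form NEON syntax and ELF output uses the generic syntax, while an explicit option value overrides both. A small sketch of that resolution, reusing the enum and cl::opt declared in this file (the helper name is illustrative):

// 1 selects the Apple short form, 0 the generic form, matching the
// AssemblerDialect assignments in the two constructors above.
static unsigned resolveAsmDialect(bool IsDarwin) {
  if (AsmWriterVariant != Default)
    return AsmWriterVariant;           // explicit -arm64-neon-syntax=...
  return IsDarwin ? Apple : Generic;   // Darwin default is 1, ELF default is 0
}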
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h
new file mode 100644
index 0000000..f2d33a7
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h
@@ -0,0 +1,36 @@
+//=====-- ARM64MCAsmInfo.h - ARM64 asm properties -----------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARM64MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64TARGETASMINFO_H
+#define ARM64TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+class Target;
+class StringRef;
+class MCStreamer;
+struct ARM64MCAsmInfoDarwin : public MCAsmInfoDarwin {
+ explicit ARM64MCAsmInfoDarwin();
+ virtual const MCExpr *getExprForPersonalitySymbol(const MCSymbol *Sym,
+ unsigned Encoding,
+ MCStreamer &Streamer) const;
+};
+
+struct ARM64MCAsmInfoELF : public MCAsmInfo {
+ explicit ARM64MCAsmInfoELF();
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp
new file mode 100644
index 0000000..19559f8
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp
@@ -0,0 +1,563 @@
+//===-- ARM64/ARM64MCCodeEmitter.cpp - Convert ARM64 code to machine code -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM64MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "MCTargetDesc/ARM64BaseInfo.h"
+#include "MCTargetDesc/ARM64FixupKinds.h"
+#include "MCTargetDesc/ARM64MCExpr.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumFixups, "Number of MC fixups created.");
+
+namespace {
+
+class ARM64MCCodeEmitter : public MCCodeEmitter {
+ MCContext &Ctx;
+
+ ARM64MCCodeEmitter(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT
+ void operator=(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT
+public:
+ ARM64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+ MCContext &ctx)
+ : Ctx(ctx) {}
+
+ ~ARM64MCCodeEmitter() {}
+
+ // getBinaryCodeForInstr - TableGen'erated function for getting the
+ // binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMachineOpValue - Return binary encoding of operand. If the machine
+ /// operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAMIndexed8OpValue - Return encoding info for base register
+ /// and 12-bit unsigned immediate attached to a load, store or prfm
+ /// instruction. If operand requires a relocation, record it and
+ /// return zero in that part of the encoding.
+ template <uint32_t FixupKind>
+ uint32_t getAMIndexed8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+ /// target.
+ uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+ /// the 2-bit shift field.
+ uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getCondBranchTargetOpValue - Return the encoded value for a conditional
+ /// branch target.
+ uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+ /// branch target.
+ uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getBranchTargetOpValue - Return the encoded value for an unconditional
+ /// branch target.
+ uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMoveWideImmOpValue - Return the encoded value for the immediate operand
+ /// of a MOVZ or MOVK instruction.
+ uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getVecShifterOpValue - Return the encoded value for the vector shifter.
+ uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMoveVecShifterOpValue - Return the encoded value for the vector move
+ /// shifter (MSL).
+ uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getFixedPointScaleOpValue - Return the encoded value for the
+ /// FP-to-fixed-point scale factor.
+ uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getSIMDShift64OpValue - Return the encoded value for the
+ /// shift-by-immediate AdvSIMD instructions.
+ uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const;
+
+ void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; }
+
+ void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
+ // Output the constant in little endian byte order.
+ for (unsigned i = 0; i != Size; ++i) {
+ EmitByte(Val & 255, OS);
+ Val >>= 8;
+ }
+ }
+
+ void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createARM64MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new ARM64MCCodeEmitter(MCII, STI, Ctx);
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned
+ARM64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MO.isReg())
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+ else {
+ assert(MO.isImm() && "did not expect relocated expression");
+ return static_cast<unsigned>(MO.getImm());
+ }
+
+ assert(0 && "Unable to encode MCOperand!");
+ return 0;
+}
+
+template <uint32_t FixupKind>
+uint32_t
+ARM64MCCodeEmitter::getAMIndexed8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned BaseReg = MI.getOperand(OpIdx).getReg();
+ BaseReg = Ctx.getRegisterInfo()->getEncodingValue(BaseReg);
+
+ const MCOperand &MO = MI.getOperand(OpIdx + 1);
+ uint32_t ImmVal = 0;
+
+ if (MO.isImm())
+ ImmVal = static_cast<uint32_t>(MO.getImm());
+ else {
+ assert(MO.isExpr() && "unable to encode load/store imm operand");
+ MCFixupKind Kind = MCFixupKind(FixupKind);
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+ ++MCNumFixups;
+ }
+
+ return BaseReg | (ImmVal << 5);
+}
+
+/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+/// target.
+uint32_t
+ARM64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected ADR target type!");
+ const MCExpr *Expr = MO.getExpr();
+
+ MCFixupKind Kind = MI.getOpcode() == ARM64::ADR
+ ? MCFixupKind(ARM64::fixup_arm64_pcrel_adr_imm21)
+ : MCFixupKind(ARM64::fixup_arm64_pcrel_adrp_imm21);
+ Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+ MCNumFixups += 1;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+/// the shift field: bit 12 of the returned value is set when the immediate
+/// is shifted left by 12.
+uint32_t
+ARM64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Suboperands are [imm, shifter].
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+ assert(ARM64_AM::getShiftType(MO1.getImm()) == ARM64_AM::LSL &&
+ "unexpected shift type for add/sub immediate");
+ unsigned ShiftVal = ARM64_AM::getShiftValue(MO1.getImm());
+ assert((ShiftVal == 0 || ShiftVal == 12) &&
+ "unexpected shift value for add/sub immediate");
+ if (MO.isImm())
+ return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12));
+ assert(MO.isExpr() && "Unable to encode MCOperand!");
+ const MCExpr *Expr = MO.getExpr();
+ assert(ShiftVal == 0 && "shift not allowed on add/sub immediate with fixup");
+
+ // Encode the 12 bits of the fixup.
+ MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_add_imm12);
+ Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ return 0;
+}
+
+/// getCondBranchTargetOpValue - Return the encoded value for a conditional
+/// branch target.
+uint32_t ARM64MCCodeEmitter::getCondBranchTargetOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected target type!");
+
+ MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_imm19);
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+uint32_t
+ARM64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected movz/movk immediate");
+
+ Fixups.push_back(MCFixup::Create(
+ 0, MO.getExpr(), MCFixupKind(ARM64::fixup_arm64_movw), MI.getLoc()));
+
+ ++MCNumFixups;
+
+ return 0;
+}
+
+/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+/// branch target.
+uint32_t ARM64MCCodeEmitter::getTestBranchTargetOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected ADR target type!");
+
+ MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_branch14);
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+/// getBranchTargetOpValue - Return the encoded value for an unconditional
+/// branch target.
+uint32_t
+ARM64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm())
+ return MO.getImm();
+ assert(MO.isExpr() && "Unexpected ADR target type!");
+
+ MCFixupKind Kind = MI.getOpcode() == ARM64::BL
+ ? MCFixupKind(ARM64::fixup_arm64_pcrel_call26)
+ : MCFixupKind(ARM64::fixup_arm64_pcrel_branch26);
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
+/// getVecShifterOpValue - Return the encoded value for the vector shifter:
+///
+/// 00 -> 0
+/// 01 -> 8
+/// 10 -> 16
+/// 11 -> 24
+uint32_t
+ARM64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+
+ switch (MO.getImm()) {
+ default:
+ break;
+ case 0:
+ return 0;
+ case 8:
+ return 1;
+ case 16:
+ return 2;
+ case 24:
+ return 3;
+ }
+
+ assert(false && "Invalid value for vector shift amount!");
+ return 0;
+}
+
+uint32_t
+ARM64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+ return 64 - (MO.getImm());
+}
+
+uint32_t
+ARM64MCCodeEmitter::getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+ return 64 - (MO.getImm() | 32);
+}
+
+uint32_t
+ARM64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+ return 32 - (MO.getImm() | 16);
+}
+
+uint32_t
+ARM64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+ return 16 - (MO.getImm() | 8);
+}
+
+/// getFixedPointScaleOpValue - Return the encoded value for the
+/// FP-to-fixed-point scale factor.
+uint32_t ARM64MCCodeEmitter::getFixedPointScaleOpValue(
+ const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 64 - MO.getImm();
+}
+
+uint32_t
+ARM64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 64 - MO.getImm();
+}
+
+uint32_t
+ARM64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 32 - MO.getImm();
+}
+
+uint32_t
+ARM64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 16 - MO.getImm();
+}
+
+uint32_t
+ARM64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return 8 - MO.getImm();
+}
+
+uint32_t
+ARM64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return MO.getImm() - 64;
+}
+
+uint32_t
+ARM64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return MO.getImm() - 32;
+}
+
+uint32_t
+ARM64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return MO.getImm() - 16;
+}
+
+uint32_t
+ARM64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+ return MO.getImm() - 8;
+}
+
+/// getMoveVecShifterOpValue - Return the encoded value for the vector move
+/// shifter (MSL).
+uint32_t
+ARM64MCCodeEmitter::getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isImm() &&
+ "Expected an immediate value for the move shift amount!");
+ unsigned ShiftVal = ARM64_AM::getShiftValue(MO.getImm());
+ assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!");
+ return ShiftVal == 8 ? 0 : 1;
+}
+
+unsigned ARM64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+ const MCSubtargetInfo &STI) const {
+ // If one of the signed fixup kinds is applied to a MOVZ instruction, the
+ // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's
+ // job to ensure that any bits possibly affected by this are 0. This means we
+ // must zero out bit 30 (essentially emitting a MOVN).
+ MCOperand UImm16MO = MI.getOperand(1);
+
+ // Nothing to do if there's no fixup.
+ if (UImm16MO.isImm())
+ return EncodedValue;
+
+ return EncodedValue & ~(1u << 30);
+}
+
+void ARM64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MI.getOpcode() == ARM64::TLSDESCCALL) {
+ // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
+ // following (BLR) instruction. It doesn't emit any code itself so it
+ // doesn't go through the normal TableGenerated channels.
+ MCFixupKind Fixup = MCFixupKind(ARM64::fixup_arm64_tlsdesc_call);
+ Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup));
+ return;
+ }
+
+ uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+ EmitConstant(Binary, 4, OS);
+ ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+#include "ARM64GenMCCodeEmitter.inc"
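A worked example of the operand packing in getAddSubImmOpValue above: the 12-bit immediate occupies bits 0-11 of the returned value and bit 12 records an LSL #12 shift. A self-contained sketch (the function name is illustrative):

#include <cassert>
#include <cstdint>
// E.g. packAddSubImm(0x123, 12) == 0x1123: immediate 0x123 shifted left by 12.
static uint32_t packAddSubImm(uint32_t Imm12, unsigned ShiftVal) {
  assert(Imm12 < (1u << 12) && (ShiftVal == 0 || ShiftVal == 12));
  return Imm12 | (ShiftVal == 0 ? 0 : (1u << 12));
}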
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp
new file mode 100644
index 0000000..d4ab140
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp
@@ -0,0 +1,168 @@
+//===-- ARM64MCExpr.cpp - ARM64 specific MC expression classes --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the assembly expression modifiers
+// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...).
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64symbolrefexpr"
+#include "ARM64MCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+const ARM64MCExpr *ARM64MCExpr::Create(const MCExpr *Expr, VariantKind Kind,
+ MCContext &Ctx) {
+ return new (Ctx) ARM64MCExpr(Expr, Kind);
+}
+
+StringRef ARM64MCExpr::getVariantKindName() const {
+ switch (static_cast<uint32_t>(getKind())) {
+ case VK_CALL: return "";
+ case VK_LO12: return ":lo12:";
+ case VK_ABS_G3: return ":abs_g3:";
+ case VK_ABS_G2: return ":abs_g2:";
+ case VK_ABS_G2_NC: return ":abs_g2_nc:";
+ case VK_ABS_G1: return ":abs_g1:";
+ case VK_ABS_G1_NC: return ":abs_g1_nc:";
+ case VK_ABS_G0: return ":abs_g0:";
+ case VK_ABS_G0_NC: return ":abs_g0_nc:";
+ case VK_DTPREL_G2: return ":dtprel_g2:";
+ case VK_DTPREL_G1: return ":dtprel_g1:";
+ case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:";
+ case VK_DTPREL_G0: return ":dtprel_g0:";
+ case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:";
+ case VK_DTPREL_LO12: return ":dtprel_lo12:";
+ case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:";
+ case VK_TPREL_G2: return ":tprel_g2:";
+ case VK_TPREL_G1: return ":tprel_g1:";
+ case VK_TPREL_G1_NC: return ":tprel_g1_nc:";
+ case VK_TPREL_G0: return ":tprel_g0:";
+ case VK_TPREL_G0_NC: return ":tprel_g0_nc:";
+ case VK_TPREL_LO12: return ":tprel_lo12:";
+ case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:";
+ case VK_TLSDESC_LO12: return ":tlsdesc_lo12:";
+ case VK_ABS_PAGE: return "";
+ case VK_GOT_PAGE: return ":got:";
+ case VK_GOT_LO12: return ":got_lo12:";
+ case VK_GOTTPREL_PAGE: return ":gottprel:";
+ case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:";
+ case VK_GOTTPREL_G1: return ":gottprel_g1:";
+ case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:";
+ case VK_TLSDESC: return "";
+ case VK_TLSDESC_PAGE: return ":tlsdesc:";
+ default:
+ llvm_unreachable("Invalid ELF symbol kind");
+ }
+}
+
+void ARM64MCExpr::PrintImpl(raw_ostream &OS) const {
+ if (getKind() != VK_NONE)
+ OS << getVariantKindName();
+ OS << *Expr;
+}
+
+// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
+// that method should be made public?
+// FIXME: really do above: now that two backends are using it.
+static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
+ switch (Value->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Can't handle nested target expr!");
+ break;
+
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+ AddValueSymbolsImpl(BE->getLHS(), Asm);
+ AddValueSymbolsImpl(BE->getRHS(), Asm);
+ break;
+ }
+
+ case MCExpr::SymbolRef:
+ Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
+ break;
+
+ case MCExpr::Unary:
+ AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void ARM64MCExpr::AddValueSymbols(MCAssembler *Asm) const {
+ AddValueSymbolsImpl(getSubExpr(), Asm);
+}
+
+const MCSection *ARM64MCExpr::FindAssociatedSection() const {
+ llvm_unreachable("FIXME: what goes here?");
+}
+
+bool ARM64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout) const {
+ if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout))
+ return false;
+
+ Res =
+ MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+
+ return true;
+}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+ switch (Expr->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Can't handle nested target expression");
+ break;
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+ fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+ fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+ break;
+ }
+
+ case MCExpr::SymbolRef: {
+ // We're known to be under a TLS fixup, so any symbol should be
+ // modified. There should be only one.
+ const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+ MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol());
+ MCELF::SetType(SD, ELF::STT_TLS);
+ break;
+ }
+
+ case MCExpr::Unary:
+ fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void ARM64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+ switch (getSymbolLoc(Kind)) {
+ default:
+ return;
+ case VK_DTPREL:
+ case VK_GOTTPREL:
+ case VK_TPREL:
+ case VK_TLSDESC:
+ break;
+ }
+
+ fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
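The guard at the top of fixELFSymbolsInTLSFixups above amounts to a predicate on the symbol location: only the TLS-flavoured variants cause referenced symbols to be retyped as STT_TLS. A self-contained restatement of that predicate (the function name is illustrative):

static bool isTLSVariantKind(ARM64MCExpr::VariantKind Kind) {
  switch (ARM64MCExpr::getSymbolLoc(Kind)) {
  case ARM64MCExpr::VK_DTPREL:
  case ARM64MCExpr::VK_GOTTPREL:
  case ARM64MCExpr::VK_TPREL:
  case ARM64MCExpr::VK_TLSDESC:
    return true;
  default:
    return false;
  }
}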
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h
new file mode 100644
index 0000000..a33fe43
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h
@@ -0,0 +1,162 @@
+//=---- ARM64MCExpr.h - ARM64 specific MC expression classes ------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes ARM64-specific MCExprs, used for modifiers like
+// ":lo12:" or ":gottprel_g1:".
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARM64MCEXPR_H
+#define LLVM_ARM64MCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+class ARM64MCExpr : public MCTargetExpr {
+public:
+ enum VariantKind {
+ VK_NONE = 0x000,
+
+ // Symbol locations specifying (roughly speaking) what calculation should be
+ // performed to construct the final address for the relocated
+ // symbol. E.g. direct, via the GOT, ...
+ VK_ABS = 0x001,
+ VK_SABS = 0x002,
+ VK_GOT = 0x003,
+ VK_DTPREL = 0x004,
+ VK_GOTTPREL = 0x005,
+ VK_TPREL = 0x006,
+ VK_TLSDESC = 0x007,
+ VK_SymLocBits = 0x00f,
+
+ // Variants specifying which part of the final address calculation is
+ // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a
+ // MOVZ/MOVK.
+ VK_PAGE = 0x010,
+ VK_PAGEOFF = 0x020,
+ VK_G0 = 0x030,
+ VK_G1 = 0x040,
+ VK_G2 = 0x050,
+ VK_G3 = 0x060,
+ VK_AddressFragBits = 0x0f0,
+
+ // Whether the final relocation is a checked one (where a linker should
+ // perform a range-check on the final address) or not. Note that this field
+ // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12:
+ // on its own is a non-checked relocation. We side with ELF on being
+ // explicit about this!
+ VK_NC = 0x100,
+
+ // Convenience definitions for referring to specific textual representations
+ // of relocation specifiers. Note that this means the "_NC" is sometimes
+ // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC
+ // since a user would write ":lo12:").
+ VK_CALL = VK_ABS,
+ VK_ABS_PAGE = VK_ABS | VK_PAGE,
+ VK_ABS_G3 = VK_ABS | VK_G3,
+ VK_ABS_G2 = VK_ABS | VK_G2,
+ VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC,
+ VK_ABS_G1 = VK_ABS | VK_G1,
+ VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC,
+ VK_ABS_G0 = VK_ABS | VK_G0,
+ VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC,
+ VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC,
+ VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC,
+ VK_GOT_PAGE = VK_GOT | VK_PAGE,
+ VK_DTPREL_G2 = VK_DTPREL | VK_G2,
+ VK_DTPREL_G1 = VK_DTPREL | VK_G1,
+ VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC,
+ VK_DTPREL_G0 = VK_DTPREL | VK_G0,
+ VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC,
+ VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF,
+ VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC,
+ VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE,
+ VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC,
+ VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1,
+ VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC,
+ VK_TPREL_G2 = VK_TPREL | VK_G2,
+ VK_TPREL_G1 = VK_TPREL | VK_G1,
+ VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC,
+ VK_TPREL_G0 = VK_TPREL | VK_G0,
+ VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC,
+ VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF,
+ VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC,
+ VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC,
+ VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE,
+
+ VK_INVALID = 0xfff
+ };
+
+private:
+ const MCExpr *Expr;
+ const VariantKind Kind;
+
+ explicit ARM64MCExpr(const MCExpr *Expr, VariantKind Kind)
+ : Expr(Expr), Kind(Kind) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const ARM64MCExpr *Create(const MCExpr *Expr, VariantKind Kind,
+ MCContext &Ctx);
+
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// Get the kind of this expression.
+ VariantKind getKind() const { return static_cast<VariantKind>(Kind); }
+
+ /// Get the expression this modifier applies to.
+ const MCExpr *getSubExpr() const { return Expr; }
+
+ /// @}
+ /// @name VariantKind information extractors.
+ /// @{
+
+ static VariantKind getSymbolLoc(VariantKind Kind) {
+ return static_cast<VariantKind>(Kind & VK_SymLocBits);
+ }
+
+ static VariantKind getAddressFrag(VariantKind Kind) {
+ return static_cast<VariantKind>(Kind & VK_AddressFragBits);
+ }
+
+ static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; }
+
+ /// @}
+
+ /// Convert the variant kind into an ELF-appropriate modifier
+ /// (e.g. ":got:", ":lo12:").
+ StringRef getVariantKindName() const;
+
+ void PrintImpl(raw_ostream &OS) const;
+
+ void AddValueSymbols(MCAssembler *) const;
+
+ const MCSection *FindAssociatedSection() const;
+
+ bool EvaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout) const;
+
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const;
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+
+ static bool classof(const ARM64MCExpr *) { return true; }
+
+};
+} // end namespace llvm
+
+#endif
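
The VariantKind encoding above packs three independent facts into one value: the symbol-location bits in the low nibble, the address-fragment bits in the next nibble, and the VK_NC flag. A minimal sketch of how the extractors recover them, assuming this new header is on the include path; the assertions simply restate the table above.

#include "ARM64MCExpr.h" // the header introduced just above
#include <cassert>
using namespace llvm;

static void decomposeVariantKindExample() {
  // VK_DTPREL_LO12_NC == VK_DTPREL | VK_PAGEOFF | VK_NC per the enum above.
  ARM64MCExpr::VariantKind K = ARM64MCExpr::VK_DTPREL_LO12_NC;
  assert(ARM64MCExpr::getSymbolLoc(K) == ARM64MCExpr::VK_DTPREL);
  assert(ARM64MCExpr::getAddressFrag(K) == ARM64MCExpr::VK_PAGEOFF);
  assert(ARM64MCExpr::isNotChecked(K)); // the bare ":lo12:" form is unchecked
}
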
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp
new file mode 100644
index 0000000..8d54412
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp
@@ -0,0 +1,167 @@
+//===-- ARM64MCTargetDesc.cpp - ARM64 Target Descriptions -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM64MCTargetDesc.h"
+#include "ARM64ELFStreamer.h"
+#include "ARM64MCAsmInfo.h"
+#include "InstPrinter/ARM64InstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "ARM64GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "ARM64GenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "ARM64GenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createARM64MCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitARM64MCInstrInfo(X);
+ return X;
+}
+
+static MCSubtargetInfo *createARM64MCSubtargetInfo(StringRef TT, StringRef CPU,
+ StringRef FS) {
+ MCSubtargetInfo *X = new MCSubtargetInfo();
+ InitARM64MCSubtargetInfo(X, TT, CPU, FS);
+ return X;
+}
+
+static MCRegisterInfo *createARM64MCRegisterInfo(StringRef Triple) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitARM64MCRegisterInfo(X, ARM64::LR);
+ return X;
+}
+
+static MCAsmInfo *createARM64MCAsmInfo(const MCRegisterInfo &MRI,
+ StringRef TT) {
+ Triple TheTriple(TT);
+
+ MCAsmInfo *MAI;
+ if (TheTriple.isOSDarwin())
+ MAI = new ARM64MCAsmInfoDarwin();
+ else {
+ assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
+ MAI = new ARM64MCAsmInfoELF();
+ }
+
+ // Initial state of the frame pointer is SP.
+ unsigned Reg = MRI.getDwarfRegNum(ARM64::SP, true);
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0);
+ MAI->addInitialFrameState(Inst);
+
+ return MAI;
+}
+
+static MCCodeGenInfo *createARM64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL) {
+ Triple TheTriple(TT);
+ assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) &&
+ "Only expect Darwin and ELF targets");
+
+ if (CM == CodeModel::Default)
+ CM = CodeModel::Small;
+ // The default MCJIT memory managers make no guarantees about where they can
+ // find an executable page; JITed code needs to be able to refer to globals
+ // no matter how far away they are.
+ else if (CM == CodeModel::JITDefault)
+ CM = CodeModel::Large;
+ else if (CM != CodeModel::Small && CM != CodeModel::Large)
+ report_fatal_error("Only small and large code models are allowed on ARM64");
+
+ // ARM64 Darwin is always PIC.
+ if (TheTriple.isOSDarwin())
+ RM = Reloc::PIC_;
+ // On ELF platforms the default static relocation model has a smart enough
+ // linker to cope with referencing external symbols defined in a shared
+ // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
+ else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
+ RM = Reloc::Static;
+
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ X->InitMCCodeGenInfo(RM, CM, OL);
+ return X;
+}
+
+static MCInstPrinter *createARM64MCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI) {
+ if (SyntaxVariant == 0)
+ return new ARM64InstPrinter(MAI, MII, MRI, STI);
+ if (SyntaxVariant == 1)
+ return new ARM64AppleInstPrinter(MAI, MII, MRI, STI);
+
+ return 0;
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+ MCContext &Ctx, MCAsmBackend &TAB,
+ raw_ostream &OS, MCCodeEmitter *Emitter,
+ const MCSubtargetInfo &STI, bool RelaxAll,
+ bool NoExecStack) {
+ Triple TheTriple(TT);
+
+ if (TheTriple.isOSDarwin())
+ return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
+ /*LabelSections*/ true);
+
+ return createARM64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeARM64TargetMC() {
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn X(TheARM64Target, createARM64MCAsmInfo);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheARM64Target,
+ createARM64MCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheARM64Target, createARM64MCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheARM64Target, createARM64MCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheARM64Target,
+ createARM64MCSubtargetInfo);
+
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(TheARM64Target, createARM64AsmBackend);
+
+ // Register the MC Code Emitter
+ TargetRegistry::RegisterMCCodeEmitter(TheARM64Target,
+ createARM64MCCodeEmitter);
+
+ // Register the object streamer.
+ TargetRegistry::RegisterMCObjectStreamer(TheARM64Target, createMCStreamer);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(TheARM64Target,
+ createARM64MCInstPrinter);
+}
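
The Init/Register calls above only wire factory functions into the TargetRegistry; nothing here is invoked directly. A rough sketch of a client driving them, assuming the usual LLVM 3.5 registry API and that the LLVMInitializeARM64TargetInfo/TargetMC entry points have already run:

#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/TargetRegistry.h"
#include <string>
using namespace llvm;

static const MCRegisterInfo *lookupARM64RegInfo() {
  std::string Err;
  const Target *T = TargetRegistry::lookupTarget("arm64-apple-ios", Err);
  if (!T)
    return 0; // target not registered or not built in
  // Dispatches to the createARM64MCRegisterInfo function registered above.
  return T->createMCRegInfo("arm64-apple-ios");
}
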
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h
new file mode 100644
index 0000000..0db2b22
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h
@@ -0,0 +1,62 @@
+//===-- ARM64MCTargetDesc.h - ARM64 Target Descriptions ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM64MCTARGETDESC_H
+#define ARM64MCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCRegisterInfo;
+class MCObjectWriter;
+class MCSubtargetInfo;
+class StringRef;
+class Target;
+class raw_ostream;
+
+extern Target TheARM64Target;
+
+MCCodeEmitter *createARM64MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx);
+MCAsmBackend *createARM64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
+
+MCObjectWriter *createARM64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI);
+
+MCObjectWriter *createARM64MachObjectWriter(raw_ostream &OS, uint32_t CPUType,
+ uint32_t CPUSubtype);
+
+} // End llvm namespace
+
+// Defines symbolic names for ARM64 registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "ARM64GenRegisterInfo.inc"
+
+// Defines symbolic names for the ARM64 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "ARM64GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "ARM64GenSubtargetInfo.inc"
+
+#endif
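
For reference, the GET_REGINFO_ENUM / GET_INSTRINFO_ENUM includes are what make names such as ARM64::LR and ARM64::SP (used by ARM64MCTargetDesc.cpp above) visible as plain enumerators; a trivial illustration:

#include "MCTargetDesc/ARM64MCTargetDesc.h"
using namespace llvm;

// Illustrative only: the generated enums are ordinary unsigned constants.
static unsigned arm64LinkRegister() { return ARM64::LR; }
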
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp
new file mode 100644
index 0000000..1733dc5
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp
@@ -0,0 +1,396 @@
+//===-- ARM64MachObjectWriter.cpp - ARM64 Mach Object Writer --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARM64FixupKinds.h"
+#include "MCTargetDesc/ARM64MCTargetDesc.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+class ARM64MachObjectWriter : public MCMachObjectTargetWriter {
+ bool getARM64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
+ const MCSymbolRefExpr *Sym,
+ unsigned &Log2Size, const MCAssembler &Asm);
+
+public:
+ ARM64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype)
+ : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype,
+ /*UseAggressiveSymbolFolding=*/true) {}
+
+ void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+ const MCAsmLayout &Layout, const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue);
+};
+}
+
+bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo(
+ const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
+ unsigned &Log2Size, const MCAssembler &Asm) {
+ RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED);
+ Log2Size = ~0U;
+
+ switch ((unsigned)Fixup.getKind()) {
+ default:
+ return false;
+
+ case FK_Data_1:
+ Log2Size = llvm::Log2_32(1);
+ return true;
+ case FK_Data_2:
+ Log2Size = llvm::Log2_32(2);
+ return true;
+ case FK_Data_4:
+ Log2Size = llvm::Log2_32(4);
+ if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
+ RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
+ return true;
+ case FK_Data_8:
+ Log2Size = llvm::Log2_32(8);
+ if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
+ RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
+ return true;
+ case ARM64::fixup_arm64_add_imm12:
+ case ARM64::fixup_arm64_ldst_imm12_scale1:
+ case ARM64::fixup_arm64_ldst_imm12_scale2:
+ case ARM64::fixup_arm64_ldst_imm12_scale4:
+ case ARM64::fixup_arm64_ldst_imm12_scale8:
+ case ARM64::fixup_arm64_ldst_imm12_scale16:
+ Log2Size = llvm::Log2_32(4);
+ switch (Sym->getKind()) {
+ default:
+ assert(0 && "Unexpected symbol reference variant kind!");
+ case MCSymbolRefExpr::VK_PAGEOFF:
+ RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12);
+ return true;
+ case MCSymbolRefExpr::VK_GOTPAGEOFF:
+ RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12);
+ return true;
+ case MCSymbolRefExpr::VK_TLVPPAGEOFF:
+ RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12);
+ return true;
+ }
+ case ARM64::fixup_arm64_pcrel_adrp_imm21:
+ Log2Size = llvm::Log2_32(4);
+ // This encompasses the relocation for the whole 21-bit value.
+ switch (Sym->getKind()) {
+ default:
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "ADR/ADRP relocations must be GOT relative");
+ case MCSymbolRefExpr::VK_PAGE:
+ RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
+ return true;
+ case MCSymbolRefExpr::VK_GOTPAGE:
+ RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21);
+ return true;
+ case MCSymbolRefExpr::VK_TLVPPAGE:
+ RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21);
+ return true;
+ }
+ return true;
+ case ARM64::fixup_arm64_pcrel_branch26:
+ case ARM64::fixup_arm64_pcrel_call26:
+ Log2Size = llvm::Log2_32(4);
+ RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
+ return true;
+ }
+}
+
+void ARM64MachObjectWriter::RecordRelocation(
+ MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
+ unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+
+ // See <reloc.h>.
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment);
+ unsigned Log2Size = 0;
+ int64_t Value = 0;
+ unsigned Index = 0;
+ unsigned IsExtern = 0;
+ unsigned Type = 0;
+ unsigned Kind = Fixup.getKind();
+
+ FixupOffset += Fixup.getOffset();
+
+ // ARM64 pcrel relocation addends do not include the section offset.
+ if (IsPCRel)
+ FixedValue += FixupOffset;
+
+ // ADRP fixups use relocations for the whole symbol value and only
+ // put the addend in the instruction itself. Clear out any value the
+ // generic code figured out from the symbol definition.
+ if (Kind == ARM64::fixup_arm64_pcrel_adrp_imm21 ||
+ Kind == ARM64::fixup_arm64_pcrel_imm19)
+ FixedValue = 0;
+
+ // imm19 relocations are for conditional branches, which require
+ // assembler local symbols. If we got here, that's not what we have,
+ // so complain loudly.
+ if (Kind == ARM64::fixup_arm64_pcrel_imm19) {
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "conditional branch requires assembler-local"
+ " label. '" +
+ Target.getSymA()->getSymbol().getName() +
+ "' is external.");
+ return;
+ }
+
+ // 14-bit branch relocations should only target internal labels, and so
+ // should never get here.
+ if (Kind == ARM64::fixup_arm64_pcrel_branch14) {
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "Invalid relocation on conditional branch!");
+ return;
+ }
+
+ if (!getARM64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size,
+ Asm)) {
+ Asm.getContext().FatalError(Fixup.getLoc(), "unknown ARM64 fixup kind!");
+ return;
+ }
+
+ Value = Target.getConstant();
+
+ if (Target.isAbsolute()) { // constant
+ // FIXME: Should this always be extern?
+ // SymbolNum of 0 indicates the absolute section.
+ Type = MachO::ARM64_RELOC_UNSIGNED;
+ Index = 0;
+
+ if (IsPCRel) {
+ IsExtern = 1;
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "PC relative absolute relocation!");
+
+ // FIXME: x86_64 sets the type to a branch reloc here. Should we do
+ // something similar?
+ }
+ } else if (Target.getSymB()) { // A - B + constant
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+ MCSymbolData &A_SD = Asm.getSymbolData(*A);
+ const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
+
+ const MCSymbol *B = &Target.getSymB()->getSymbol();
+ MCSymbolData &B_SD = Asm.getSymbolData(*B);
+ const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
+
+ // Check for "_foo@got - .", which comes through here as:
+ // Ltmp0:
+ // ... _foo@got - Ltmp0
+ if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT &&
+ Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None &&
+ Layout.getSymbolOffset(&B_SD) ==
+ Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) {
+ // SymB is the PC, so use a PC-rel pointer-to-GOT relocation.
+ Index = A_Base->getIndex();
+ IsExtern = 1;
+ Type = MachO::ARM64_RELOC_POINTER_TO_GOT;
+ IsPCRel = 1;
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+ return;
+ } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+ Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
+ // Otherwise, neither symbol can be modified.
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "unsupported relocation of modified symbol");
+
+ // We don't support PCrel relocations of differences.
+ if (IsPCRel)
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "unsupported pc-relative relocation of "
+ "difference");
+
+ // ARM64 always uses external relocations. If there is no symbol to use as
+ // a base address (a local symbol with no preceding non-local symbol),
+ // error out.
+ //
+ // FIXME: We should probably just synthesize an external symbol and use
+ // that.
+ if (!A_Base)
+ Asm.getContext().FatalError(
+ Fixup.getLoc(),
+ "unsupported relocation of local symbol '" + A->getName() +
+ "'. Must have non-local symbol earlier in section.");
+ if (!B_Base)
+ Asm.getContext().FatalError(
+ Fixup.getLoc(),
+ "unsupported relocation of local symbol '" + B->getName() +
+ "'. Must have non-local symbol earlier in section.");
+
+ if (A_Base == B_Base && A_Base)
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "unsupported relocation with identical base");
+
+ Value += (A_SD.getFragment() == NULL ? 0 : Writer->getSymbolAddress(
+ &A_SD, Layout)) -
+ (A_Base == NULL || A_Base->getFragment() == NULL
+ ? 0
+ : Writer->getSymbolAddress(A_Base, Layout));
+ Value -= (B_SD.getFragment() == NULL ? 0 : Writer->getSymbolAddress(
+ &B_SD, Layout)) -
+ (B_Base == NULL || B_Base->getFragment() == NULL
+ ? 0
+ : Writer->getSymbolAddress(B_Base, Layout));
+
+ Index = A_Base->getIndex();
+ IsExtern = 1;
+ Type = MachO::ARM64_RELOC_UNSIGNED;
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+
+ Index = B_Base->getIndex();
+ IsExtern = 1;
+ Type = MachO::ARM64_RELOC_SUBTRACTOR;
+ } else { // A + constant
+ const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+ MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+ const MCSymbolData *Base = Asm.getAtom(&SD);
+ const MCSectionMachO &Section = static_cast<const MCSectionMachO &>(
+ Fragment->getParent()->getSection());
+
+ // If the symbol is a variable and we weren't able to get a Base for it
+ // (i.e., it's not in the symbol table associated with a section) resolve
+ // the relocation based on its expansion instead.
+ if (Symbol->isVariable() && !Base) {
+ // If the evaluation is an absolute value, just use that directly
+ // to keep things easy.
+ int64_t Res;
+ if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res;
+ return;
+ }
+
+ // FIXME: Will the Target we already have ever have any data in it
+ // we need to preserve and merge with the new Target? How about
+ // the FixedValue?
+ if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout))
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "unable to resolve variable '" +
+ Symbol->getName() + "'");
+ return RecordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ }
+
+ // Relocations inside debug sections always use local relocations when
+ // possible. This seems to be done because the debugger doesn't fully
+ // understand relocation entries and expects to find values that
+ // have already been fixed up.
+ if (Symbol->isInSection()) {
+ if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+ Base = 0;
+ }
+
+ // ARM64 uses external relocations as much as possible. For debug sections,
+ // and for pointer-sized relocations (.quad), we allow section relocations.
+ // It's code sections that run into trouble.
+ if (Base) {
+ Index = Base->getIndex();
+ IsExtern = 1;
+
+ // Add the local offset, if needed.
+ if (Base != &SD)
+ Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
+ } else if (Symbol->isInSection()) {
+ // Pointer-sized relocations can use a local relocation. Otherwise,
+ // we have to be in a debug info section.
+ if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3)
+ Asm.getContext().FatalError(
+ Fixup.getLoc(),
+ "unsupported relocation of local symbol '" + Symbol->getName() +
+ "'. Must have non-local symbol earlier in section.");
+ // Adjust the relocation to be section-relative.
+ // The index is the section ordinal (1-based).
+ const MCSectionData &SymSD =
+ Asm.getSectionData(SD.getSymbol().getSection());
+ Index = SymSD.getOrdinal() + 1;
+ IsExtern = 0;
+ Value += Writer->getSymbolAddress(&SD, Layout);
+
+ if (IsPCRel)
+ Value -= Writer->getFragmentAddress(Fragment, Layout) +
+ Fixup.getOffset() + (1ULL << Log2Size);
+ } else {
+ // Resolve constant variables.
+ if (SD.getSymbol().isVariable()) {
+ int64_t Res;
+ if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res;
+ return;
+ }
+ }
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ "unsupported relocation of variable '" +
+ Symbol->getName() + "'");
+ }
+ }
+
+ // If the relocation kind is Branch26, Page21, or Pageoff12, any addend
+ // is represented via an Addend relocation, not encoded directly into
+ // the instruction.
+ if ((Type == MachO::ARM64_RELOC_BRANCH26 ||
+ Type == MachO::ARM64_RELOC_PAGE21 ||
+ Type == MachO::ARM64_RELOC_PAGEOFF12) &&
+ Value) {
+ assert((Value & 0xff000000) == 0 && "Added relocation out of range!");
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+
+ // Now set up the Addend relocation.
+ Type = MachO::ARM64_RELOC_ADDEND;
+ Index = Value;
+ IsPCRel = 0;
+ Log2Size = 2;
+ IsExtern = 0;
+
+ // Put zero into the instruction itself. The addend is in the relocation.
+ Value = 0;
+ }
+
+ // If there's any addend left to handle, encode it in the instruction.
+ FixedValue = Value;
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+ (IsExtern << 27) | (Type << 28));
+ Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createARM64MachObjectWriter(raw_ostream &OS,
+ uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return createMachObjectWriter(new ARM64MachObjectWriter(CPUType, CPUSubtype),
+ OS, /*IsLittleEndian=*/true);
+}
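
RecordRelocation packs every entry into the two 32-bit words of a Mach-O relocation_info, repeating the same shift pattern each time. A self-contained sketch of that layout, with field widths inferred from the shifts above (24-bit index, 1-bit pcrel, 2-bit length, 1-bit extern, 4-bit type):

#include <cstdint>

// Pack the second relocation word exactly as the code above does.
static uint32_t packRWord1(uint32_t Index, bool IsPCRel, uint32_t Log2Size,
                           bool IsExtern, uint32_t Type) {
  return (Index << 0) | ((uint32_t)IsPCRel << 24) | (Log2Size << 25) |
         ((uint32_t)IsExtern << 27) | (Type << 28);
}

// Recover the type field (top four bits), e.g. ARM64_RELOC_ADDEND.
static uint32_t relocTypeOf(uint32_t RWord1) { return RWord1 >> 28; }
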
diff --git a/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt
new file mode 100644
index 0000000..f8665bc
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_llvm_library(LLVMARM64Desc
+ ARM64AsmBackend.cpp
+ ARM64ELFObjectWriter.cpp
+ ARM64ELFStreamer.cpp
+ ARM64MCAsmInfo.cpp
+ ARM64MCCodeEmitter.cpp
+ ARM64MCExpr.cpp
+ ARM64MCTargetDesc.cpp
+ ARM64MachObjectWriter.cpp
+)
+add_dependencies(LLVMARM64Desc ARM64CommonTableGen)
+
+# Hack: we need to include 'main' target directory to grab private headers
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt b/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt
new file mode 100644
index 0000000..e4c74d2
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt
@@ -0,0 +1,24 @@
+;===- ./lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ARM64Desc
+parent = ARM64
+required_libraries = ARM64AsmPrinter ARM64Info MC Support
+add_to_library_groups = ARM64
+
diff --git a/lib/Target/ARM64/MCTargetDesc/Makefile b/lib/Target/ARM64/MCTargetDesc/Makefile
new file mode 100644
index 0000000..013cc63
--- /dev/null
+++ b/lib/Target/ARM64/MCTargetDesc/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/ARM64/MCTargetDesc/Makefile --------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARM64Desc
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM64/Makefile b/lib/Target/ARM64/Makefile
new file mode 100644
index 0000000..5f0f307
--- /dev/null
+++ b/lib/Target/ARM64/Makefile
@@ -0,0 +1,25 @@
+##===- lib/Target/ARM64/Makefile ---------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMARM64CodeGen
+TARGET = ARM64
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = ARM64GenRegisterInfo.inc ARM64GenInstrInfo.inc \
+ ARM64GenAsmWriter.inc ARM64GenAsmWriter1.inc \
+ ARM64GenDAGISel.inc \
+ ARM64GenCallingConv.inc ARM64GenAsmMatcher.inc \
+ ARM64GenSubtargetInfo.inc ARM64GenMCCodeEmitter.inc \
+ ARM64GenFastISel.inc ARM64GenDisassemblerTables.inc \
+ ARM64GenMCPseudoLowering.inc
+
+DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp b/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp
new file mode 100644
index 0000000..dec09ed
--- /dev/null
+++ b/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp
@@ -0,0 +1,21 @@
+//===-- ARM64TargetInfo.cpp - ARM64 Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+namespace llvm {
+Target TheARM64Target;
+} // end namespace llvm
+
+extern "C" void LLVMInitializeARM64TargetInfo() {
+ RegisterTarget<Triple::arm64, /*HasJIT=*/true> X(TheARM64Target, "arm64",
+ "ARM64");
+}
diff --git a/lib/Target/ARM64/TargetInfo/CMakeLists.txt b/lib/Target/ARM64/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..a0142c4
--- /dev/null
+++ b/lib/Target/ARM64/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARM64Info
+ ARM64TargetInfo.cpp
+ )
+
+add_dependencies(LLVMARM64Info ARM64CommonTableGen)
diff --git a/lib/Target/ARM64/TargetInfo/LLVMBuild.txt b/lib/Target/ARM64/TargetInfo/LLVMBuild.txt
new file mode 100644
index 0000000..5bea694
--- /dev/null
+++ b/lib/Target/ARM64/TargetInfo/LLVMBuild.txt
@@ -0,0 +1,24 @@
+;===- ./lib/Target/ARM64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ARM64Info
+parent = ARM64
+required_libraries = MC Support
+add_to_library_groups = ARM64
+
diff --git a/lib/Target/ARM64/TargetInfo/Makefile b/lib/Target/ARM64/TargetInfo/Makefile
new file mode 100644
index 0000000..2d5a1a0
--- /dev/null
+++ b/lib/Target/ARM64/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM64/TargetInfo/Makefile ----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARM64Info
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Android.mk b/lib/Target/Android.mk
index 60f5b18..1b43ce4 100644
--- a/lib/Target/Android.mk
+++ b/lib/Target/Android.mk
@@ -1,7 +1,6 @@
LOCAL_PATH:= $(call my-dir)
target_SRC_FILES := \
- Mangler.cpp \
Target.cpp \
TargetIntrinsicInfo.cpp \
TargetJITInfo.cpp \
@@ -26,6 +25,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
LOCAL_SRC_FILES := $(target_SRC_FILES)
@@ -36,3 +36,4 @@ LOCAL_MODULE_TAGS := optional
include $(LLVM_DEVICE_BUILD_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
index 02ac493..06a74d7 100644
--- a/lib/Target/CMakeLists.txt
+++ b/lib/Target/CMakeLists.txt
@@ -1,5 +1,4 @@
add_llvm_library(LLVMTarget
- Mangler.cpp
Target.cpp
TargetIntrinsicInfo.cpp
TargetJITInfo.cpp
@@ -10,6 +9,8 @@ add_llvm_library(LLVMTarget
TargetSubtargetInfo.cpp
)
+list(APPEND LLVM_COMMON_DEPENDS intrinsics_gen)
+
foreach(t ${LLVM_TARGETS_TO_BUILD})
message(STATUS "Targeting ${t}")
add_subdirectory(${t})
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index ddc7a66..afd1f51 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -131,6 +131,7 @@ namespace {
private:
void printLinkageType(GlobalValue::LinkageTypes LT);
void printVisibilityType(GlobalValue::VisibilityTypes VisTypes);
+ void printDLLStorageClassType(GlobalValue::DLLStorageClassTypes DSCType);
void printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM);
void printCallingConv(CallingConv::ID cc);
void printEscapedString(const std::string& str);
@@ -282,10 +283,6 @@ void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) {
Out << "GlobalValue::InternalLinkage"; break;
case GlobalValue::PrivateLinkage:
Out << "GlobalValue::PrivateLinkage"; break;
- case GlobalValue::LinkerPrivateLinkage:
- Out << "GlobalValue::LinkerPrivateLinkage"; break;
- case GlobalValue::LinkerPrivateWeakLinkage:
- Out << "GlobalValue::LinkerPrivateWeakLinkage"; break;
case GlobalValue::AvailableExternallyLinkage:
Out << "GlobalValue::AvailableExternallyLinkage "; break;
case GlobalValue::LinkOnceAnyLinkage:
@@ -300,10 +297,6 @@ void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) {
Out << "GlobalValue::AppendingLinkage"; break;
case GlobalValue::ExternalLinkage:
Out << "GlobalValue::ExternalLinkage"; break;
- case GlobalValue::DLLImportLinkage:
- Out << "GlobalValue::DLLImportLinkage"; break;
- case GlobalValue::DLLExportLinkage:
- Out << "GlobalValue::DLLExportLinkage"; break;
case GlobalValue::ExternalWeakLinkage:
Out << "GlobalValue::ExternalWeakLinkage"; break;
case GlobalValue::CommonLinkage:
@@ -325,6 +318,21 @@ void CppWriter::printVisibilityType(GlobalValue::VisibilityTypes VisType) {
}
}
+void CppWriter::printDLLStorageClassType(
+ GlobalValue::DLLStorageClassTypes DSCType) {
+ switch (DSCType) {
+ case GlobalValue::DefaultStorageClass:
+ Out << "GlobalValue::DefaultStorageClass";
+ break;
+ case GlobalValue::DLLImportStorageClass:
+ Out << "GlobalValue::DLLImportStorageClass";
+ break;
+ case GlobalValue::DLLExportStorageClass:
+ Out << "GlobalValue::DLLExportStorageClass";
+ break;
+ }
+}
+
void CppWriter::printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM) {
switch (TLM) {
case GlobalVariable::NotThreadLocal:
@@ -361,25 +369,25 @@ void CppWriter::printEscapedString(const std::string &Str) {
}
std::string CppWriter::getCppName(Type* Ty) {
- // First, handle the primitive types .. easy
- if (Ty->isPrimitiveType() || Ty->isIntegerTy()) {
- switch (Ty->getTypeID()) {
- case Type::VoidTyID: return "Type::getVoidTy(mod->getContext())";
- case Type::IntegerTyID: {
- unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
- return "IntegerType::get(mod->getContext(), " + utostr(BitWidth) + ")";
- }
- case Type::X86_FP80TyID: return "Type::getX86_FP80Ty(mod->getContext())";
- case Type::FloatTyID: return "Type::getFloatTy(mod->getContext())";
- case Type::DoubleTyID: return "Type::getDoubleTy(mod->getContext())";
- case Type::LabelTyID: return "Type::getLabelTy(mod->getContext())";
- case Type::X86_MMXTyID: return "Type::getX86_MMXTy(mod->getContext())";
- default:
- error("Invalid primitive type");
- break;
- }
- // shouldn't be returned, but make it sensible
+ switch (Ty->getTypeID()) {
+ default:
+ break;
+ case Type::VoidTyID:
return "Type::getVoidTy(mod->getContext())";
+ case Type::IntegerTyID: {
+ unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
+ return "IntegerType::get(mod->getContext(), " + utostr(BitWidth) + ")";
+ }
+ case Type::X86_FP80TyID:
+ return "Type::getX86_FP80Ty(mod->getContext())";
+ case Type::FloatTyID:
+ return "Type::getFloatTy(mod->getContext())";
+ case Type::DoubleTyID:
+ return "Type::getDoubleTy(mod->getContext())";
+ case Type::LabelTyID:
+ return "Type::getLabelTy(mod->getContext())";
+ case Type::X86_MMXTyID:
+ return "Type::getX86_MMXTy(mod->getContext())";
}
// Now, see if we've seen the type before and return that
@@ -491,6 +499,7 @@ void CppWriter::printAttributes(const AttributeSet &PAL,
HANDLE_ATTR(NoUnwind);
HANDLE_ATTR(NoAlias);
HANDLE_ATTR(ByVal);
+ HANDLE_ATTR(InAlloca);
HANDLE_ATTR(Nest);
HANDLE_ATTR(ReadNone);
HANDLE_ATTR(ReadOnly);
@@ -537,7 +546,8 @@ void CppWriter::printAttributes(const AttributeSet &PAL,
void CppWriter::printType(Type* Ty) {
// We don't print definitions for primitive types
- if (Ty->isPrimitiveType() || Ty->isIntegerTy())
+ if (Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() ||
+ Ty->isLabelTy() || Ty->isMetadataTy() || Ty->isVoidTy())
return;
// If we already defined this type, we don't need to define it again.
@@ -1026,6 +1036,13 @@ void CppWriter::printVariableHead(const GlobalVariable *GV) {
Out << ");";
nl(Out);
}
+ if (GV->getDLLStorageClass() != GlobalValue::DefaultStorageClass) {
+ printCppName(GV);
+ Out << "->setDLLStorageClass(";
+ printDLLStorageClassType(GV->getDLLStorageClass());
+ Out << ");";
+ nl(Out);
+ }
if (GV->isThreadLocal()) {
printCppName(GV);
Out << "->setThreadLocalMode(";
@@ -1546,12 +1563,16 @@ void CppWriter::printInstruction(const Instruction *I,
}
case Instruction::AtomicCmpXchg: {
const AtomicCmpXchgInst *cxi = cast<AtomicCmpXchgInst>(I);
- StringRef Ordering = ConvertAtomicOrdering(cxi->getOrdering());
+ StringRef SuccessOrdering =
+ ConvertAtomicOrdering(cxi->getSuccessOrdering());
+ StringRef FailureOrdering =
+ ConvertAtomicOrdering(cxi->getFailureOrdering());
StringRef CrossThread = ConvertAtomicSynchScope(cxi->getSynchScope());
Out << "AtomicCmpXchgInst* " << iName
<< " = new AtomicCmpXchgInst("
<< opNames[0] << ", " << opNames[1] << ", " << opNames[2] << ", "
- << Ordering << ", " << CrossThread << ", " << bbname
+ << SuccessOrdering << ", " << FailureOrdering << ", "
+ << CrossThread << ", " << bbname
<< ");";
nl(Out) << iName << "->setName(\"";
printEscapedString(cxi->getName());
@@ -1744,6 +1765,13 @@ void CppWriter::printFunctionHead(const Function* F) {
Out << ");";
nl(Out);
}
+ if (F->getDLLStorageClass() != GlobalValue::DefaultStorageClass) {
+ printCppName(F);
+ Out << "->setDLLStorageClass(";
+ printDLLStorageClassType(F->getDLLStorageClass());
+ Out << ");";
+ nl(Out);
+ }
if (F->hasGC()) {
printCppName(F);
Out << "->setGC(\"" << F->getGC() << "\");";
@@ -1916,13 +1944,13 @@ void CppWriter::printProgram(const std::string& fname,
Out << "#include <llvm/ADT/SmallVector.h>\n";
Out << "#include <llvm/Analysis/Verifier.h>\n";
- Out << "#include <llvm/Assembly/PrintModulePass.h>\n";
Out << "#include <llvm/IR/BasicBlock.h>\n";
Out << "#include <llvm/IR/CallingConv.h>\n";
Out << "#include <llvm/IR/Constants.h>\n";
Out << "#include <llvm/IR/DerivedTypes.h>\n";
Out << "#include <llvm/IR/Function.h>\n";
Out << "#include <llvm/IR/GlobalVariable.h>\n";
+ Out << "#include <llvm/IR/IRPrintingPasses.h>\n";
Out << "#include <llvm/IR/InlineAsm.h>\n";
Out << "#include <llvm/IR/Instructions.h>\n";
Out << "#include <llvm/IR/LLVMContext.h>\n";
diff --git a/lib/Target/CppBackend/TargetInfo/CMakeLists.txt b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
index f82d72e..d86446f 100644
--- a/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
+++ b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
@@ -1,5 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMCppBackendInfo
CppBackendTargetInfo.cpp
)
diff --git a/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp b/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
index 1ca74a4..096dc73 100644
--- a/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
+++ b/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
@@ -14,9 +14,10 @@ using namespace llvm;
Target llvm::TheCppBackendTarget;
-static unsigned CppBackend_TripleMatchQuality(const std::string &TT) {
- // This class always works, but shouldn't be the default in most cases.
- return 1;
+static bool CppBackend_TripleMatchQuality(Triple::ArchType Arch) {
+ // This backend doesn't correspond to any architecture. It must be explicitly
+ // selected with -march.
+ return false;
}
extern "C" void LLVMInitializeCppBackendTargetInfo() {
diff --git a/lib/Target/CppBackend/TargetInfo/LLVMBuild.txt b/lib/Target/CppBackend/TargetInfo/LLVMBuild.txt
index d4dfc3e..9c186a5 100644
--- a/lib/Target/CppBackend/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/CppBackend/TargetInfo/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = CppBackendInfo
parent = CppBackend
-required_libraries = MC Support Target
+required_libraries = Support
add_to_library_groups = CppBackend
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index ae3c9eb..81b0e56 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -37,8 +37,6 @@ add_llvm_target(HexagonCodeGen
HexagonCopyToCombine.cpp
)
-add_dependencies(LLVMHexagonCodeGen HexagonCommonTableGen intrinsics_gen)
-
add_subdirectory(TargetInfo)
add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 568798c..c1b6d45 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -205,14 +205,6 @@ def : Proc<"hexagonv3", HexagonModel, [ArchV2, ArchV3]>;
def : Proc<"hexagonv4", HexagonModelV4, [ArchV2, ArchV3, ArchV4]>;
def : Proc<"hexagonv5", HexagonModelV4, [ArchV2, ArchV3, ArchV4, ArchV5]>;
-
-// Hexagon Uses the MC printer for assembler output, so make sure the TableGen
-// AsmWriter bits get associated with the correct class.
-def HexagonAsmWriter : AsmWriter {
- string AsmWriterClassName = "InstPrinter";
- bit isMCAsmWriter = 1;
-}
-
//===----------------------------------------------------------------------===//
// Declare the target which we are implementing
//===----------------------------------------------------------------------===//
@@ -220,6 +212,4 @@ def HexagonAsmWriter : AsmWriter {
def Hexagon : Target {
// Pull in Instruction Info:
let InstructionSet = HexagonInstrInfo;
-
- let AssemblyWriters = [HexagonAsmWriter];
}
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index a2e04ba..a588274 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -17,15 +17,14 @@
#include "Hexagon.h"
#include "HexagonAsmPrinter.h"
#include "HexagonMachineFunctionInfo.h"
-#include "HexagonTargetMachine.h"
#include "HexagonSubtarget.h"
-#include "MCTargetDesc/HexagonMCInst.h"
+#include "HexagonTargetMachine.h"
#include "InstPrinter/HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonMCInst.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -34,6 +33,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -49,7 +49,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
@@ -61,17 +60,6 @@ static cl::opt<bool> AlignCalls(
"hexagon-align-calls", cl::Hidden, cl::init(true),
cl::desc("Insert falign after call instruction for Hexagon target"));
-void HexagonAsmPrinter::EmitAlignment(unsigned NumBits,
- const GlobalValue *GV) const {
- // For basic block level alignment, use ".falign".
- if (!GV) {
- OutStreamer.EmitRawText(StringRef("\t.falign"));
- return;
- }
-
- AsmPrinter::EmitAlignment(NumBits, GV);
-}
-
void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(OpNo);
@@ -87,16 +75,9 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
case MachineOperand::MO_MachineBasicBlock:
O << *MO.getMBB()->getSymbol();
return;
- case MachineOperand::MO_JumpTableIndex:
- O << *GetJTISymbol(MO.getIndex());
- // FIXME: PIC relocation model.
- return;
case MachineOperand::MO_ConstantPoolIndex:
O << *GetCPISymbol(MO.getIndex());
return;
- case MachineOperand::MO_ExternalSymbol:
- O << *GetExternalSymbolSymbol(MO.getSymbolName());
- return;
case MachineOperand::MO_GlobalAddress:
// Computing the address of a global symbol, not calling it.
O << *getSymbol(MO.getGlobal());
@@ -186,12 +167,6 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
-void HexagonAsmPrinter::printPredicateOperand(const MachineInstr *MI,
- unsigned OpNo,
- raw_ostream &O) {
- llvm_unreachable("Unimplemented");
-}
-
/// printMachineInstruction -- Print out a single Hexagon MI in Darwin syntax to
/// the current output stream.
@@ -224,7 +199,7 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCI.setPacketEnd(Index == (Size-1));
HexagonLowerToMC(BundleMIs[Index], MCI, *this);
- OutStreamer.EmitInstruction(MCI);
+ EmitToStreamer(OutStreamer, MCI);
}
}
else {
@@ -234,66 +209,12 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCI.setPacketEnd(true);
}
HexagonLowerToMC(MI, MCI, *this);
- OutStreamer.EmitInstruction(MCI);
+ EmitToStreamer(OutStreamer, MCI);
}
return;
}
-/// PrintUnmangledNameSafely - Print out the printable characters in the name.
-/// Don't print things like \n or \0.
-// static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) {
-// for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen();
-// Name != E; ++Name)
-// if (isprint(*Name))
-// OS << *Name;
-// }
-
-
-void HexagonAsmPrinter::printAddrModeBasePlusOffset(const MachineInstr *MI,
- int OpNo, raw_ostream &O) {
- const MachineOperand &MO1 = MI->getOperand(OpNo);
- const MachineOperand &MO2 = MI->getOperand(OpNo+1);
-
- O << HexagonInstPrinter::getRegisterName(MO1.getReg())
- << " + #"
- << MO2.getImm();
-}
-
-
-void HexagonAsmPrinter::printGlobalOperand(const MachineInstr *MI, int OpNo,
- raw_ostream &O) {
- const MachineOperand &MO = MI->getOperand(OpNo);
- assert( (MO.getType() == MachineOperand::MO_GlobalAddress) &&
- "Expecting global address");
-
- O << *getSymbol(MO.getGlobal());
- if (MO.getOffset() != 0) {
- O << " + ";
- O << MO.getOffset();
- }
-}
-
-void HexagonAsmPrinter::printJumpTable(const MachineInstr *MI, int OpNo,
- raw_ostream &O) {
- const MachineOperand &MO = MI->getOperand(OpNo);
- assert( (MO.getType() == MachineOperand::MO_JumpTableIndex) &&
- "Expecting jump table index");
-
- // Hexagon_TODO: Do we need name mangling?
- O << *GetJTISymbol(MO.getIndex());
-}
-
-void HexagonAsmPrinter::printConstantPool(const MachineInstr *MI, int OpNo,
- raw_ostream &O) {
- const MachineOperand &MO = MI->getOperand(OpNo);
- assert( (MO.getType() == MachineOperand::MO_ConstantPoolIndex) &&
- "Expecting constant pool index");
-
- // Hexagon_TODO: Do we need name mangling?
- O << *GetCPISymbol(MO.getIndex());
-}
-
static MCInstPrinter *createHexagonMCInstPrinter(const Target &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.h b/lib/Target/Hexagon/HexagonAsmPrinter.h
index bc2af63..a186dc9 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.h
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -37,8 +37,6 @@ namespace llvm {
bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const;
virtual void EmitInstruction(const MachineInstr *MI);
- virtual void EmitAlignment(unsigned NumBits,
- const GlobalValue *GV = 0) const;
void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -48,116 +46,7 @@ namespace llvm {
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &OS);
- /// printInstruction - This method is automatically generated by tablegen
- /// from the instruction set description. This method returns true if the
- /// machine instruction was sufficiently described to print it, otherwise it
- /// returns false.
- void printInstruction(const MachineInstr *MI, raw_ostream &O);
-
- // void printMachineInstruction(const MachineInstr *MI);
- void printOp(const MachineOperand &MO, raw_ostream &O);
-
- /// printRegister - Print register according to target requirements.
- ///
- void printRegister(const MachineOperand &MO, bool R0AsZero,
- raw_ostream &O) {
- unsigned RegNo = MO.getReg();
- assert(TargetRegisterInfo::isPhysicalRegister(RegNo) && "Not physreg??");
- O << getRegisterName(RegNo);
- }
-
- void printImmOperand(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O) {
- int value = MI->getOperand(OpNo).getImm();
- O << value;
- }
-
- void printNegImmOperand(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O) {
- int value = MI->getOperand(OpNo).getImm();
- O << -value;
- }
-
- void printMEMriOperand(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O) {
- const MachineOperand &MO1 = MI->getOperand(OpNo);
- const MachineOperand &MO2 = MI->getOperand(OpNo+1);
-
- O << getRegisterName(MO1.getReg())
- << " + #"
- << (int) MO2.getImm();
- }
-
- void printFrameIndexOperand(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O) {
- const MachineOperand &MO1 = MI->getOperand(OpNo);
- const MachineOperand &MO2 = MI->getOperand(OpNo+1);
-
- O << getRegisterName(MO1.getReg())
- << ", #"
- << MO2.getImm();
- }
-
- void printBranchOperand(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O) {
- // Branches can take an immediate operand. This is used by the branch
- // selection pass to print $+8, an eight byte displacement from the PC.
- if (MI->getOperand(OpNo).isImm()) {
- O << "$+" << MI->getOperand(OpNo).getImm()*4;
- } else {
- printOp(MI->getOperand(OpNo), O);
- }
- }
-
- void printCallOperand(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O) {
- }
-
- void printAbsAddrOperand(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O) {
- }
-
- void printSymbolHi(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) {
- O << "#HI(";
- if (MI->getOperand(OpNo).isImm()) {
- printImmOperand(MI, OpNo, O);
- }
- else {
- printOp(MI->getOperand(OpNo), O);
- }
- O << ")";
- }
-
- void printSymbolLo(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) {
- O << "#HI(";
- if (MI->getOperand(OpNo).isImm()) {
- printImmOperand(MI, OpNo, O);
- }
- else {
- printOp(MI->getOperand(OpNo), O);
- }
- O << ")";
- }
-
- void printPredicateOperand(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O);
-
-#if 0
- void printModuleLevelGV(const GlobalVariable* GVar, raw_ostream &O);
-#endif
-
- void printAddrModeBasePlusOffset(const MachineInstr *MI, int OpNo,
- raw_ostream &O);
-
- void printGlobalOperand(const MachineInstr *MI, int OpNo, raw_ostream &O);
- void printJumpTable(const MachineInstr *MI, int OpNo, raw_ostream &O);
- void printConstantPool(const MachineInstr *MI, int OpNo, raw_ostream &O);
-
static const char *getRegisterName(unsigned RegNo);
-
-#if 0
- void EmitStartOfAsmFile(Module &M);
-#endif
};
} // end of llvm namespace
diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.h b/lib/Target/Hexagon/HexagonCallingConvLower.h
index 33c8306..70b8b64 100644
--- a/lib/Target/Hexagon/HexagonCallingConvLower.h
+++ b/lib/Target/Hexagon/HexagonCallingConvLower.h
@@ -19,7 +19,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/ValueTypes.h"
//
// Need to handle varargs.
@@ -29,7 +28,7 @@ namespace llvm {
class TargetMachine;
class Hexagon_CCState;
class SDNode;
-
+ struct EVT;
/// Hexagon_CCAssignFn - This function assigns a location for Val, updating
/// State to reflect the change.
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index dc440cb..60c933b 100644
--- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -14,26 +14,25 @@
#define DEBUG_TYPE "hexagon-copy-combine"
#include "llvm/PassSupport.h"
-#include "llvm/ADT/DenseSet.h"
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-
-#include "Hexagon.h"
-#include "HexagonInstrInfo.h"
-#include "HexagonRegisterInfo.h"
-#include "HexagonSubtarget.h"
-#include "HexagonTargetMachine.h"
-#include "HexagonMachineFunctionInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
@@ -286,7 +285,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
// Update the intermediate instruction to with the kill flag.
if (KillingInstr) {
bool Added = KillingInstr->addRegisterKilled(KilledOperand, TRI, true);
- (void)Added; // supress compiler warning
+ (void)Added; // suppress compiler warning
assert(Added && "Must successfully update kill flag");
removeKillInfo(I2, KilledOperand);
}
@@ -301,7 +300,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
MachineBasicBlock::iterator I(I1), End(I2);
// At O3 we got better results (dhrystone) by being more conservative here.
if (!ShouldCombineAggressively)
- End = llvm::next(MachineBasicBlock::iterator(I2));
+ End = std::next(MachineBasicBlock::iterator(I2));
IsImmUseReg = I1->getOperand(1).isImm() || I1->getOperand(1).isGlobal();
unsigned I1UseReg = IsImmUseReg ? 0 : I1->getOperand(1).getReg();
// Track killed operands. If we move across an instruction that kills our
@@ -344,7 +343,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
// Update I1 to set the kill flag. This flag will later be picked up by
// the new COMBINE instruction.
bool Added = I1->addRegisterKilled(KilledOperand, TRI);
- (void)Added; // supress compiler warning
+ (void)Added; // suppress compiler warning
assert(Added && "Must successfully update kill flag");
}
DoInsertAtI1 = false;
@@ -465,7 +464,7 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
/// false if the combine must be inserted at the returned instruction.
MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr *I1,
bool &DoInsertAtI1) {
- MachineBasicBlock::iterator I2 = llvm::next(MachineBasicBlock::iterator(I1));
+ MachineBasicBlock::iterator I2 = std::next(MachineBasicBlock::iterator(I1));
unsigned I1DestReg = I1->getOperand(0).getReg();
for (MachineBasicBlock::iterator End = I1->getParent()->end(); I2 != End;
diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index 240cc95..a79264b 100644
--- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -15,6 +15,8 @@
#include "llvm/ADT/DenseMap.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -22,8 +24,6 @@
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/PassSupport.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include "Hexagon.h"
-#include "HexagonTargetMachine.h"
using namespace llvm;
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 2b04f25..0ea13d4 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -144,14 +144,14 @@ bool HexagonFrameLowering::hasTailCall(MachineBasicBlock &MBB) const {
void HexagonFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ MachineBasicBlock::iterator MBBI = std::prev(MBB.end());
DebugLoc dl = MBBI->getDebugLoc();
//
// Only insert deallocframe if we need to. Also at -O0. See comment
// in emitPrologue above.
//
if (hasFP(MF) || MF.getTarget().getOptLevel() == CodeGenOpt::None) {
- MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ MachineBasicBlock::iterator MBBI = std::prev(MBB.end());
MachineBasicBlock::iterator MBBI_end = MBB.end();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
@@ -170,7 +170,7 @@ void HexagonFrameLowering::emitEpilogue(MachineFunction &MF,
// Check for RESTORE_DEALLOC_RET_JMP_V4 call. Don't emit an extra DEALLOC
// instruction if we encounter it.
MachineBasicBlock::iterator BeforeJMPR =
- MBB.begin() == MBBI ? MBBI : prior(MBBI);
+ MBB.begin() == MBBI ? MBBI : std::prev(MBBI);
if (BeforeJMPR != MBBI &&
BeforeJMPR->getOpcode() == Hexagon::RESTORE_DEALLOC_RET_JMP_V4) {
// Remove the JMPR node.
@@ -190,7 +190,7 @@ void HexagonFrameLowering::emitEpilogue(MachineFunction &MF,
// DEALLOCFRAME instruction after it.
MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
MachineBasicBlock::iterator I =
- Term == MBB.begin() ? MBB.end() : prior(Term);
+ Term == MBB.begin() ? MBB.end() : std::prev(Term);
if (I != MBB.end() &&
I->getOpcode() == Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4)
return;
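
The prior()/llvm::next() replacements in this file and in the Hexagon files below are a mechanical switch to the standard helpers; the only requirement is that <iterator> is reachable. A trivial sketch of the pattern:

#include <iterator>
#include <list>

static void prevNextExample(std::list<int> &L) {
  if (L.empty())
    return;
  std::list<int>::iterator Last = std::prev(L.end());  // was: prior(L.end())
  std::list<int>::iterator After = std::next(Last);    // was: llvm::next(Last)
  (void)After; // == L.end() here
}
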
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 52d5ab2..936fb11 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -28,6 +28,8 @@
#define DEBUG_TYPE "hwloops"
#include "llvm/ADT/SmallSet.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -40,9 +42,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include "Hexagon.h"
-#include "HexagonTargetMachine.h"
-
#include <algorithm>
#include <vector>
@@ -908,10 +907,10 @@ bool HexagonHardwareLoops::isDead(const MachineInstr *MI,
// this instruction is dead: both it (and the phi node) can be removed.
use_nodbg_iterator I = MRI->use_nodbg_begin(Reg);
use_nodbg_iterator End = MRI->use_nodbg_end();
- if (llvm::next(I) != End || !I.getOperand().getParent()->isPHI())
+ if (std::next(I) != End || !I->getParent()->isPHI())
return false;
- MachineInstr *OnePhi = I.getOperand().getParent();
+ MachineInstr *OnePhi = I->getParent();
for (unsigned j = 0, f = OnePhi->getNumOperands(); j != f; ++j) {
const MachineOperand &OPO = OnePhi->getOperand(j);
if (!OPO.isReg() || !OPO.isDef())
@@ -921,8 +920,8 @@ bool HexagonHardwareLoops::isDead(const MachineInstr *MI,
use_nodbg_iterator nextJ;
for (use_nodbg_iterator J = MRI->use_nodbg_begin(OPReg);
J != End; J = nextJ) {
- nextJ = llvm::next(J);
- MachineOperand &Use = J.getOperand();
+ nextJ = std::next(J);
+ MachineOperand &Use = *J;
MachineInstr *UseMI = Use.getParent();
// If the phi node has a user that is not MI, bail...
@@ -955,9 +954,9 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) {
MachineRegisterInfo::use_iterator nextI;
for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
E = MRI->use_end(); I != E; I = nextI) {
- nextI = llvm::next(I); // I is invalidated by the setReg
- MachineOperand &Use = I.getOperand();
- MachineInstr *UseMI = Use.getParent();
+ nextI = std::next(I); // I is invalidated by the setReg
+ MachineOperand &Use = *I;
+ MachineInstr *UseMI = I->getParent();
if (UseMI == MI)
continue;
if (Use.isDebug())
@@ -1163,7 +1162,7 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI,
// Out of order.
unsigned PredR = CmpI->getOperand(0).getReg();
bool FoundBump = false;
- instr_iterator CmpIt = CmpI, NextIt = llvm::next(CmpIt);
+ instr_iterator CmpIt = CmpI, NextIt = std::next(CmpIt);
for (instr_iterator I = NextIt, E = BB->instr_end(); I != E; ++I) {
MachineInstr *In = &*I;
for (unsigned i = 0, n = In->getNumOperands(); i < n; ++i) {
@@ -1177,7 +1176,7 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI,
if (In == BumpI) {
instr_iterator After = BumpI;
instr_iterator From = CmpI;
- BB->splice(llvm::next(After), BB, From);
+ BB->splice(std::next(After), BB, From);
FoundBump = true;
break;
}
@@ -1523,7 +1522,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
if (PB != Latch) {
Tmp2.clear();
bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp2, false);
- (void)NotAnalyzed; // supress compiler warning
+ (void)NotAnalyzed; // suppress compiler warning
assert (!NotAnalyzed && "Should be analyzable!");
if (TB != Header && (Tmp2.empty() || FB != Header))
TII->InsertBranch(*PB, NewPH, 0, EmptyCond, DL);
@@ -1535,7 +1534,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
// Insert an unconditional branch to the header.
TB = FB = 0;
bool LatchNotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Tmp2, false);
- (void)LatchNotAnalyzed; // supress compiler warning
+ (void)LatchNotAnalyzed; // suppress compiler warning
assert (!LatchNotAnalyzed && "Should be analyzable!");
if (!TB && !FB)
TII->InsertBranch(*Latch, Header, 0, EmptyCond, DL);
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 5ae9328..ed8c786 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -16,8 +16,8 @@
#include "HexagonISelLowering.h"
#include "HexagonTargetMachine.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/IR/Intrinsics.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -1639,7 +1639,7 @@ bool HexagonDAGToDAGISel::hasNumUsesBelowThresGA(SDNode *N) const {
}
//===--------------------------------------------------------------------===//
-// Return true if the non GP-relative global address can be folded.
+// Return true if the non-GP-relative global address can be folded.
//===--------------------------------------------------------------------===//
inline bool HexagonDAGToDAGISel::foldGlobalAddress(SDValue &N, SDValue &R) {
return foldGlobalAddressImpl(N, R, false);
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 1374179..92b794d 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -981,6 +981,9 @@ HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
MachineFrameInfo *MFI = MF.getFrameInfo();
MFI->setReturnAddressIsTaken(true);
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
EVT VT = Op.getValueType();
SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 6b97609..21a12de 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -147,7 +147,7 @@ HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
if (isPredicated(Term) && !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond,
false)) {
MachineBasicBlock *NextBB =
- llvm::next(MachineFunction::iterator(&MBB));
+ std::next(MachineFunction::iterator(&MBB));
if (NewTBB == NextBB) {
ReverseBranchCondition(Cond);
RemoveBranch(MBB);
@@ -1539,7 +1539,7 @@ int HexagonInstrInfo::GetDotOldOp(const int opc) const {
assert(0 && "Couldn't change predicate new instruction to its old form.");
}
- if (isNewValueStore(NewOp)) { // Convert into non new-value format
+ if (isNewValueStore(NewOp)) { // Convert into non-new-value format
NewOp = Hexagon::getNonNVStore(NewOp);
if (NewOp < 0)
assert(0 && "Couldn't change new-value store to its old form.");
@@ -1654,7 +1654,7 @@ bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
return false;
// Terminators and labels can't be scheduled around.
- if (MI->getDesc().isTerminator() || MI->isLabel() || MI->isInlineAsm())
+ if (MI->getDesc().isTerminator() || MI->isPosition() || MI->isInlineAsm())
return true;
return false;
@@ -1793,7 +1793,7 @@ bool HexagonInstrInfo::NonExtEquivalentExists (const MachineInstr *MI) const {
return true;
if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) {
- // Check addressing mode and retreive non-ext equivalent instruction.
+ // Check addressing mode and retrieve non-ext equivalent instruction.
switch (getAddrMode(MI)) {
case HexagonII::Absolute :
@@ -1827,7 +1827,7 @@ short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const {
return NonExtOpcode;
if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) {
- // Check addressing mode and retreive non-ext equivalent instruction.
+ // Check addressing mode and retrieve non-ext equivalent instruction.
switch (getAddrMode(MI)) {
case HexagonII::Absolute :
return Hexagon::getBasedWithImmOffset(MI->getOpcode());
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 3f45b8b..5da23cb 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -16,15 +16,17 @@
#include "HexagonRegisterInfo.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "HexagonGenInstrInfo.inc"
namespace llvm {
+struct EVT;
+
class HexagonInstrInfo : public HexagonGenInstrInfo {
virtual void anchor();
const HexagonRegisterInfo RI;
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index 475c23d..a95fb80 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -1016,7 +1016,7 @@ class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
bits<5> src1;
bits<5> src2;
bits<3> Ns; // New-Value Operand
- bits<5> RegOp; // Non New-Value Operand
+ bits<5> RegOp; // Non-New-Value Operand
bits<11> offset;
let isBrTaken = !if(isTaken, "true", "false");
@@ -3198,7 +3198,7 @@ def : Pat<(i64 (cttz (i64 DoubleRegs:$src1))),
// i8 -> i64 loads
-// We need a complexity of 120 here to overide preceeding handling of
+// We need a complexity of 120 here to override preceding handling of
// zextloadi8.
let Predicates = [HasV4T], AddedComplexity = 120 in {
def: Pat <(i64 (extloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
@@ -3220,7 +3220,7 @@ def: Pat <(i64 (sextloadi8 FoldGlobalAddr:$addr)),
(i64 (SXTW (LDrib_abs_V4 FoldGlobalAddr:$addr)))>;
}
// i16 -> i64 loads
-// We need a complexity of 120 here to overide preceeding handling of
+// We need a complexity of 120 here to override preceding handling of
// zextloadi16.
let AddedComplexity = 120 in {
def: Pat <(i64 (extloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
@@ -3248,7 +3248,7 @@ def: Pat <(i64 (sextloadi16 FoldGlobalAddr:$addr)),
Requires<[HasV4T]>;
}
// i32->i64 loads
-// We need a complexity of 120 here to overide preceeding handling of
+// We need a complexity of 120 here to override preceding handling of
// zextloadi32.
let AddedComplexity = 120 in {
def: Pat <(i64 (extloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index bbb2fa4..5e4346d 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -18,9 +18,9 @@
#include "MCTargetDesc/HexagonMCInst.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Target/Mangler.h"
using namespace llvm;
diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
index a59c8c9..d799bdb 100644
--- a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
+++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
@@ -10,8 +10,8 @@
#ifndef HexagonMACHINEFUNCTIONINFO_H
#define HexagonMACHINEFUNCTIONINFO_H
-#include <map>
#include "llvm/CodeGen/MachineFunction.h"
+#include <map>
namespace llvm {
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index c94f081..51318ff 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -108,7 +108,7 @@ bool VLIWResourceModel::reserveResources(SUnit *SU) {
case TargetOpcode::REG_SEQUENCE:
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
- case TargetOpcode::PROLOG_LABEL:
+ case TargetOpcode::CFI_INSTRUCTION:
case TargetOpcode::EH_LABEL:
case TargetOpcode::COPY:
case TargetOpcode::INLINEASM:
@@ -186,6 +186,9 @@ void VLIWMachineScheduler::schedule() {
scheduleMI(SU, IsTopNode);
updateQueues(SU, IsTopNode);
+
+ // Notify the scheduling strategy after updating the DAG.
+ SchedImpl->schedNode(SU, IsTopNode);
}
assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
@@ -266,7 +269,7 @@ void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) {
/// can dispatch per cycle.
///
/// TODO: Also check whether the SU must start a new group.
-bool ConvergingVLIWScheduler::SchedBoundary::checkHazard(SUnit *SU) {
+bool ConvergingVLIWScheduler::VLIWSchedBoundary::checkHazard(SUnit *SU) {
if (HazardRec->isEnabled())
return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
@@ -277,7 +280,7 @@ bool ConvergingVLIWScheduler::SchedBoundary::checkHazard(SUnit *SU) {
return false;
}
-void ConvergingVLIWScheduler::SchedBoundary::releaseNode(SUnit *SU,
+void ConvergingVLIWScheduler::VLIWSchedBoundary::releaseNode(SUnit *SU,
unsigned ReadyCycle) {
if (ReadyCycle < MinReadyCycle)
MinReadyCycle = ReadyCycle;
@@ -292,7 +295,7 @@ void ConvergingVLIWScheduler::SchedBoundary::releaseNode(SUnit *SU,
}
/// Move the boundary of scheduled code by one cycle.
-void ConvergingVLIWScheduler::SchedBoundary::bumpCycle() {
+void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() {
unsigned Width = SchedModel->getIssueWidth();
IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
@@ -318,7 +321,7 @@ void ConvergingVLIWScheduler::SchedBoundary::bumpCycle() {
}
/// Move the boundary of scheduled code by one SUnit.
-void ConvergingVLIWScheduler::SchedBoundary::bumpNode(SUnit *SU) {
+void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpNode(SUnit *SU) {
bool startNewCycle = false;
// Update the reservation table.
@@ -348,7 +351,7 @@ void ConvergingVLIWScheduler::SchedBoundary::bumpNode(SUnit *SU) {
/// Release pending ready nodes in to the available queue. This makes them
/// visible to heuristics.
-void ConvergingVLIWScheduler::SchedBoundary::releasePending() {
+void ConvergingVLIWScheduler::VLIWSchedBoundary::releasePending() {
// If the available queue is empty, it is safe to reset MinReadyCycle.
if (Available.empty())
MinReadyCycle = UINT_MAX;
@@ -376,7 +379,7 @@ void ConvergingVLIWScheduler::SchedBoundary::releasePending() {
}
/// Remove SU from the ready set for this boundary.
-void ConvergingVLIWScheduler::SchedBoundary::removeReady(SUnit *SU) {
+void ConvergingVLIWScheduler::VLIWSchedBoundary::removeReady(SUnit *SU) {
if (Available.isInQueue(SU))
Available.remove(Available.find(SU));
else {
@@ -388,7 +391,7 @@ void ConvergingVLIWScheduler::SchedBoundary::removeReady(SUnit *SU) {
/// If this queue only has one ready candidate, return it. As a side effect,
/// advance the cycle until at least one node is ready. If multiple instructions
/// are ready, return NULL.
-SUnit *ConvergingVLIWScheduler::SchedBoundary::pickOnlyChoice() {
+SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() {
if (CheckPending)
releasePending();
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 8ac333f..300f1c7 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -92,14 +92,14 @@ VLIWResourceModel(const TargetMachine &TM, const TargetSchedModel *SM) :
/// Extend the standard ScheduleDAGMI to provide more context and override the
/// top-level schedule() driver.
-class VLIWMachineScheduler : public ScheduleDAGMI {
+class VLIWMachineScheduler : public ScheduleDAGMILive {
public:
VLIWMachineScheduler(MachineSchedContext *C, MachineSchedStrategy *S):
- ScheduleDAGMI(C, S) {}
+ ScheduleDAGMILive(C, S) {}
/// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
/// time to do some work.
- virtual void schedule();
+ virtual void schedule() override;
/// Perform platform specific DAG postprocessing.
void postprocessDAG();
};
@@ -130,7 +130,7 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
/// Each Scheduling boundary is associated with ready queues. It tracks the
/// current cycle in whichever direction at has moved, and maintains the state
/// of "hazards" and other interlocks at the current cycle.
- struct SchedBoundary {
+ struct VLIWSchedBoundary {
VLIWMachineScheduler *DAG;
const TargetSchedModel *SchedModel;
@@ -152,14 +152,14 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
/// Pending queues extend the ready queues with the same ID and the
/// PendingFlag set.
- SchedBoundary(unsigned ID, const Twine &Name):
+ VLIWSchedBoundary(unsigned ID, const Twine &Name):
DAG(0), SchedModel(0), Available(ID, Name+".A"),
Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P"),
CheckPending(false), HazardRec(0), ResourceModel(0),
CurrCycle(0), IssueCount(0),
MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
- ~SchedBoundary() {
+ ~VLIWSchedBoundary() {
delete ResourceModel;
delete HazardRec;
}
@@ -192,8 +192,8 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
const TargetSchedModel *SchedModel;
// State of the top and bottom scheduled instruction boundaries.
- SchedBoundary Top;
- SchedBoundary Bot;
+ VLIWSchedBoundary Top;
+ VLIWSchedBoundary Bot;
public:
/// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
@@ -206,15 +206,15 @@ public:
ConvergingVLIWScheduler():
DAG(0), SchedModel(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
- virtual void initialize(ScheduleDAGMI *dag);
+ virtual void initialize(ScheduleDAGMI *dag) override;
- virtual SUnit *pickNode(bool &IsTopNode);
+ virtual SUnit *pickNode(bool &IsTopNode) override;
- virtual void schedNode(SUnit *SU, bool IsTopNode);
+ virtual void schedNode(SUnit *SU, bool IsTopNode) override;
- virtual void releaseTopNode(SUnit *SU);
+ virtual void releaseTopNode(SUnit *SU) override;
- virtual void releaseBottomNode(SUnit *SU);
+ virtual void releaseBottomNode(SUnit *SU) override;
unsigned ReportPackets() {
return Top.ResourceModel->getTotalPackets() +
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index f7c4513..3e238bf 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -23,30 +23,28 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "hexagon-nvj"
#include "llvm/PassSupport.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/ScheduleDAGInstrs.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/LiveVariables.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "Hexagon.h"
-#include "HexagonTargetMachine.h"
-#include "HexagonRegisterInfo.h"
-#include "HexagonSubtarget.h"
-#include "HexagonInstrInfo.h"
-#include "HexagonMachineFunctionInfo.h"
-
#include <map>
-
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
STATISTIC(NumNVJGenerated, "Number of New Value Jump Instructions created");
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 1786e9d..9a20dfd 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -14,26 +14,26 @@
#include "HexagonRegisterInfo.h"
#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
#include "HexagonSubtarget.h"
#include "HexagonTargetMachine.h"
-#include "HexagonMachineFunctionInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
index 44234e8..cadcb32 100644
--- a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
+++ b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
@@ -42,6 +42,7 @@ namespace {
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineFunctionAnalysis>();
AU.addPreserved<MachineFunctionAnalysis>();
+ AU.addPreserved("stack-protector");
FunctionPass::getAnalysisUsage(AU);
}
};
@@ -59,18 +60,17 @@ bool HexagonRemoveExtendArgs::runOnFunction(Function &F) {
if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) {
Argument* Arg = AI;
if (!isa<PointerType>(Arg->getType())) {
- for (Instruction::use_iterator UI = Arg->use_begin();
- UI != Arg->use_end();) {
+ for (auto UI = Arg->user_begin(); UI != Arg->user_end();) {
if (isa<SExtInst>(*UI)) {
- Instruction* Use = cast<Instruction>(*UI);
- SExtInst* SI = new SExtInst(Arg, Use->getType());
+ Instruction* I = cast<Instruction>(*UI);
+ SExtInst* SI = new SExtInst(Arg, I->getType());
assert (EVT::getEVT(SI->getType()) ==
- (EVT::getEVT(Use->getType())));
+ (EVT::getEVT(I->getType())));
++UI;
- Use->replaceAllUsesWith(SI);
+ I->replaceAllUsesWith(SI);
Instruction* First = F.getEntryBlock().begin();
SI->insertBefore(First);
- Use->eraseFromParent();
+ I->eraseFromParent();
} else {
++UI;
}
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 5166f8e..5303f44 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -20,8 +20,8 @@
#define DEBUG_TYPE "xfer"
#include "HexagonTargetMachine.h"
-#include "HexagonSubtarget.h"
#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LatencyPriorityQueue.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -33,13 +33,13 @@
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include <map>
using namespace llvm;
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index bb950a0..9ce1fb8 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -71,14 +71,11 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT,
CodeModel::Model CM,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- DL("e-p:32:32:32-"
- "i64:64:64-i32:32:32-i16:16:16-i1:32:32-"
- "f64:64:64-f32:32:32-a0:0-n32") ,
+ DL("e-m:e-p:32:32-i1:32-i64:64-a:0-n32") ,
Subtarget(TT, CPU, FS), InstrInfo(Subtarget), TLInfo(*this),
TSInfo(*this),
FrameLowering(Subtarget),
InstrItins(&Subtarget.getInstrItineraryData()) {
- setMCUseCFI(false);
initAsmInfo();
}
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index 7773cff..c97526e 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -85,9 +85,10 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
return false;
}
-const MCSection *HexagonTargetObjectFile::
-SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
- Mangler *Mang, const TargetMachine &TM) const {
+const MCSection *
+HexagonTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
+ SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
// Handle Small Section classification here.
if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h
index 41f6792..1bd1272 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.h
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -19,7 +19,7 @@ namespace llvm {
const MCSectionELF *SmallDataSection;
const MCSectionELF *SmallBSSSection;
public:
- virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
/// IsGlobalInSmallSection - Return true if this global address should be
/// placed into small data/bss section.
@@ -30,10 +30,9 @@ namespace llvm {
const TargetMachine &TM) const;
bool IsSmallDataEnabled () const;
- const MCSection* SelectSectionForGlobal(const GlobalValue *GV,
- SectionKind Kind,
- Mangler *Mang,
- const TargetMachine &TM) const;
+ const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
+ SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const override;
};
} // namespace llvm
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 41e382d..976ff2b 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -18,35 +18,34 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "packets"
#include "llvm/CodeGen/DFAPacketizer.h"
-#include "llvm/CodeGen/Passes.h"
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
-#include "llvm/CodeGen/LatencyPriorityQueue.h"
-#include "llvm/CodeGen/SchedulerRegistry.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/MathExtras.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/MC/MCInstrItineraries.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "Hexagon.h"
-#include "HexagonTargetMachine.h"
-#include "HexagonRegisterInfo.h"
-#include "HexagonSubtarget.h"
-#include "HexagonMachineFunctionInfo.h"
-
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include <map>
#include <vector>
@@ -238,20 +237,20 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) {
// instruction stream until we find the nearest boundary.
MachineBasicBlock::iterator I = RegionEnd;
for(;I != MBB->begin(); --I, --RemainingCount) {
- if (TII->isSchedulingBoundary(llvm::prior(I), MBB, Fn))
+ if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn))
break;
}
I = MBB->begin();
// Skip empty scheduling regions.
if (I == RegionEnd) {
- RegionEnd = llvm::prior(RegionEnd);
+ RegionEnd = std::prev(RegionEnd);
--RemainingCount;
continue;
}
// Skip regions with one instruction.
- if (I == llvm::prior(RegionEnd)) {
- RegionEnd = llvm::prior(RegionEnd);
+ if (I == std::prev(RegionEnd)) {
+ RegionEnd = std::prev(RegionEnd);
continue;
}
@@ -681,7 +680,7 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
}
}
- // Make sure that for non POST_INC stores:
+ // Make sure that for non-POST_INC stores:
// 1. The only use of reg is DepReg and no other registers.
// This handles V4 base+index registers.
// The following store can not be dot new.
diff --git a/lib/Target/Hexagon/HexagonVarargsCallingConvention.h b/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
index c607b5d..668ca98 100644
--- a/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
+++ b/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
@@ -41,7 +41,7 @@ static bool CC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT,
}
- // Only assign registers for named (non varargs) arguments
+ // Only assign registers for named (non-varargs) arguments
if ( !ForceMem && ((NonVarArgsParams == -1) || (CurrentParam <=
NonVarArgsParams))) {
diff --git a/lib/Target/Hexagon/InstPrinter/CMakeLists.txt b/lib/Target/Hexagon/InstPrinter/CMakeLists.txt
index cb106a8..1ddaf9b 100644
--- a/lib/Target/Hexagon/InstPrinter/CMakeLists.txt
+++ b/lib/Target/Hexagon/InstPrinter/CMakeLists.txt
@@ -1,5 +1,3 @@
add_llvm_library(LLVMHexagonAsmPrinter
HexagonInstPrinter.cpp
)
-
-add_dependencies(LLVMHexagonAsmPrinter HexagonCommonTableGen)
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
index 7c41507..33667f4 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
@@ -16,10 +16,10 @@
#include "Hexagon.h"
#include "HexagonInstPrinter.h"
#include "MCTargetDesc/HexagonMCInst.h"
-#include "llvm/MC/MCInst.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/lib/Target/Hexagon/LLVMBuild.txt b/lib/Target/Hexagon/LLVMBuild.txt
index c6d419a..0cf9a06 100644
--- a/lib/Target/Hexagon/LLVMBuild.txt
+++ b/lib/Target/Hexagon/LLVMBuild.txt
@@ -28,5 +28,5 @@ has_asmprinter = 1
type = Library
name = HexagonCodeGen
parent = Hexagon
-required_libraries = AsmPrinter CodeGen Core HexagonAsmPrinter HexagonDesc HexagonInfo MC SelectionDAG Support Target
+required_libraries = Analysis AsmPrinter CodeGen Core HexagonAsmPrinter HexagonDesc HexagonInfo MC Scalar SelectionDAG Support Target TransformUtils
add_to_library_groups = Hexagon
diff --git a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
index 62b9b60..eeef3ef 100644
--- a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
@@ -3,5 +3,3 @@ add_llvm_library(LLVMHexagonDesc
HexagonMCInst.cpp
HexagonMCTargetDesc.cpp
)
-
-add_dependencies(LLVMHexagonDesc HexagonCommonTableGen)
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index 3f9415b..f1a65c3 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -26,7 +26,6 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(StringRef TT) {
CommentString = "//";
HasLEB128 = true;
- PrivateGlobalPrefix = ".L";
LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
InlineAsmStart = "# InlineAsm Start";
InlineAsmEnd = "# InlineAsm End";
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 2f93a52..7f103d8 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -14,12 +14,12 @@
#include "HexagonMCTargetDesc.h"
#include "HexagonMCAsmInfo.h"
#include "InstPrinter/HexagonInstPrinter.h"
-#include "llvm/MC/MachineLocation.h"
#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
diff --git a/lib/Target/Hexagon/TargetInfo/CMakeLists.txt b/lib/Target/Hexagon/TargetInfo/CMakeLists.txt
index 5b04a30..b9411f6 100644
--- a/lib/Target/Hexagon/TargetInfo/CMakeLists.txt
+++ b/lib/Target/Hexagon/TargetInfo/CMakeLists.txt
@@ -1,8 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/..
- ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMHexagonInfo
HexagonTargetInfo.cpp
)
-
-add_dependencies(LLVMHexagonInfo HexagonCommonTableGen)
diff --git a/lib/Target/Hexagon/TargetInfo/LLVMBuild.txt b/lib/Target/Hexagon/TargetInfo/LLVMBuild.txt
index 7b87be3..095a4b0 100644
--- a/lib/Target/Hexagon/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/Hexagon/TargetInfo/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = HexagonInfo
parent = Hexagon
-required_libraries = MC Support
+required_libraries = Support
add_to_library_groups = Hexagon
diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt
index 98d26bc..13abaf8 100644
--- a/lib/Target/LLVMBuild.txt
+++ b/lib/Target/LLVMBuild.txt
@@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = AArch64 ARM CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore
+subdirectories = AArch64 ARM ARM64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore
; This is a special group whose required libraries are extended (by llvm-build)
; with the best execution engine (the native JIT, if available, or the
diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt
index c9b3c3d..a8f9b52 100644
--- a/lib/Target/MSP430/CMakeLists.txt
+++ b/lib/Target/MSP430/CMakeLists.txt
@@ -23,8 +23,6 @@ add_llvm_target(MSP430CodeGen
MSP430MCInstLower.cpp
)
-add_dependencies(LLVMMSP430CodeGen MSP430CommonTableGen intrinsics_gen)
-
add_subdirectory(InstPrinter)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/MSP430/InstPrinter/CMakeLists.txt b/lib/Target/MSP430/InstPrinter/CMakeLists.txt
index 64ac994..580a9ce 100644
--- a/lib/Target/MSP430/InstPrinter/CMakeLists.txt
+++ b/lib/Target/MSP430/InstPrinter/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMMSP430AsmPrinter
MSP430InstPrinter.cpp
)
-
-add_dependencies(LLVMMSP430AsmPrinter MSP430CommonTableGen)
diff --git a/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt b/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
index adc95c5..0f3ebd3 100644
--- a/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
@@ -2,5 +2,3 @@ add_llvm_library(LLVMMSP430Desc
MSP430MCTargetDesc.cpp
MSP430MCAsmInfo.cpp
)
-
-add_dependencies(LLVMMSP430Desc MSP430CommonTableGen)
diff --git a/lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt b/lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt
index 3319d93..b8f3d02 100644
--- a/lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = MSP430Desc
parent = MSP430
-required_libraries = MC MSP430AsmPrinter MSP430Info Support Target
+required_libraries = MC MSP430AsmPrinter MSP430Info
add_to_library_groups = MSP430
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
index acf2ab8..df1aa1a 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -20,7 +20,6 @@ void MSP430MCAsmInfo::anchor() { }
MSP430MCAsmInfo::MSP430MCAsmInfo(StringRef TT) {
PointerSize = CalleeSaveStackSlotSize = 2;
- PrivateGlobalPrefix = ".L";
CommentString = ";";
AlignmentIsInBytes = false;
diff --git a/lib/Target/MSP430/MSP430.td b/lib/Target/MSP430/MSP430.td
index c6796b3..dfea669 100644
--- a/lib/Target/MSP430/MSP430.td
+++ b/lib/Target/MSP430/MSP430.td
@@ -50,17 +50,11 @@ include "MSP430InstrInfo.td"
def MSP430InstrInfo : InstrInfo;
-def MSP430InstPrinter : AsmWriter {
- string AsmWriterClassName = "InstPrinter";
- bit isMCAsmWriter = 1;
-}
-
//===----------------------------------------------------------------------===//
// Target Declaration
//===----------------------------------------------------------------------===//
def MSP430 : Target {
let InstructionSet = MSP430InstrInfo;
- let AssemblyWriters = [MSP430InstPrinter];
}
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 18311c3..91065d8 100644
--- a/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -18,7 +18,6 @@
#include "MSP430InstrInfo.h"
#include "MSP430MCInstLower.h"
#include "MSP430TargetMachine.h"
-#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -26,6 +25,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
@@ -33,7 +33,6 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/Mangler.h"
using namespace llvm;
namespace {
@@ -99,12 +98,6 @@ void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
return;
}
- case MachineOperand::MO_ExternalSymbol: {
- bool isMemOp = Modifier && !strcmp(Modifier, "mem");
- O << (isMemOp ? '&' : '#');
- O << MAI->getGlobalPrefix() << MO.getSymbolName();
- return;
- }
}
}
@@ -158,7 +151,7 @@ void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
}
// Force static initialization.
diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp
index e504011..ce078a3 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -71,7 +71,7 @@ void MSP430FrameLowering::emitPrologue(MachineFunction &MF) const {
.addReg(MSP430::SPW);
// Mark the FramePtr as live-in in every block except the entry.
- for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+ for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
I != E; ++I)
I->addLiveIn(MSP430::FPW);
@@ -138,7 +138,7 @@ void MSP430FrameLowering::emitEpilogue(MachineFunction &MF,
// Skip the callee-saved pop instructions.
while (MBBI != MBB.begin()) {
- MachineBasicBlock::iterator PI = prior(MBBI);
+ MachineBasicBlock::iterator PI = std::prev(MBBI);
unsigned Opc = PI->getOpcode();
if (Opc != MSP430::POP16r && !PI->isTerminator())
break;
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 745cdf5..fe163d4 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -1047,6 +1047,9 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op,
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
MFI->setReturnAddressIsTaken(true);
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
@@ -1245,8 +1248,7 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,
// Update machine-CFG edges by transferring all successors of the current
// block to the block containing instructions after shift.
- RemBB->splice(RemBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
+ RemBB->splice(RemBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
BB->end());
RemBB->transferSuccessorsAndUpdatePHIs(BB);
@@ -1341,8 +1343,7 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Update machine-CFG edges by transferring all successors of the current
// block to the new block which will contain the Phi node for the select.
copy1MBB->splice(copy1MBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
copy1MBB->transferSuccessorsAndUpdatePHIs(BB);
// Next, add the true and fallthrough blocks as its successors.
BB->addSuccessor(copy0MBB);
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index 7a0b00a..38f73b9 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -205,8 +205,8 @@ bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
}
// If the block has any instructions after a JMP, delete them.
- while (llvm::next(I) != MBB.end())
- llvm::next(I)->eraseFromParent();
+ while (std::next(I) != MBB.end())
+ std::next(I)->eraseFromParent();
Cond.clear();
FBB = 0;
@@ -299,7 +299,7 @@ unsigned MSP430InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
default:
switch (Desc.getOpcode()) {
default: llvm_unreachable("Unknown instruction size!");
- case TargetOpcode::PROLOG_LABEL:
+ case TargetOpcode::CFI_INSTRUCTION:
case TargetOpcode::EH_LABEL:
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp
index 52f9ee5..05352a2 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -17,13 +17,15 @@
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
MCSymbol *MSP430MCInstLower::
@@ -48,8 +50,9 @@ GetExternalSymbolSymbol(const MachineOperand &MO) const {
MCSymbol *MSP430MCInstLower::
GetJumpTableSymbol(const MachineOperand &MO) const {
+ const DataLayout *DL = Printer.TM.getDataLayout();
SmallString<256> Name;
- raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI"
+ raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI"
<< Printer.getFunctionNumber() << '_'
<< MO.getIndex();
@@ -64,8 +67,9 @@ GetJumpTableSymbol(const MachineOperand &MO) const {
MCSymbol *MSP430MCInstLower::
GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+ const DataLayout *DL = Printer.TM.getDataLayout();
SmallString<256> Name;
- raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI"
+ raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "CPI"
<< Printer.getFunctionNumber() << '_'
<< MO.getIndex();
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 1a5e312..f64017e 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -88,8 +88,10 @@ BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(MSP430::CGW);
// Mark frame pointer as reserved if needed.
- if (TFI->hasFP(MF))
+ if (TFI->hasFP(MF)) {
+ Reserved.set(MSP430::FPB);
Reserved.set(MSP430::FPW);
+ }
return Reserved;
}
@@ -142,10 +144,10 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// We need to materialize the offset via add instruction.
unsigned DstReg = MI.getOperand(0).getReg();
if (Offset < 0)
- BuildMI(MBB, llvm::next(II), dl, TII.get(MSP430::SUB16ri), DstReg)
+ BuildMI(MBB, std::next(II), dl, TII.get(MSP430::SUB16ri), DstReg)
.addReg(DstReg).addImm(-Offset);
else
- BuildMI(MBB, llvm::next(II), dl, TII.get(MSP430::ADD16ri), DstReg)
+ BuildMI(MBB, std::next(II), dl, TII.get(MSP430::ADD16ri), DstReg)
.addReg(DstReg).addImm(Offset);
return;
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 6710a09..98a6003 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -34,7 +34,7 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T,
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS),
// FIXME: Check DataLayout string.
- DL("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"),
+ DL("e-m:e-p:16:16-i32:16:32-n8:16"),
InstrInfo(*this), TLInfo(*this), TSInfo(*this),
FrameLowering(Subtarget) {
initAsmInfo();
diff --git a/lib/Target/MSP430/TargetInfo/CMakeLists.txt b/lib/Target/MSP430/TargetInfo/CMakeLists.txt
index f6b40ea..fee5f43 100644
--- a/lib/Target/MSP430/TargetInfo/CMakeLists.txt
+++ b/lib/Target/MSP430/TargetInfo/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMMSP430Info
MSP430TargetInfo.cpp
)
-
-add_dependencies(LLVMMSP430Info MSP430CommonTableGen)
diff --git a/lib/Target/MSP430/TargetInfo/LLVMBuild.txt b/lib/Target/MSP430/TargetInfo/LLVMBuild.txt
index deafc2d..ee41ae4 100644
--- a/lib/Target/MSP430/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/MSP430/TargetInfo/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = MSP430Info
parent = MSP430
-required_libraries = MC Support Target
+required_libraries = Support
add_to_library_groups = MSP430
diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp
deleted file mode 100644
index 38be25c..0000000
--- a/lib/Target/Mangler.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-//===-- Mangler.cpp - Self-contained c/asm llvm name mangler --------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Unified name mangler for assembly backends.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Target/Mangler.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-/// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
-/// and the specified name as the global variable name. GVName must not be
-/// empty.
-void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
- const Twine &GVName, ManglerPrefixTy PrefixTy,
- bool UseGlobalPrefix) {
- SmallString<256> TmpData;
- StringRef Name = GVName.toStringRef(TmpData);
- assert(!Name.empty() && "getNameWithPrefix requires non-empty name");
-
- const MCAsmInfo *MAI = TM->getMCAsmInfo();
-
- // If the global name is not led with \1, add the appropriate prefixes.
- if (Name[0] == '\1') {
- Name = Name.substr(1);
- } else {
- if (PrefixTy == Mangler::Private) {
- const char *Prefix = MAI->getPrivateGlobalPrefix();
- OutName.append(Prefix, Prefix+strlen(Prefix));
- } else if (PrefixTy == Mangler::LinkerPrivate) {
- const char *Prefix = MAI->getLinkerPrivateGlobalPrefix();
- OutName.append(Prefix, Prefix+strlen(Prefix));
- }
-
- if (UseGlobalPrefix) {
- const char *Prefix = MAI->getGlobalPrefix();
- if (Prefix[0] == 0)
- ; // Common noop, no prefix.
- else if (Prefix[1] == 0)
- OutName.push_back(Prefix[0]); // Common, one character prefix.
- else
- // Arbitrary length prefix.
- OutName.append(Prefix, Prefix+strlen(Prefix));
- }
- }
-
- // If this is a simple string that doesn't need escaping, just append it.
- OutName.append(Name.begin(), Name.end());
-}
-
-/// AddFastCallStdCallSuffix - Microsoft fastcall and stdcall functions require
-/// a suffix on their name indicating the number of words of arguments they
-/// take.
-static void AddFastCallStdCallSuffix(SmallVectorImpl<char> &OutName,
- const Function *F, const DataLayout &TD) {
- // Calculate arguments size total.
- unsigned ArgWords = 0;
- for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
- AI != AE; ++AI) {
- Type *Ty = AI->getType();
- // 'Dereference' type in case of byval parameter attribute
- if (AI->hasByValAttr())
- Ty = cast<PointerType>(Ty)->getElementType();
- // Size should be aligned to DWORD boundary
- ArgWords += ((TD.getTypeAllocSize(Ty) + 3)/4)*4;
- }
-
- raw_svector_ostream(OutName) << '@' << ArgWords;
-}
-
-
-/// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
-/// and the specified global variable's name. If the global variable doesn't
-/// have a name, this fills in a unique name for the global.
-void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
- const GlobalValue *GV, bool isImplicitlyPrivate,
- bool UseGlobalPrefix) {
- ManglerPrefixTy PrefixTy = Mangler::Default;
- if (GV->hasPrivateLinkage() || isImplicitlyPrivate)
- PrefixTy = Mangler::Private;
- else if (GV->hasLinkerPrivateLinkage() || GV->hasLinkerPrivateWeakLinkage())
- PrefixTy = Mangler::LinkerPrivate;
-
- // If this global has a name, handle it simply.
- if (GV->hasName()) {
- StringRef Name = GV->getName();
- getNameWithPrefix(OutName, Name, PrefixTy, UseGlobalPrefix);
- // No need to do anything else if the global has the special "do not mangle"
- // flag in the name.
- if (Name[0] == 1)
- return;
- } else {
- // Get the ID for the global, assigning a new one if we haven't got one
- // already.
- unsigned &ID = AnonGlobalIDs[GV];
- if (ID == 0) ID = NextAnonGlobalID++;
-
- // Must mangle the global into a unique ID.
- getNameWithPrefix(OutName, "__unnamed_" + Twine(ID), PrefixTy,
- UseGlobalPrefix);
- }
-
- // If we are supposed to add a microsoft-style suffix for stdcall/fastcall,
- // add it.
- if (TM->getMCAsmInfo()->hasMicrosoftFastStdCallMangling()) {
- if (const Function *F = dyn_cast<Function>(GV)) {
- CallingConv::ID CC = F->getCallingConv();
-
- // fastcall functions need to start with @.
- // FIXME: This logic seems unlikely to be right.
- if (CC == CallingConv::X86_FastCall) {
- if (OutName[0] == '_')
- OutName[0] = '@';
- else
- OutName.insert(OutName.begin(), '@');
- }
-
- // fastcall and stdcall functions usually need @42 at the end to specify
- // the argument info.
- FunctionType *FT = F->getFunctionType();
- if ((CC == CallingConv::X86_FastCall || CC == CallingConv::X86_StdCall) &&
- // "Pure" variadic functions do not receive @0 suffix.
- (!FT->isVarArg() || FT->getNumParams() == 0 ||
- (FT->getNumParams() == 1 && F->hasStructRetAttr())))
- AddFastCallStdCallSuffix(OutName, F, *TM->getDataLayout());
- }
- }
-}
diff --git a/lib/Target/Mips/Android.mk b/lib/Target/Mips/Android.mk
index 4d49583..74b8a3b 100644
--- a/lib/Target/Mips/Android.mk
+++ b/lib/Target/Mips/Android.mk
@@ -14,6 +14,7 @@ mips_codegen_TBLGEN_TABLES := \
mips_codegen_SRC_FILES := \
Mips16FrameLowering.cpp \
Mips16HardFloat.cpp \
+ Mips16HardFloatInfo.cpp \
Mips16ISelDAGToDAG.cpp \
Mips16ISelLowering.cpp \
Mips16InstrInfo.cpp \
@@ -33,6 +34,7 @@ mips_codegen_SRC_FILES := \
MipsMCInstLower.cpp \
MipsModuleISelDAGToDAG.cpp \
MipsOs16.cpp \
+ MipsOptimizePICCall.cpp \
MipsRegisterInfo.cpp \
MipsSEFrameLowering.cpp \
MipsSEISelDAGToDAG.cpp \
@@ -64,6 +66,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device only
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -79,3 +82,4 @@ include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(LLVM_GEN_INTRINSICS_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/Mips/AsmParser/Android.mk b/lib/Target/Mips/AsmParser/Android.mk
index 7d1817d..7d8eec1 100644
--- a/lib/Target/Mips/AsmParser/Android.mk
+++ b/lib/Target/Mips/AsmParser/Android.mk
@@ -39,6 +39,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
#===---------------------------------------------------------------===
# libLLVMMipsAsmParser (target)
#===---------------------------------------------------------------===
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -52,3 +53,4 @@ TBLGEN_TD_DIR := $(LOCAL_PATH)/..
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/Mips/AsmParser/CMakeLists.txt b/lib/Target/Mips/AsmParser/CMakeLists.txt
index 28f5219..f167556 100644
--- a/lib/Target/Mips/AsmParser/CMakeLists.txt
+++ b/lib/Target/Mips/AsmParser/CMakeLists.txt
@@ -1,6 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
add_llvm_library(LLVMMipsAsmParser
MipsAsmParser.cpp
)
-
-add_dependencies(LLVMMipsAsmParser MipsCommonTableGen)
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index cdae6c2..911a119 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -7,21 +7,25 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "MipsRegisterInfo.h"
#include "MipsTargetStreamer.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/ADT/APInt.h"
using namespace llvm;
@@ -54,16 +58,14 @@ private:
namespace {
class MipsAsmParser : public MCTargetAsmParser {
-
MipsTargetStreamer &getTargetStreamer() {
- MCTargetStreamer &TS = Parser.getStreamer().getTargetStreamer();
+ MCTargetStreamer &TS = *Parser.getStreamer().getTargetStreamer();
return static_cast<MipsTargetStreamer &>(TS);
}
MCSubtargetInfo &STI;
MCAsmParser &Parser;
MipsAssemblerOptions Options;
- bool hasConsumedDollar;
#define GET_ASSEMBLER_HEADER
#include "MipsGenAsmMatcher.inc"
@@ -73,8 +75,15 @@ class MipsAsmParser : public MCTargetAsmParser {
MCStreamer &Out, unsigned &ErrorInfo,
bool MatchingInlineAsm);
+ /// Parse a register as used in CFI directives
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+ bool ParseParenSuffix(StringRef Name,
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ bool ParseBracketSuffix(StringRef Name,
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand *> &Operands);
@@ -82,95 +91,36 @@ class MipsAsmParser : public MCTargetAsmParser {
bool ParseDirective(AsmToken DirectiveID);
MipsAsmParser::OperandMatchResultTy
- parseRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands, int RegKind);
-
- MipsAsmParser::OperandMatchResultTy
- parseMSARegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands, int RegKind);
-
- MipsAsmParser::OperandMatchResultTy
- parseMSACtrlRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- int RegKind);
-
- MipsAsmParser::OperandMatchResultTy
parseMemOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
- bool parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- int RegKind);
-
- MipsAsmParser::OperandMatchResultTy
- parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseGPR32(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseGPR64(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseHWRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseAFGR64Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseFGR64Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseFGR32Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseFGRH32Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseACC64DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseLO32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ MipsAsmParser::OperandMatchResultTy MatchAnyRegisterNameWithoutDollar(
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands, StringRef Identifier,
+ SMLoc S);
MipsAsmParser::OperandMatchResultTy
- parseHI32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ MatchAnyRegisterWithoutDollar(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ SMLoc S);
MipsAsmParser::OperandMatchResultTy
- parseCOP2(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ ParseAnyRegister(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
MipsAsmParser::OperandMatchResultTy
- parseMSA128BRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ ParseImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
MipsAsmParser::OperandMatchResultTy
- parseMSA128HRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseMSA128WRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseMSA128DRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy
- parseMSA128CtrlRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ ParseJumpTarget(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
MipsAsmParser::OperandMatchResultTy
parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
MipsAsmParser::OperandMatchResultTy
- parseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ ParseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
- bool searchSymbolAlias(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- unsigned RegKind);
+ bool searchSymbolAlias(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &,
StringRef Mnemonic);
- int tryParseRegister(bool is64BitReg);
-
- bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- bool is64BitReg);
-
bool needsExpansion(MCInst &Inst);
void expandInstruction(MCInst &Inst, SMLoc IDLoc,
@@ -192,9 +142,10 @@ class MipsAsmParser : public MCTargetAsmParser {
const MCExpr *evaluateRelocExpr(const MCExpr *Expr, StringRef RelocStr);
bool isEvaluated(const MCExpr *Expr);
+ bool parseSetFeature(uint64_t Feature);
+ bool parseDirectiveCPSetup();
bool parseDirectiveSet();
- bool parseDirectiveMipsHackStocg();
- bool parseDirectiveMipsHackELFFlags();
+ bool parseDirectiveOption();
bool parseSetAtDirective();
bool parseSetNoAtDirective();
@@ -202,25 +153,34 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseSetNoMacroDirective();
bool parseSetReorderDirective();
bool parseSetNoReorderDirective();
+ bool parseSetNoMips16Directive();
bool parseSetAssignment();
- bool parseDirectiveWord(unsigned Size, SMLoc L);
+ bool parseDataDirective(unsigned Size, SMLoc L);
bool parseDirectiveGpWord();
+ bool parseDirectiveGpDWord();
MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol);
- bool isMips64() const {
- return (STI.getFeatureBits() & Mips::FeatureMips64) != 0;
+ bool isGP64() const {
+ return (STI.getFeatureBits() & Mips::FeatureGP64Bit) != 0;
}
bool isFP64() const {
return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0;
}
+ bool isN32() const { return STI.getFeatureBits() & Mips::FeatureN32; }
bool isN64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
- int matchRegisterName(StringRef Symbol, bool is64BitReg);
+ bool isMicroMips() const {
+ return STI.getFeatureBits() & Mips::FeatureMicroMips;
+ }
+
+ bool parseRegister(unsigned &RegNum);
+
+ bool eatComma(StringRef ErrorStr);
int matchCPURegisterName(StringRef Symbol);
@@ -236,10 +196,10 @@ class MipsAsmParser : public MCTargetAsmParser {
int matchMSA128CtrlRegisterName(StringRef Name);
- int regKindToRegClass(int RegKind);
-
unsigned getReg(int RC, int RegNo);
+ unsigned getGPR(int RegNo);
+
int getATReg();
bool processInstruction(MCInst &Inst, SMLoc IDLoc,
@@ -250,17 +210,39 @@ class MipsAsmParser : public MCTargetAsmParser {
// Example: INSERT.B $w0[n], $1 => 16 > n >= 0
bool validateMSAIndex(int Val, int RegKind);
+ void setFeatureBits(unsigned Feature, StringRef FeatureString) {
+ if (!(STI.getFeatureBits() & Feature)) {
+ setAvailableFeatures(ComputeAvailableFeatures(
+ STI.ToggleFeature(FeatureString)));
+ }
+ }
+
+ void clearFeatureBits(unsigned Feature, StringRef FeatureString) {
+ if (STI.getFeatureBits() & Feature) {
+ setAvailableFeatures(ComputeAvailableFeatures(
+ STI.ToggleFeature(FeatureString)));
+ }
+ }
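A minimal standalone sketch (using an illustrative feature bit, not the real Mips constant) of why these helpers test the bit before calling ToggleFeature: toggling a feature that is already in the requested state would flip it the wrong way, so a repeated ".set" directive must stay a no-op.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Bits = 0;
  const uint64_t FeatureMips16 = 1u << 3; // illustrative bit, not the real value

  // Mirrors setFeatureBits(): only toggle when the feature is currently off.
  auto Set = [&] { if (!(Bits & FeatureMips16)) Bits ^= FeatureMips16; };

  Set();                               // first directive turns the feature on
  Set();                               // repeating it stays a no-op
  assert((Bits & FeatureMips16) != 0);
  return 0;
}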
+
public:
MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
const MCInstrInfo &MII)
- : MCTargetAsmParser(), STI(sti), Parser(parser),
- hasConsumedDollar(false) {
+ : MCTargetAsmParser(), STI(sti), Parser(parser) {
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+
+ // Assert exactly one ABI was chosen.
+ assert((((STI.getFeatureBits() & Mips::FeatureO32) != 0) +
+ ((STI.getFeatureBits() & Mips::FeatureEABI) != 0) +
+ ((STI.getFeatureBits() & Mips::FeatureN32) != 0) +
+ ((STI.getFeatureBits() & Mips::FeatureN64) != 0)) == 1);
}
MCAsmParser &getParser() const { return Parser; }
MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+ /// Warn if RegNo is the current assembler temporary.
+ void WarnIfAssemblerTemporary(int RegNo, SMLoc Loc);
};
}
@@ -269,53 +251,56 @@ namespace {
/// MipsOperand - Instances of this class represent a parsed Mips machine
/// instruction.
class MipsOperand : public MCParsedAsmOperand {
-
public:
- enum RegisterKind {
- Kind_None,
- Kind_GPR32,
- Kind_GPR64,
- Kind_HWRegs,
- Kind_FGR32Regs,
- Kind_FGRH32Regs,
- Kind_FGR64Regs,
- Kind_AFGR64Regs,
- Kind_CCRRegs,
- Kind_FCCRegs,
- Kind_ACC64DSP,
- Kind_LO32DSP,
- Kind_HI32DSP,
- Kind_COP2,
- Kind_MSA128BRegs,
- Kind_MSA128HRegs,
- Kind_MSA128WRegs,
- Kind_MSA128DRegs,
- Kind_MSA128CtrlRegs
+ /// Broad categories of register classes
+ /// The exact class is finalized by the render method.
+ enum RegKind {
+ RegKind_GPR = 1, /// GPR32 and GPR64 (depending on isGP64())
+ RegKind_FGR = 2, /// FGR32, FGR64, AFGR64 (depending on context and
+ /// isFP64())
+ RegKind_FCC = 4, /// FCC
+ RegKind_MSA128 = 8, /// MSA128[BHWD] (makes no difference which)
+ RegKind_MSACtrl = 16, /// MSA control registers
+ RegKind_COP2 = 32, /// COP2
+ RegKind_ACC = 64, /// HI32DSP, LO32DSP, and ACC64DSP (depending on
+ /// context).
+ RegKind_CCR = 128, /// CCR
+ RegKind_HWRegs = 256, /// HWRegs
+
+ /// Potentially any (e.g. $1)
+ RegKind_Numeric = RegKind_GPR | RegKind_FGR | RegKind_FCC | RegKind_MSA128 |
+ RegKind_MSACtrl | RegKind_COP2 | RegKind_ACC |
+ RegKind_CCR | RegKind_HWRegs
};
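A short standalone sketch of how this bitmask is meant to behave, reduced to the GPR and FGR bits for brevity: a bare numeric register keeps every kind bit, a named register keeps just one, and the isXXXAsmReg()/render methods later test individual bits.

#include <cstdio>

// Reduced copy of the enum above, kept to two kinds for illustration.
enum RegKind {
  RegKind_GPR = 1,
  RegKind_FGR = 2,
  RegKind_Numeric = RegKind_GPR | RegKind_FGR // the real enum ORs in every kind
};

int main() {
  unsigned Kind = RegKind_Numeric;                              // parsed a bare "$1"
  std::printf("could be GPR: %d\n", (Kind & RegKind_GPR) != 0); // 1
  std::printf("could be FGR: %d\n", (Kind & RegKind_FGR) != 0); // 1

  Kind = RegKind_FGR;                                           // parsed a named "$f0"
  std::printf("could be GPR: %d\n", (Kind & RegKind_GPR) != 0); // 0
  return 0;
}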
private:
enum KindTy {
- k_CondCode,
- k_CoprocNum,
- k_Immediate,
- k_Memory,
- k_PostIndexRegister,
- k_Register,
- k_PtrReg,
- k_Token,
- k_LSAImm
+ k_Immediate, /// An immediate (possibly involving symbol references)
+ k_Memory, /// Base + Offset Memory Address
+ k_PhysRegister, /// A physical register from the Mips namespace
+ k_RegisterIndex, /// A register index in one or more RegKind.
+ k_Token /// A simple token
} Kind;
- MipsOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+ MipsOperand(KindTy K, MipsAsmParser &Parser)
+ : MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {}
+
+ /// For diagnostics, and checking the assembler temporary
+ MipsAsmParser &AsmParser;
struct Token {
const char *Data;
unsigned Length;
};
- struct RegOp {
- unsigned RegNum;
- RegisterKind Kind;
+ struct PhysRegOp {
+ unsigned Num; /// Register Number
+ };
+
+ struct RegIdxOp {
+ unsigned Index; /// Index into the register class
+ RegKind Kind; /// Bitfield of the kinds it could possibly be
+ const MCRegisterInfo *RegInfo;
};
struct ImmOp {
@@ -323,30 +308,161 @@ private:
};
struct MemOp {
- unsigned Base;
+ MipsOperand *Base;
const MCExpr *Off;
};
union {
struct Token Tok;
- struct RegOp Reg;
+ struct PhysRegOp PhysReg;
+ struct RegIdxOp RegIdx;
struct ImmOp Imm;
struct MemOp Mem;
};
SMLoc StartLoc, EndLoc;
+ /// Internal constructor for register kinds
+ static MipsOperand *CreateReg(unsigned Index, RegKind RegKind,
+ const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+ MipsAsmParser &Parser) {
+ MipsOperand *Op = new MipsOperand(k_RegisterIndex, Parser);
+ Op->RegIdx.Index = Index;
+ Op->RegIdx.RegInfo = RegInfo;
+ Op->RegIdx.Kind = RegKind;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
public:
- void addRegOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::CreateReg(getReg()));
+ /// Coerce the register to GPR32 and return the real register for the current
+ /// target.
+ unsigned getGPR32Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
+ AsmParser.WarnIfAssemblerTemporary(RegIdx.Index, StartLoc);
+ unsigned ClassID = Mips::GPR32RegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
- void addPtrRegOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::CreateReg(getPtrReg()));
+ /// Coerce the register to GPR64 and return the real register for the current
+ /// target.
+ unsigned getGPR64Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
+ unsigned ClassID = Mips::GPR64RegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+ }
+
+private:
+ /// Coerce the register to AFGR64 and return the real register for the current
+ /// target.
+ unsigned getAFGR64Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+ if (RegIdx.Index % 2 != 0)
+ AsmParser.Warning(StartLoc, "Float register should be even.");
+ return RegIdx.RegInfo->getRegClass(Mips::AFGR64RegClassID)
+ .getRegister(RegIdx.Index / 2);
+ }
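The divide-by-two above reflects how 64-bit values occupy even/odd pairs of 32-bit FPU registers on a 32-bit FPU; a tiny sketch of the index arithmetic, assuming the usual MIPS pairing:

#include <cassert>

int main() {
  // "$f4" is single-precision register 4; the AFGR64 register covering the
  // $f4/$f5 pair therefore has index 4 / 2 == 2, which is the division
  // getAFGR64Reg() performs.
  unsigned FgrIndex = 4;
  assert(FgrIndex / 2 == 2);

  // An odd index such as 5 ("$f5") is still accepted but draws the
  // "Float register should be even." warning.
  return 0;
}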
+
+ /// Coerce the register to FGR64 and return the real register for the current
+ /// target.
+ unsigned getFGR64Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+ return RegIdx.RegInfo->getRegClass(Mips::FGR64RegClassID)
+ .getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to FGR32 and return the real register for the current
+ /// target.
+ unsigned getFGR32Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+ return RegIdx.RegInfo->getRegClass(Mips::FGR32RegClassID)
+ .getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to FGRH32 and return the real register for the current
+ /// target.
+ unsigned getFGRH32Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+ return RegIdx.RegInfo->getRegClass(Mips::FGRH32RegClassID)
+ .getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to FCC and return the real register for the current
+ /// target.
+ unsigned getFCCReg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_FCC) && "Invalid access!");
+ return RegIdx.RegInfo->getRegClass(Mips::FCCRegClassID)
+ .getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to MSA128 and return the real register for the current
+ /// target.
+ unsigned getMSA128Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_MSA128) && "Invalid access!");
+ // It doesn't matter which of the MSA128[BHWD] classes we use. They are all
+ // identical
+ unsigned ClassID = Mips::MSA128BRegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to MSACtrl and return the real register for the
+ /// current target.
+ unsigned getMSACtrlReg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_MSACtrl) && "Invalid access!");
+ unsigned ClassID = Mips::MSACtrlRegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to COP2 and return the real register for the
+ /// current target.
+ unsigned getCOP2Reg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_COP2) && "Invalid access!");
+ unsigned ClassID = Mips::COP2RegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to ACC64DSP and return the real register for the
+ /// current target.
+ unsigned getACC64DSPReg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
+ unsigned ClassID = Mips::ACC64DSPRegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to HI32DSP and return the real register for the
+ /// current target.
+ unsigned getHI32DSPReg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
+ unsigned ClassID = Mips::HI32DSPRegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to LO32DSP and return the real register for the
+ /// current target.
+ unsigned getLO32DSPReg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
+ unsigned ClassID = Mips::LO32DSPRegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to CCR and return the real register for the
+ /// current target.
+ unsigned getCCRReg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_CCR) && "Invalid access!");
+ unsigned ClassID = Mips::CCRRegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+ }
+
+ /// Coerce the register to HWRegs and return the real register for the
+ /// current target.
+ unsigned getHWRegsReg() const {
+ assert(isRegIdx() && (RegIdx.Kind & RegKind_HWRegs) && "Invalid access!");
+ unsigned ClassID = Mips::HWRegsRegClassID;
+ return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
}
+public:
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
// Add as immediate when possible. Null MCExpr = 0.
if (Expr == 0)
@@ -357,6 +473,91 @@ public:
Inst.addOperand(MCOperand::CreateExpr(Expr));
}
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ llvm_unreachable("Use a custom parser instead");
+ }
+
+ /// Render the operand to an MCInst as a GPR32
+ /// Asserts if the wrong number of operands are requested, or the operand
+ /// is not a k_RegisterIndex compatible with RegKind_GPR
+ void addGPR32AsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getGPR32Reg()));
+ }
+
+ /// Render the operand to an MCInst as a GPR64
+ /// Asserts if the wrong number of operands are requested, or the operand
+ /// is not a k_RegisterIndex compatible with RegKind_GPR
+ void addGPR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getGPR64Reg()));
+ }
+
+ void addAFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getAFGR64Reg()));
+ }
+
+ void addFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getFGR64Reg()));
+ }
+
+ void addFGR32AsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getFGR32Reg()));
+ }
+
+ void addFGRH32AsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getFGRH32Reg()));
+ }
+
+ void addFCCAsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getFCCReg()));
+ }
+
+ void addMSA128AsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getMSA128Reg()));
+ }
+
+ void addMSACtrlAsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getMSACtrlReg()));
+ }
+
+ void addCOP2AsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getCOP2Reg()));
+ }
+
+ void addACC64DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getACC64DSPReg()));
+ }
+
+ void addHI32DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getHI32DSPReg()));
+ }
+
+ void addLO32DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getLO32DSPReg()));
+ }
+
+ void addCCRAsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getCCRReg()));
+ }
+
+ void addHWRegsAsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getHWRegsReg()));
+ }
+
void addImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCExpr *Expr = getImm();
@@ -366,19 +567,38 @@ public:
void addMemOperands(MCInst &Inst, unsigned N) const {
assert(N == 2 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::CreateReg(getMemBase()));
+ Inst.addOperand(MCOperand::CreateReg(getMemBase()->getGPR32Reg()));
const MCExpr *Expr = getMemOff();
addExpr(Inst, Expr);
}
- bool isReg() const { return Kind == k_Register; }
+ bool isReg() const {
+ // As a special case until we sort out the definition of div/divu, pretend
+ // that $0/$zero are k_PhysRegister so that MCK_ZERO works correctly.
+ if (isGPRAsmReg() && RegIdx.Index == 0)
+ return true;
+
+ return Kind == k_PhysRegister;
+ }
+ bool isRegIdx() const { return Kind == k_RegisterIndex; }
bool isImm() const { return Kind == k_Immediate; }
- bool isToken() const { return Kind == k_Token; }
+ bool isConstantImm() const {
+ return isImm() && dyn_cast<MCConstantExpr>(getImm());
+ }
+ bool isToken() const {
+ // Note: It's not possible to pretend that other operand kinds are tokens.
+ // The matcher emitter checks tokens first.
+ return Kind == k_Token;
+ }
bool isMem() const { return Kind == k_Memory; }
- bool isPtrReg() const { return Kind == k_PtrReg; }
bool isInvNum() const { return Kind == k_Immediate; }
- bool isLSAImm() const { return Kind == k_LSAImm; }
+ bool isLSAImm() const {
+ if (!isConstantImm())
+ return false;
+ int64_t Val = getConstantImm();
+ return 1 <= Val && Val <= 4;
+ }
StringRef getToken() const {
assert(Kind == k_Token && "Invalid access!");
@@ -386,26 +606,27 @@ public:
}
unsigned getReg() const {
- assert((Kind == k_Register) && "Invalid access!");
- return Reg.RegNum;
- }
+ // As a special case until we sort out the definition of div/divu, pretend
+ // that $0/$zero are k_PhysRegister so that MCK_ZERO works correctly.
+ if (Kind == k_RegisterIndex && RegIdx.Index == 0 &&
+ RegIdx.Kind & RegKind_GPR)
+ return getGPR32Reg(); // FIXME: GPR64 too
- unsigned getPtrReg() const {
- assert((Kind == k_PtrReg) && "Invalid access!");
- return Reg.RegNum;
- }
-
- void setRegKind(RegisterKind RegKind) {
- assert((Kind == k_Register || Kind == k_PtrReg) && "Invalid access!");
- Reg.Kind = RegKind;
+ assert(Kind == k_PhysRegister && "Invalid access!");
+ return PhysReg.Num;
}
const MCExpr *getImm() const {
- assert((Kind == k_Immediate || Kind == k_LSAImm) && "Invalid access!");
+ assert((Kind == k_Immediate) && "Invalid access!");
return Imm.Val;
}
- unsigned getMemBase() const {
+ int64_t getConstantImm() const {
+ const MCExpr *Val = getImm();
+ return static_cast<const MCConstantExpr *>(Val)->getValue();
+ }
+
+ MipsOperand *getMemBase() const {
assert((Kind == k_Memory) && "Invalid access!");
return Mem.Base;
}
@@ -415,8 +636,9 @@ public:
return Mem.Off;
}
- static MipsOperand *CreateToken(StringRef Str, SMLoc S) {
- MipsOperand *Op = new MipsOperand(k_Token);
+ static MipsOperand *CreateToken(StringRef Str, SMLoc S,
+ MipsAsmParser &Parser) {
+ MipsOperand *Op = new MipsOperand(k_Token, Parser);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -424,41 +646,75 @@ public:
return Op;
}
- static MipsOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
- MipsOperand *Op = new MipsOperand(k_Register);
- Op->Reg.RegNum = RegNum;
- Op->StartLoc = S;
- Op->EndLoc = E;
- return Op;
+ /// Create a numeric register (e.g. $1). The exact register remains
+ /// unresolved until an instruction successfully matches
+ static MipsOperand *CreateNumericReg(unsigned Index,
+ const MCRegisterInfo *RegInfo, SMLoc S,
+ SMLoc E, MipsAsmParser &Parser) {
+ DEBUG(dbgs() << "CreateNumericReg(" << Index << ", ...)\n");
+ return CreateReg(Index, RegKind_Numeric, RegInfo, S, E, Parser);
}
- static MipsOperand *CreatePtrReg(unsigned RegNum, SMLoc S, SMLoc E) {
- MipsOperand *Op = new MipsOperand(k_PtrReg);
- Op->Reg.RegNum = RegNum;
- Op->StartLoc = S;
- Op->EndLoc = E;
- return Op;
+ /// Create a register that is definitely a GPR.
+ /// This is typically only used for named registers such as $gp.
+ static MipsOperand *CreateGPRReg(unsigned Index,
+ const MCRegisterInfo *RegInfo, SMLoc S,
+ SMLoc E, MipsAsmParser &Parser) {
+ return CreateReg(Index, RegKind_GPR, RegInfo, S, E, Parser);
}
- static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
- MipsOperand *Op = new MipsOperand(k_Immediate);
- Op->Imm.Val = Val;
- Op->StartLoc = S;
- Op->EndLoc = E;
- return Op;
+ /// Create a register that is definitely a FGR.
+ /// This is typically only used for named registers such as $f0.
+ static MipsOperand *CreateFGRReg(unsigned Index,
+ const MCRegisterInfo *RegInfo, SMLoc S,
+ SMLoc E, MipsAsmParser &Parser) {
+ return CreateReg(Index, RegKind_FGR, RegInfo, S, E, Parser);
}
- static MipsOperand *CreateLSAImm(const MCExpr *Val, SMLoc S, SMLoc E) {
- MipsOperand *Op = new MipsOperand(k_LSAImm);
+ /// Create a register that is definitely an FCC.
+ /// This is typically only used for named registers such as $fcc0.
+ static MipsOperand *CreateFCCReg(unsigned Index,
+ const MCRegisterInfo *RegInfo, SMLoc S,
+ SMLoc E, MipsAsmParser &Parser) {
+ return CreateReg(Index, RegKind_FCC, RegInfo, S, E, Parser);
+ }
+
+ /// Create a register that is definitely an ACC.
+ /// This is typically only used for named registers such as $ac0.
+ static MipsOperand *CreateACCReg(unsigned Index,
+ const MCRegisterInfo *RegInfo, SMLoc S,
+ SMLoc E, MipsAsmParser &Parser) {
+ return CreateReg(Index, RegKind_ACC, RegInfo, S, E, Parser);
+ }
+
+ /// Create a register that is definitely an MSA128.
+ /// This is typically only used for named registers such as $w0.
+ static MipsOperand *CreateMSA128Reg(unsigned Index,
+ const MCRegisterInfo *RegInfo, SMLoc S,
+ SMLoc E, MipsAsmParser &Parser) {
+ return CreateReg(Index, RegKind_MSA128, RegInfo, S, E, Parser);
+ }
+
+ /// Create a register that is definitely an MSACtrl.
+ /// This is typically only used for named registers such as $msaaccess.
+ static MipsOperand *CreateMSACtrlReg(unsigned Index,
+ const MCRegisterInfo *RegInfo, SMLoc S,
+ SMLoc E, MipsAsmParser &Parser) {
+ return CreateReg(Index, RegKind_MSACtrl, RegInfo, S, E, Parser);
+ }
+
+ static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E,
+ MipsAsmParser &Parser) {
+ MipsOperand *Op = new MipsOperand(k_Immediate, Parser);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static MipsOperand *CreateMem(unsigned Base, const MCExpr *Off,
- SMLoc S, SMLoc E) {
- MipsOperand *Op = new MipsOperand(k_Memory);
+ static MipsOperand *CreateMem(MipsOperand *Base, const MCExpr *Off, SMLoc S,
+ SMLoc E, MipsAsmParser &Parser) {
+ MipsOperand *Op = new MipsOperand(k_Memory, Parser);
Op->Mem.Base = Base;
Op->Mem.Off = Off;
Op->StartLoc = S;
@@ -466,79 +722,33 @@ public:
return Op;
}
- bool isGPR32Asm() const {
- return Kind == k_Register && Reg.Kind == Kind_GPR32;
+ bool isGPRAsmReg() const {
+ return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index <= 31;
}
- void addRegAsmOperands(MCInst &Inst, unsigned N) const {
- Inst.addOperand(MCOperand::CreateReg(Reg.RegNum));
+ bool isFGRAsmReg() const {
+ // AFGR64 is $0-$15 but we handle this in getAFGR64()
+ return isRegIdx() && RegIdx.Kind & RegKind_FGR && RegIdx.Index <= 31;
}
-
- bool isGPR64Asm() const {
- return Kind == k_Register && Reg.Kind == Kind_GPR64;
+ bool isHWRegsAsmReg() const {
+ return isRegIdx() && RegIdx.Kind & RegKind_HWRegs && RegIdx.Index <= 31;
}
-
- bool isHWRegsAsm() const {
- assert((Kind == k_Register) && "Invalid access!");
- return Reg.Kind == Kind_HWRegs;
+ bool isCCRAsmReg() const {
+ return isRegIdx() && RegIdx.Kind & RegKind_CCR && RegIdx.Index <= 31;
}
-
- bool isCCRAsm() const {
- assert((Kind == k_Register) && "Invalid access!");
- return Reg.Kind == Kind_CCRRegs;
+ bool isFCCAsmReg() const {
+ return isRegIdx() && RegIdx.Kind & RegKind_FCC && RegIdx.Index <= 7;
}
-
- bool isAFGR64Asm() const {
- return Kind == k_Register && Reg.Kind == Kind_AFGR64Regs;
- }
-
- bool isFGR64Asm() const {
- return Kind == k_Register && Reg.Kind == Kind_FGR64Regs;
- }
-
- bool isFGR32Asm() const {
- return (Kind == k_Register) && Reg.Kind == Kind_FGR32Regs;
- }
-
- bool isFGRH32Asm() const {
- return (Kind == k_Register) && Reg.Kind == Kind_FGRH32Regs;
- }
-
- bool isFCCRegsAsm() const {
- return (Kind == k_Register) && Reg.Kind == Kind_FCCRegs;
- }
-
- bool isACC64DSPAsm() const {
- return Kind == k_Register && Reg.Kind == Kind_ACC64DSP;
+ bool isACCAsmReg() const {
+ return isRegIdx() && RegIdx.Kind & RegKind_ACC && RegIdx.Index <= 3;
}
-
- bool isLO32DSPAsm() const {
- return Kind == k_Register && Reg.Kind == Kind_LO32DSP;
- }
-
- bool isHI32DSPAsm() const {
- return Kind == k_Register && Reg.Kind == Kind_HI32DSP;
- }
-
- bool isCOP2Asm() const { return Kind == k_Register && Reg.Kind == Kind_COP2; }
-
- bool isMSA128BAsm() const {
- return Kind == k_Register && Reg.Kind == Kind_MSA128BRegs;
- }
-
- bool isMSA128HAsm() const {
- return Kind == k_Register && Reg.Kind == Kind_MSA128HRegs;
- }
-
- bool isMSA128WAsm() const {
- return Kind == k_Register && Reg.Kind == Kind_MSA128WRegs;
+ bool isCOP2AsmReg() const {
+ return isRegIdx() && RegIdx.Kind & RegKind_COP2 && RegIdx.Index <= 31;
}
-
- bool isMSA128DAsm() const {
- return Kind == k_Register && Reg.Kind == Kind_MSA128DRegs;
+ bool isMSA128AsmReg() const {
+ return isRegIdx() && RegIdx.Kind & RegKind_MSA128 && RegIdx.Index <= 31;
}
-
- bool isMSA128CRAsm() const {
- return Kind == k_Register && Reg.Kind == Kind_MSA128CtrlRegs;
+ bool isMSACtrlAsmReg() const {
+ return isRegIdx() && RegIdx.Kind & RegKind_MSACtrl && RegIdx.Index <= 7;
}
/// getStartLoc - Get the location of the first token of this operand.
@@ -547,7 +757,29 @@ public:
SMLoc getEndLoc() const { return EndLoc; }
virtual void print(raw_ostream &OS) const {
- llvm_unreachable("unimplemented!");
+ switch (Kind) {
+ case k_Immediate:
+ OS << "Imm<";
+ Imm.Val->print(OS);
+ OS << ">";
+ break;
+ case k_Memory:
+ OS << "Mem<";
+ Mem.Base->print(OS);
+ OS << ", ";
+ Mem.Off->print(OS);
+ OS << ">";
+ break;
+ case k_PhysRegister:
+ OS << "PhysReg<" << PhysReg.Num << ">";
+ break;
+ case k_RegisterIndex:
+ OS << "RegIdx<" << RegIdx.Index << ":" << RegIdx.Kind << ">";
+ break;
+ case k_Token:
+ OS << Tok.Data;
+ break;
+ }
}
}; // class MipsOperand
} // namespace
@@ -562,7 +794,57 @@ static const MCInstrDesc &getInstDesc(unsigned Opcode) {
bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+
Inst.setLoc(IDLoc);
+
+ if (MCID.isBranch() || MCID.isCall()) {
+ const unsigned Opcode = Inst.getOpcode();
+ MCOperand Offset;
+
+ switch (Opcode) {
+ default:
+ break;
+ case Mips::BEQ:
+ case Mips::BNE:
+ case Mips::BEQ_MM:
+ case Mips::BNE_MM:
+ assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
+ Offset = Inst.getOperand(2);
+ if (!Offset.isImm())
+ break; // We'll deal with this situation later on when applying fixups.
+ if (!isIntN(isMicroMips() ? 17 : 18, Offset.getImm()))
+ return Error(IDLoc, "branch target out of range");
+ if (OffsetToAlignment(Offset.getImm(), 1LL << (isMicroMips() ? 1 : 2)))
+ return Error(IDLoc, "branch to misaligned address");
+ break;
+ case Mips::BGEZ:
+ case Mips::BGTZ:
+ case Mips::BLEZ:
+ case Mips::BLTZ:
+ case Mips::BGEZAL:
+ case Mips::BLTZAL:
+ case Mips::BC1F:
+ case Mips::BC1T:
+ case Mips::BGEZ_MM:
+ case Mips::BGTZ_MM:
+ case Mips::BLEZ_MM:
+ case Mips::BLTZ_MM:
+ case Mips::BGEZAL_MM:
+ case Mips::BLTZAL_MM:
+ case Mips::BC1F_MM:
+ case Mips::BC1T_MM:
+ assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
+ Offset = Inst.getOperand(1);
+ if (!Offset.isImm())
+ break; // We'll deal with this situation later on when applying fixups.
+ if (!isIntN(isMicroMips() ? 17 : 18, Offset.getImm()))
+ return Error(IDLoc, "branch target out of range");
+ if (OffsetToAlignment(Offset.getImm(), 1LL << (isMicroMips() ? 1 : 2)))
+ return Error(IDLoc, "branch to misaligned address");
+ break;
+ }
+ }
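A standalone illustration of the two checks above, assuming the usual encodings: a classic MIPS branch stores a signed 16-bit word offset, so the byte offset must fit in 18 signed bits and be 4-byte aligned, while microMIPS scales by halfwords (17 bits, 2-byte aligned).

#include <cstdint>
#include <cstdio>

// Stand-in for llvm::isIntN() in this sketch: does V fit in N signed bits?
static bool fitsSignedBits(int64_t V, unsigned N) {
  return V >= -(INT64_C(1) << (N - 1)) && V < (INT64_C(1) << (N - 1));
}

int main() {
  std::printf("%d\n", fitsSignedBits(131068, 18));  // 1: largest forward branch offset
  std::printf("%d\n", fitsSignedBits(-131072, 18)); // 1: largest backward branch offset
  std::printf("%d\n", fitsSignedBits(140000, 18));  // 0: "branch target out of range"
  std::printf("%d\n", 131070 % 4 == 0);             // 0: "branch to misaligned address"
  return 0;
}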
+
if (MCID.hasDelaySlot() && Options.isReorder()) {
// If this instruction has a delay slot and .set reorder is active,
// emit a NOP after it.
@@ -624,6 +906,10 @@ bool MipsAsmParser::needsExpansion(MCInst &Inst) {
case Mips::LoadImm32Reg:
case Mips::LoadAddr32Imm:
case Mips::LoadAddr32Reg:
+ case Mips::SUBi:
+ case Mips::SUBiu:
+ case Mips::DSUBi:
+ case Mips::DSUBiu:
return true;
default:
return false;
@@ -639,6 +925,30 @@ void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
return expandLoadAddressImm(Inst, IDLoc, Instructions);
case Mips::LoadAddr32Reg:
return expandLoadAddressReg(Inst, IDLoc, Instructions);
+ case Mips::SUBi:
+ Instructions.push_back(MCInstBuilder(Mips::ADDi)
+ .addReg(Inst.getOperand(0).getReg())
+ .addReg(Inst.getOperand(1).getReg())
+ .addImm(-Inst.getOperand(2).getImm()));
+ return;
+ case Mips::SUBiu:
+ Instructions.push_back(MCInstBuilder(Mips::ADDiu)
+ .addReg(Inst.getOperand(0).getReg())
+ .addReg(Inst.getOperand(1).getReg())
+ .addImm(-Inst.getOperand(2).getImm()));
+ return;
+ case Mips::DSUBi:
+ Instructions.push_back(MCInstBuilder(Mips::DADDi)
+ .addReg(Inst.getOperand(0).getReg())
+ .addReg(Inst.getOperand(1).getReg())
+ .addImm(-Inst.getOperand(2).getImm()));
+ return;
+ case Mips::DSUBiu:
+ Instructions.push_back(MCInstBuilder(Mips::DADDiu)
+ .addReg(Inst.getOperand(0).getReg())
+ .addReg(Inst.getOperand(1).getReg())
+ .addImm(-Inst.getOperand(2).getImm()));
+ return;
}
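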
}
@@ -772,7 +1082,7 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
const MCExpr *ExprOffset;
unsigned TmpRegNum;
unsigned AtRegNum = getReg(
- (isMips64()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, getATReg());
+ (isGP64()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, getATReg());
// 1st operand is either the source or destination register.
assert(Inst.getOperand(0).isReg() && "expected register operand kind");
unsigned RegOpNum = Inst.getOperand(0).getReg();
@@ -823,7 +1133,7 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
TempInst.addOperand(MCOperand::CreateReg(BaseRegNum));
Instructions.push_back(TempInst);
TempInst.clear();
- // And finaly, create original instruction with low part
+ // And finally, create original instruction with low part
// of offset and new base.
TempInst.setOpcode(Inst.getOpcode());
TempInst.addOperand(MCOperand::CreateReg(RegOpNum));
@@ -861,7 +1171,7 @@ bool MipsAsmParser::MatchAndEmitInstruction(
if (processInstruction(Inst, IDLoc, Instructions))
return true;
for (unsigned i = 0; i < Instructions.size(); i++)
- Out.EmitInstruction(Instructions[i]);
+ Out.EmitInstruction(Instructions[i], STI);
return false;
}
case Match_MissingFeature:
@@ -886,14 +1196,22 @@ bool MipsAsmParser::MatchAndEmitInstruction(
return true;
}
+void MipsAsmParser::WarnIfAssemblerTemporary(int RegIndex, SMLoc Loc) {
+ if ((RegIndex != 0) && ((int)Options.getATRegNum() == RegIndex)) {
+ if (RegIndex == 1)
+ Warning(Loc, "Used $at without \".set noat\"");
+ else
+ Warning(Loc, Twine("Used $") + Twine(RegIndex) + " with \".set at=$" +
+ Twine(RegIndex) + "\"");
+ }
+}
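A reduced sketch of the conditions this warning checks, assuming the defaults: the assembler temporary is $1 ($at) unless ".set at=$n" moves it, and ".set noat" (AT number 0) disables the warning entirely.

#include <cstdio>

// RegIndex is the register the user wrote; ATRegNum mirrors Options.getATRegNum().
static void warnIfAssemblerTemporary(int RegIndex, int ATRegNum) {
  if (RegIndex != 0 && RegIndex == ATRegNum) {
    if (RegIndex == 1)
      std::printf("warning: used $at without \".set noat\"\n");
    else
      std::printf("warning: used $%d with \".set at=$%d\"\n", RegIndex, RegIndex);
  }
}

int main() {
  warnIfAssemblerTemporary(1, 1); // warns: $at is the default temporary
  warnIfAssemblerTemporary(1, 0); // silent: ".set noat" is in effect
  warnIfAssemblerTemporary(4, 4); // warns: ".set at=$4" made $4 the temporary
  return 0;
}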
+
int MipsAsmParser::matchCPURegisterName(StringRef Name) {
int CC;
- if (Name == "at")
- return getATReg();
-
CC = StringSwitch<unsigned>(Name)
.Case("zero", 0)
+ .Case("at", 1)
.Case("a0", 4)
.Case("a1", 5)
.Case("a2", 6)
@@ -910,9 +1228,10 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) {
.Case("s7", 23)
.Case("k0", 26)
.Case("k1", 27)
+ .Case("gp", 28)
.Case("sp", 29)
.Case("fp", 30)
- .Case("gp", 28)
+ .Case("s8", 30)
.Case("ra", 31)
.Case("t0", 8)
.Case("t1", 9)
@@ -926,22 +1245,23 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) {
.Case("t9", 25)
.Default(-1);
- // Although SGI documentation just cuts out t0-t3 for n32/n64,
- // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7
- // We are supporting both cases, so for t0-t3 we'll just push them to t4-t7.
- if (isMips64() && 8 <= CC && CC <= 11)
- CC += 4;
-
- if (CC == -1 && isMips64())
- CC = StringSwitch<unsigned>(Name)
- .Case("a4", 8)
- .Case("a5", 9)
- .Case("a6", 10)
- .Case("a7", 11)
- .Case("kt0", 26)
- .Case("kt1", 27)
- .Case("s8", 30)
- .Default(-1);
+ if (isN32() || isN64()) {
+ // Although SGI documentation just cuts out t0-t3 for n32/n64,
+ // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7
+ // We are supporting both cases, so for t0-t3 we'll just push them to t4-t7.
+ if (8 <= CC && CC <= 11)
+ CC += 4;
+
+ if (CC == -1)
+ CC = StringSwitch<unsigned>(Name)
+ .Case("a4", 8)
+ .Case("a5", 9)
+ .Case("a6", 10)
+ .Case("a7", 11)
+ .Case("kt0", 26)
+ .Case("kt1", 27)
+ .Default(-1);
+ }
return CC;
}
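A hypothetical reduction of the n32/n64 special case above: the names t0-t3 are pushed up by four to the o32 t4-t7 slots, and a4-a7 then take register numbers 8-11.

#include <cstdio>

// Only models the CC += 4 adjustment; the name lookup itself is elided.
static int adjustTRegister(int O32Number, bool IsN32OrN64) {
  if (IsN32OrN64 && O32Number >= 8 && O32Number <= 11)
    return O32Number + 4;
  return O32Number;
}

int main() {
  std::printf("t0 on o32 -> $%d\n", adjustTRegister(8, false)); // $8
  std::printf("t0 on n64 -> $%d\n", adjustTRegister(8, true));  // $12
  return 0;
}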
@@ -1017,59 +1337,6 @@ int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) {
return CC;
}
-int MipsAsmParser::matchRegisterName(StringRef Name, bool is64BitReg) {
-
- int CC;
- CC = matchCPURegisterName(Name);
- if (CC != -1)
- return matchRegisterByNumber(CC, is64BitReg ? Mips::GPR64RegClassID
- : Mips::GPR32RegClassID);
- CC = matchFPURegisterName(Name);
- // TODO: decide about fpu register class
- if (CC != -1)
- return matchRegisterByNumber(CC, isFP64() ? Mips::FGR64RegClassID
- : Mips::FGR32RegClassID);
- return matchMSA128RegisterName(Name);
-}
-
-int MipsAsmParser::regKindToRegClass(int RegKind) {
-
- switch (RegKind) {
- case MipsOperand::Kind_GPR32:
- return Mips::GPR32RegClassID;
- case MipsOperand::Kind_GPR64:
- return Mips::GPR64RegClassID;
- case MipsOperand::Kind_HWRegs:
- return Mips::HWRegsRegClassID;
- case MipsOperand::Kind_FGR32Regs:
- return Mips::FGR32RegClassID;
- case MipsOperand::Kind_FGRH32Regs:
- return Mips::FGRH32RegClassID;
- case MipsOperand::Kind_FGR64Regs:
- return Mips::FGR64RegClassID;
- case MipsOperand::Kind_AFGR64Regs:
- return Mips::AFGR64RegClassID;
- case MipsOperand::Kind_CCRRegs:
- return Mips::CCRRegClassID;
- case MipsOperand::Kind_ACC64DSP:
- return Mips::ACC64DSPRegClassID;
- case MipsOperand::Kind_FCCRegs:
- return Mips::FCCRegClassID;
- case MipsOperand::Kind_MSA128BRegs:
- return Mips::MSA128BRegClassID;
- case MipsOperand::Kind_MSA128HRegs:
- return Mips::MSA128HRegClassID;
- case MipsOperand::Kind_MSA128WRegs:
- return Mips::MSA128WRegClassID;
- case MipsOperand::Kind_MSA128DRegs:
- return Mips::MSA128DRegClassID;
- case MipsOperand::Kind_MSA128CtrlRegs:
- return Mips::MSACtrlRegClassID;
- default:
- return -1;
- }
-}
-
bool MipsAssemblerOptions::setATReg(unsigned Reg) {
if (Reg > 31)
return false;
@@ -1078,53 +1345,35 @@ bool MipsAssemblerOptions::setATReg(unsigned Reg) {
return true;
}
-int MipsAsmParser::getATReg() { return Options.getATRegNum(); }
+int MipsAsmParser::getATReg() {
+ int AT = Options.getATRegNum();
+ if (AT == 0)
+ TokError("Pseudo instruction requires $at, which is not available");
+ return AT;
+}
unsigned MipsAsmParser::getReg(int RC, int RegNo) {
return *(getContext().getRegisterInfo()->getRegClass(RC).begin() + RegNo);
}
+unsigned MipsAsmParser::getGPR(int RegNo) {
+ return getReg(isGP64() ? Mips::GPR64RegClassID : Mips::GPR32RegClassID,
+ RegNo);
+}
+
int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, unsigned RegClass) {
if (RegNum >
- getContext().getRegisterInfo()->getRegClass(RegClass).getNumRegs())
+ getContext().getRegisterInfo()->getRegClass(RegClass).getNumRegs() - 1)
return -1;
return getReg(RegClass, RegNum);
}
-int MipsAsmParser::tryParseRegister(bool is64BitReg) {
- const AsmToken &Tok = Parser.getTok();
- int RegNum = -1;
-
- if (Tok.is(AsmToken::Identifier)) {
- std::string lowerCase = Tok.getString().lower();
- RegNum = matchRegisterName(lowerCase, is64BitReg);
- } else if (Tok.is(AsmToken::Integer))
- RegNum = matchRegisterByNumber(static_cast<unsigned>(Tok.getIntVal()),
- is64BitReg ? Mips::GPR64RegClassID
- : Mips::GPR32RegClassID);
- return RegNum;
-}
-
-bool MipsAsmParser::tryParseRegisterOperand(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands, bool is64BitReg) {
-
- SMLoc S = Parser.getTok().getLoc();
- int RegNo = -1;
-
- RegNo = tryParseRegister(is64BitReg);
- if (RegNo == -1)
- return true;
-
- Operands.push_back(
- MipsOperand::CreateReg(RegNo, S, Parser.getTok().getLoc()));
- Parser.Lex(); // Eat register token.
- return false;
-}
-
bool
MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
StringRef Mnemonic) {
+ DEBUG(dbgs() << "ParseOperand\n");
+
// Check if the current operand has a custom associated parser, if so, try to
// custom parse the operand, or fallback to the general approach.
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -1136,6 +1385,8 @@ MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
if (ResTy == MatchOperand_ParseFail)
return true;
+ DEBUG(dbgs() << ".. Generic Parser\n");
+
switch (getLexer().getKind()) {
default:
Error(Parser.getTok().getLoc(), "unexpected token in operand");
@@ -1143,29 +1394,15 @@ MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
case AsmToken::Dollar: {
// Parse the register.
SMLoc S = Parser.getTok().getLoc();
- Parser.Lex(); // Eat dollar token.
- // Parse the register operand.
- if (!tryParseRegisterOperand(Operands, isMips64())) {
- if (getLexer().is(AsmToken::LParen)) {
- // Check if it is indexed addressing operand.
- Operands.push_back(MipsOperand::CreateToken("(", S));
- Parser.Lex(); // Eat the parenthesis.
- if (getLexer().isNot(AsmToken::Dollar))
- return true;
-
- Parser.Lex(); // Eat the dollar
- if (tryParseRegisterOperand(Operands, isMips64()))
- return true;
- if (!getLexer().is(AsmToken::RParen))
- return true;
-
- S = Parser.getTok().getLoc();
- Operands.push_back(MipsOperand::CreateToken(")", S));
- Parser.Lex();
- }
+ // Almost all registers have been parsed by custom parsers. There is only
+ // one exception to this. $zero (and it's alias $0) will reach this point
+ // for div, divu, and similar instructions because it is not an operand
+ // to the instruction definition but an explicit register. Special case
+ // this situation for now.
+ if (ParseAnyRegister(Operands) != MatchOperand_NoMatch)
return false;
- }
+
// Maybe it is a symbol reference.
StringRef Identifier;
if (Parser.parseIdentifier(Identifier))
@@ -1177,47 +1414,18 @@ MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
const MCExpr *Res =
MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext());
- Operands.push_back(MipsOperand::CreateImm(Res, S, E));
+ Operands.push_back(MipsOperand::CreateImm(Res, S, E, *this));
return false;
}
- case AsmToken::Identifier:
- // For instruction aliases like "bc1f $Label" dedicated parser will
- // eat the '$' sign before failing. So in order to look for appropriate
- // label we must check first if we have already consumed '$'.
- if (hasConsumedDollar) {
- hasConsumedDollar = false;
- SMLoc S = Parser.getTok().getLoc();
- StringRef Identifier;
- if (Parser.parseIdentifier(Identifier))
- return true;
- SMLoc E =
- SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
- MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier);
- // Create a symbol reference.
- const MCExpr *Res =
- MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext());
-
- Operands.push_back(MipsOperand::CreateImm(Res, S, E));
- return false;
- }
- // Look for the existing symbol, we should check if
- // we need to assigne the propper RegisterKind.
- if (searchSymbolAlias(Operands, MipsOperand::Kind_None))
- return false;
// Else drop to expression parsing.
case AsmToken::LParen:
case AsmToken::Minus:
case AsmToken::Plus:
case AsmToken::Integer:
case AsmToken::String: {
- // Quoted label names.
- const MCExpr *IdVal;
- SMLoc S = Parser.getTok().getLoc();
- if (getParser().parseExpression(IdVal))
- return true;
- SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
- Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
- return false;
+ DEBUG(dbgs() << ".. generic integer\n");
+ OperandMatchResultTy ResTy = ParseImm(Operands);
+ return ResTy != MatchOperand_Success;
}
case AsmToken::Percent: {
// It is a symbol reference or constant expression.
@@ -1228,7 +1436,7 @@ MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
- Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
+ Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this));
return false;
} // case AsmToken::Percent
} // switch(getLexer().getKind())
@@ -1240,23 +1448,30 @@ const MCExpr *MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr,
const MCExpr *Res;
// Check the type of the expression.
if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Expr)) {
- // It's a constant, evaluate lo or hi value.
- if (RelocStr == "lo") {
- short Val = MCE->getValue();
- Res = MCConstantExpr::Create(Val, getContext());
- } else if (RelocStr == "hi") {
- int Val = MCE->getValue();
- int LoSign = Val & 0x8000;
- Val = (Val & 0xffff0000) >> 16;
- // Lower part is treated as a signed int, so if it is negative
- // we must add 1 to the hi part to compensate.
- if (LoSign)
- Val++;
- Res = MCConstantExpr::Create(Val, getContext());
- } else {
- llvm_unreachable("Invalid RelocStr value");
+ // It's a constant, evaluate reloc value.
+ int16_t Val;
+ switch (getVariantKind(RelocStr)) {
+ case MCSymbolRefExpr::VK_Mips_ABS_LO:
+ // Get the 1st 16-bits.
+ Val = MCE->getValue() & 0xffff;
+ break;
+ case MCSymbolRefExpr::VK_Mips_ABS_HI:
+ // Get the 2nd 16-bits. Also add 1 if bit 15 is 1, to compensate for low
+ // 16 bits being negative.
+ Val = ((MCE->getValue() + 0x8000) >> 16) & 0xffff;
+ break;
+ case MCSymbolRefExpr::VK_Mips_HIGHER:
+ // Get the 3rd 16-bits.
+ Val = ((MCE->getValue() + 0x80008000LL) >> 32) & 0xffff;
+ break;
+ case MCSymbolRefExpr::VK_Mips_HIGHEST:
+ // Get the 4th 16-bits.
+ Val = ((MCE->getValue() + 0x800080008000LL) >> 48) & 0xffff;
+ break;
+ default:
+ report_fatal_error("Unsupported reloc value!");
}
- return Res;
+ return MCConstantExpr::Create(Val, getContext());
}
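A worked example of the constant folding above, with the value chosen so the low half is negative when sign-extended: the +0x8000 gives %hi an extra 1 so that (%hi << 16) + sign_extend(%lo) reproduces the original constant.

#include <cstdint>
#include <cstdio>

int main() {
  int64_t V = 0x1234F678;

  int16_t Lo = V & 0xffff;                    // 0xF678, i.e. -2440 once sign-extended
  int16_t Hi = ((V + 0x8000) >> 16) & 0xffff; // 0x1235, not 0x1234: carry compensation

  int64_t Rebuilt = ((int64_t)(uint16_t)Hi << 16) + Lo;
  std::printf("0x%llx\n", (unsigned long long)Rebuilt); // 0x1234f678
  return 0;
}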
if (const MCSymbolRefExpr *MSRE = dyn_cast<MCSymbolRefExpr>(Expr)) {
@@ -1268,6 +1483,12 @@ const MCExpr *MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr,
}
if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) {
+ MCSymbolRefExpr::VariantKind VK = getVariantKind(RelocStr);
+
+ // Try to create target expression.
+ if (MipsMCExpr::isSupportedBinaryExpr(VK, BE))
+ return MipsMCExpr::Create(VK, Expr, getContext());
+
const MCExpr *LExp = evaluateRelocExpr(BE->getLHS(), RelocStr);
const MCExpr *RExp = evaluateRelocExpr(BE->getRHS(), RelocStr);
Res = MCBinaryExpr::Create(BE->getOpcode(), LExp, RExp, getContext());
@@ -1298,8 +1519,8 @@ bool MipsAsmParser::isEvaluated(const MCExpr *Expr) {
}
case MCExpr::Unary:
return isEvaluated(cast<MCUnaryExpr>(Expr)->getSubExpr());
- default:
- return false;
+ case MCExpr::Target:
+ return true;
}
return false;
}
@@ -1348,9 +1569,27 @@ bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) {
bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
- StartLoc = Parser.getTok().getLoc();
- RegNo = tryParseRegister(isMips64());
- EndLoc = Parser.getTok().getLoc();
+ SmallVector<MCParsedAsmOperand *, 1> Operands;
+ OperandMatchResultTy ResTy = ParseAnyRegister(Operands);
+ if (ResTy == MatchOperand_Success) {
+ assert(Operands.size() == 1);
+ MipsOperand &Operand = *static_cast<MipsOperand *>(Operands.front());
+ StartLoc = Operand.getStartLoc();
+ EndLoc = Operand.getEndLoc();
+
+ // AFAIK, we only support numeric registers and named GPRs in CFI
+ // directives.
+ // Don't worry about eating tokens before failing. Using an unrecognised
+ // register is a parse error.
+ if (Operand.isGPRAsmReg()) {
+ // Resolve to GPR32 or GPR64 appropriately.
+ RegNo = isGP64() ? Operand.getGPR64Reg() : Operand.getGPR32Reg();
+ }
+
+ return (RegNo == (unsigned)-1);
+ }
+
+ assert(Operands.size() == 0);
return (RegNo == (unsigned)-1);
}
@@ -1384,7 +1623,7 @@ bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
-
+ DEBUG(dbgs() << "parseMemOperand\n");
const MCExpr *IdVal = 0;
SMLoc S;
bool isParenExpr = false;
@@ -1407,7 +1646,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
if (Mnemonic->getToken() == "la") {
SMLoc E =
SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
- Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
+ Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this));
return MatchOperand_Success;
}
if (Tok.is(AsmToken::EndOfStatement)) {
@@ -1415,8 +1654,9 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
// Zero register assumed, add a memory operand with ZERO as its base.
- Operands.push_back(MipsOperand::CreateMem(
- isMips64() ? Mips::ZERO_64 : Mips::ZERO, IdVal, S, E));
+ MipsOperand *Base = MipsOperand::CreateGPRReg(
+ 0, getContext().getRegisterInfo(), S, E, *this);
+ Operands.push_back(MipsOperand::CreateMem(Base, IdVal, S, E, *this));
return MatchOperand_Success;
}
Error(Parser.getTok().getLoc(), "'(' expected");
@@ -1426,8 +1666,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
Parser.Lex(); // Eat the '(' token.
}
- Res = parseRegs(Operands, isMips64() ? (int)MipsOperand::Kind_GPR64
- : (int)MipsOperand::Kind_GPR32);
+ Res = ParseAnyRegister(Operands);
if (Res != MatchOperand_Success)
return Res;
@@ -1445,7 +1684,6 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
// Replace the register operand with the memory operand.
MipsOperand *op = static_cast<MipsOperand *>(Operands.back());
- int RegNo = op->getReg();
// Remove the register from the operands.
Operands.pop_back();
// Add the memory operand.
@@ -1458,599 +1696,193 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
getContext());
}
- Operands.push_back(MipsOperand::CreateMem(RegNo, IdVal, S, E));
- delete op;
+ Operands.push_back(MipsOperand::CreateMem(op, IdVal, S, E, *this));
return MatchOperand_Success;
}
-bool MipsAsmParser::parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- int RegKind) {
- // If the first token is not '$' we have an error.
- if (Parser.getTok().isNot(AsmToken::Dollar))
- return false;
-
- SMLoc S = Parser.getTok().getLoc();
- Parser.Lex();
- AsmToken::TokenKind TkKind = getLexer().getKind();
- int Reg;
+bool MipsAsmParser::searchSymbolAlias(
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- if (TkKind == AsmToken::Integer) {
- Reg = matchRegisterByNumber(Parser.getTok().getIntVal(),
- regKindToRegClass(RegKind));
- if (Reg == -1)
- return false;
- } else if (TkKind == AsmToken::Identifier) {
- if ((Reg = matchCPURegisterName(Parser.getTok().getString().lower())) == -1)
+ MCSymbol *Sym = getContext().LookupSymbol(Parser.getTok().getIdentifier());
+ if (Sym) {
+ SMLoc S = Parser.getTok().getLoc();
+ const MCExpr *Expr;
+ if (Sym->isVariable())
+ Expr = Sym->getVariableValue();
+ else
return false;
- Reg = getReg(regKindToRegClass(RegKind), Reg);
- } else {
- return false;
+ if (Expr->getKind() == MCExpr::SymbolRef) {
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
+ const StringRef DefSymbol = Ref->getSymbol().getName();
+ if (DefSymbol.startswith("$")) {
+ OperandMatchResultTy ResTy =
+ MatchAnyRegisterNameWithoutDollar(Operands, DefSymbol.substr(1), S);
+ if (ResTy == MatchOperand_Success) {
+ Parser.Lex();
+ return true;
+ } else if (ResTy == MatchOperand_ParseFail)
+ llvm_unreachable("Should never ParseFail");
+ return false;
+ }
+ } else if (Expr->getKind() == MCExpr::Constant) {
+ Parser.Lex();
+ const MCConstantExpr *Const = static_cast<const MCConstantExpr *>(Expr);
+ MipsOperand *op =
+ MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc(), *this);
+ Operands.push_back(op);
+ return true;
+ }
}
-
- MipsOperand *Op = MipsOperand::CreatePtrReg(Reg, S, Parser.getTok().getLoc());
- Op->setRegKind((MipsOperand::RegisterKind)RegKind);
- Operands.push_back(Op);
- Parser.Lex();
- return true;
-}
-
-MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- MipsOperand::RegisterKind RegKind =
- isN64() ? MipsOperand::Kind_GPR64 : MipsOperand::Kind_GPR32;
-
- // Parse index register.
- if (!parsePtrReg(Operands, RegKind))
- return MatchOperand_NoMatch;
-
- // Parse '('.
- if (Parser.getTok().isNot(AsmToken::LParen))
- return MatchOperand_NoMatch;
-
- Operands.push_back(MipsOperand::CreateToken("(", getLexer().getLoc()));
- Parser.Lex();
-
- // Parse base register.
- if (!parsePtrReg(Operands, RegKind))
- return MatchOperand_NoMatch;
-
- // Parse ')'.
- if (Parser.getTok().isNot(AsmToken::RParen))
- return MatchOperand_NoMatch;
-
- Operands.push_back(MipsOperand::CreateToken(")", getLexer().getLoc()));
- Parser.Lex();
-
- return MatchOperand_Success;
+ return false;
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- int RegKind) {
- MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
- if (getLexer().getKind() == AsmToken::Identifier && !hasConsumedDollar) {
- if (searchSymbolAlias(Operands, Kind))
- return MatchOperand_Success;
- return MatchOperand_NoMatch;
- }
- SMLoc S = Parser.getTok().getLoc();
- // If the first token is not '$', we have an error.
- if (Parser.getTok().isNot(AsmToken::Dollar) && !hasConsumedDollar)
- return MatchOperand_NoMatch;
- if (!hasConsumedDollar) {
- Parser.Lex(); // Eat the '$'
- hasConsumedDollar = true;
- }
- if (getLexer().getKind() == AsmToken::Identifier) {
- int RegNum = -1;
- std::string RegName = Parser.getTok().getString().lower();
- // Match register by name
- switch (RegKind) {
- case MipsOperand::Kind_GPR32:
- case MipsOperand::Kind_GPR64:
- RegNum = matchCPURegisterName(RegName);
- break;
- case MipsOperand::Kind_AFGR64Regs:
- case MipsOperand::Kind_FGR64Regs:
- case MipsOperand::Kind_FGR32Regs:
- case MipsOperand::Kind_FGRH32Regs:
- RegNum = matchFPURegisterName(RegName);
- if (RegKind == MipsOperand::Kind_AFGR64Regs)
- RegNum /= 2;
- else if (RegKind == MipsOperand::Kind_FGRH32Regs && !isFP64())
- if (RegNum != -1 && RegNum % 2 != 0)
- Warning(S, "Float register should be even.");
- break;
- case MipsOperand::Kind_FCCRegs:
- RegNum = matchFCCRegisterName(RegName);
- break;
- case MipsOperand::Kind_ACC64DSP:
- RegNum = matchACRegisterName(RegName);
- break;
- default:
- break; // No match, value is set to -1.
- }
- // No match found, return _NoMatch to give a chance to other round.
- if (RegNum < 0)
- return MatchOperand_NoMatch;
-
- int RegVal = getReg(regKindToRegClass(Kind), RegNum);
- if (RegVal == -1)
- return MatchOperand_NoMatch;
-
- MipsOperand *Op =
- MipsOperand::CreateReg(RegVal, S, Parser.getTok().getLoc());
- Op->setRegKind(Kind);
- Operands.push_back(Op);
- hasConsumedDollar = false;
- Parser.Lex(); // Eat the register name.
- return MatchOperand_Success;
- } else if (getLexer().getKind() == AsmToken::Integer) {
- unsigned RegNum = Parser.getTok().getIntVal();
- if (Kind == MipsOperand::Kind_HWRegs) {
- if (RegNum != 29)
- return MatchOperand_NoMatch;
- // Only hwreg 29 is supported, found at index 0.
- RegNum = 0;
- }
- int Reg = matchRegisterByNumber(RegNum, regKindToRegClass(Kind));
- if (Reg == -1)
- return MatchOperand_NoMatch;
- MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
- Op->setRegKind(Kind);
- Operands.push_back(Op);
- hasConsumedDollar = false;
- Parser.Lex(); // Eat the register number.
- if ((RegKind == MipsOperand::Kind_GPR32) &&
- (getLexer().is(AsmToken::LParen))) {
- // Check if it is indexed addressing operand.
- Operands.push_back(MipsOperand::CreateToken("(", getLexer().getLoc()));
- Parser.Lex(); // Eat the parenthesis.
- if (parseRegs(Operands, RegKind) != MatchOperand_Success)
- return MatchOperand_NoMatch;
- if (getLexer().isNot(AsmToken::RParen))
- return MatchOperand_NoMatch;
- Operands.push_back(MipsOperand::CreateToken(")", getLexer().getLoc()));
- Parser.Lex();
- }
+MipsAsmParser::MatchAnyRegisterNameWithoutDollar(
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands, StringRef Identifier,
+ SMLoc S) {
+ int Index = matchCPURegisterName(Identifier);
+ if (Index != -1) {
+ Operands.push_back(MipsOperand::CreateGPRReg(
+ Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
return MatchOperand_Success;
}
- return MatchOperand_NoMatch;
-}
-
-bool MipsAsmParser::validateMSAIndex(int Val, int RegKind) {
- MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
- if (Val < 0)
- return false;
-
- switch (Kind) {
- default:
- return false;
- case MipsOperand::Kind_MSA128BRegs:
- return Val < 16;
- case MipsOperand::Kind_MSA128HRegs:
- return Val < 8;
- case MipsOperand::Kind_MSA128WRegs:
- return Val < 4;
- case MipsOperand::Kind_MSA128DRegs:
- return Val < 2;
+ Index = matchFPURegisterName(Identifier);
+ if (Index != -1) {
+ Operands.push_back(MipsOperand::CreateFGRReg(
+ Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
+ return MatchOperand_Success;
}
-}
-MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseMSARegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- int RegKind) {
- MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
- SMLoc S = Parser.getTok().getLoc();
- std::string RegName;
-
- if (Parser.getTok().isNot(AsmToken::Dollar))
- return MatchOperand_NoMatch;
-
- switch (RegKind) {
- default:
- return MatchOperand_ParseFail;
- case MipsOperand::Kind_MSA128BRegs:
- case MipsOperand::Kind_MSA128HRegs:
- case MipsOperand::Kind_MSA128WRegs:
- case MipsOperand::Kind_MSA128DRegs:
- break;
+ Index = matchFCCRegisterName(Identifier);
+ if (Index != -1) {
+ Operands.push_back(MipsOperand::CreateFCCReg(
+ Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
+ return MatchOperand_Success;
}
- Parser.Lex(); // Eat the '$'.
- if (getLexer().getKind() == AsmToken::Identifier)
- RegName = Parser.getTok().getString().lower();
- else
- return MatchOperand_ParseFail;
-
- int RegNum = matchMSA128RegisterName(RegName);
-
- if (RegNum < 0 || RegNum > 31)
- return MatchOperand_ParseFail;
-
- int RegVal = getReg(regKindToRegClass(Kind), RegNum);
- if (RegVal == -1)
- return MatchOperand_ParseFail;
-
- MipsOperand *Op = MipsOperand::CreateReg(RegVal, S, Parser.getTok().getLoc());
- Op->setRegKind(Kind);
- Operands.push_back(Op);
-
- Parser.Lex(); // Eat the register identifier.
-
- // MSA registers may be suffixed with an index in the form of:
- // 1) Immediate expression.
- // 2) General Purpose Register.
- // Examples:
- // 1) copy_s.b $29,$w0[0]
- // 2) sld.b $w0,$w1[$1]
-
- if (Parser.getTok().isNot(AsmToken::LBrac))
+ Index = matchACRegisterName(Identifier);
+ if (Index != -1) {
+ Operands.push_back(MipsOperand::CreateACCReg(
+ Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
return MatchOperand_Success;
+ }
- MipsOperand *Mnemonic = static_cast<MipsOperand *>(Operands[0]);
-
- Operands.push_back(MipsOperand::CreateToken("[", Parser.getTok().getLoc()));
- Parser.Lex(); // Parse the '[' token.
-
- if (Parser.getTok().is(AsmToken::Dollar)) {
- // This must be a GPR.
- MipsOperand *RegOp;
- SMLoc VIdx = Parser.getTok().getLoc();
- Parser.Lex(); // Parse the '$' token.
-
- // GPR have aliases and we must account for that. Example: $30 == $fp
- if (getLexer().getKind() == AsmToken::Integer) {
- unsigned RegNum = Parser.getTok().getIntVal();
- int Reg = matchRegisterByNumber(
- RegNum, regKindToRegClass(MipsOperand::Kind_GPR32));
- if (Reg == -1) {
- Error(VIdx, "invalid general purpose register");
- return MatchOperand_ParseFail;
- }
-
- RegOp = MipsOperand::CreateReg(Reg, VIdx, Parser.getTok().getLoc());
- } else if (getLexer().getKind() == AsmToken::Identifier) {
- int RegNum = -1;
- std::string RegName = Parser.getTok().getString().lower();
-
- RegNum = matchCPURegisterName(RegName);
- if (RegNum == -1) {
- Error(VIdx, "general purpose register expected");
- return MatchOperand_ParseFail;
- }
- RegNum = getReg(regKindToRegClass(MipsOperand::Kind_GPR32), RegNum);
- RegOp = MipsOperand::CreateReg(RegNum, VIdx, Parser.getTok().getLoc());
- } else
- return MatchOperand_ParseFail;
-
- RegOp->setRegKind(MipsOperand::Kind_GPR32);
- Operands.push_back(RegOp);
- Parser.Lex(); // Eat the register identifier.
-
- if (Parser.getTok().isNot(AsmToken::RBrac))
- return MatchOperand_ParseFail;
-
- Operands.push_back(MipsOperand::CreateToken("]", Parser.getTok().getLoc()));
- Parser.Lex(); // Parse the ']' token.
-
+ Index = matchMSA128RegisterName(Identifier);
+ if (Index != -1) {
+ Operands.push_back(MipsOperand::CreateMSA128Reg(
+ Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
return MatchOperand_Success;
}
- // The index must be a constant expression then.
- SMLoc VIdx = Parser.getTok().getLoc();
- const MCExpr *ImmVal;
-
- if (getParser().parseExpression(ImmVal))
- return MatchOperand_ParseFail;
-
- const MCConstantExpr *expr = dyn_cast<MCConstantExpr>(ImmVal);
- if (!expr || !validateMSAIndex((int)expr->getValue(), Kind)) {
- Error(VIdx, "invalid immediate value");
- return MatchOperand_ParseFail;
+ Index = matchMSA128CtrlRegisterName(Identifier);
+ if (Index != -1) {
+ Operands.push_back(MipsOperand::CreateMSACtrlReg(
+ Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
+ return MatchOperand_Success;
}
- SMLoc E = Parser.getTok().getEndLoc();
-
- if (Parser.getTok().isNot(AsmToken::RBrac))
- return MatchOperand_ParseFail;
-
- bool insve =
- Mnemonic->getToken() == "insve.b" || Mnemonic->getToken() == "insve.h" ||
- Mnemonic->getToken() == "insve.w" || Mnemonic->getToken() == "insve.d";
-
- // The second vector index of insve instructions is always 0.
- if (insve && Operands.size() > 6) {
- if (expr->getValue() != 0) {
- Error(VIdx, "immediate value must be 0");
- return MatchOperand_ParseFail;
- }
- Operands.push_back(MipsOperand::CreateToken("0", VIdx));
- } else
- Operands.push_back(MipsOperand::CreateImm(expr, VIdx, E));
-
- Operands.push_back(MipsOperand::CreateToken("]", Parser.getTok().getLoc()));
-
- Parser.Lex(); // Parse the ']' token.
-
- return MatchOperand_Success;
-}
-
-MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseMSACtrlRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- int RegKind) {
- MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
-
- if (Kind != MipsOperand::Kind_MSA128CtrlRegs)
- return MatchOperand_NoMatch;
-
- if (Parser.getTok().isNot(AsmToken::Dollar))
- return MatchOperand_ParseFail;
-
- SMLoc S = Parser.getTok().getLoc();
-
- Parser.Lex(); // Eat the '$' symbol.
-
- int RegNum = -1;
- if (getLexer().getKind() == AsmToken::Identifier)
- RegNum = matchMSA128CtrlRegisterName(Parser.getTok().getString().lower());
- else if (getLexer().getKind() == AsmToken::Integer)
- RegNum = Parser.getTok().getIntVal();
- else
- return MatchOperand_ParseFail;
-
- if (RegNum < 0 || RegNum > 7)
- return MatchOperand_ParseFail;
-
- int RegVal = getReg(regKindToRegClass(Kind), RegNum);
- if (RegVal == -1)
- return MatchOperand_ParseFail;
-
- MipsOperand *RegOp =
- MipsOperand::CreateReg(RegVal, S, Parser.getTok().getLoc());
- RegOp->setRegKind(MipsOperand::Kind_MSA128CtrlRegs);
- Operands.push_back(RegOp);
- Parser.Lex(); // Eat the register identifier.
-
- return MatchOperand_Success;
-}
-
-MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseGPR64(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
-
- if (!isMips64())
- return MatchOperand_NoMatch;
- return parseRegs(Operands, (int)MipsOperand::Kind_GPR64);
+ return MatchOperand_NoMatch;
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseGPR32(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- return parseRegs(Operands, (int)MipsOperand::Kind_GPR32);
-}
-
-MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseAFGR64Regs(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
-
- if (isFP64())
- return MatchOperand_NoMatch;
- return parseRegs(Operands, (int)MipsOperand::Kind_AFGR64Regs);
-}
+MipsAsmParser::MatchAnyRegisterWithoutDollar(
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands, SMLoc S) {
+ auto Token = Parser.getLexer().peekTok(false);
+
+ if (Token.is(AsmToken::Identifier)) {
+ DEBUG(dbgs() << ".. identifier\n");
+ StringRef Identifier = Token.getIdentifier();
+ OperandMatchResultTy ResTy =
+ MatchAnyRegisterNameWithoutDollar(Operands, Identifier, S);
+ return ResTy;
+ } else if (Token.is(AsmToken::Integer)) {
+ DEBUG(dbgs() << ".. integer\n");
+ Operands.push_back(MipsOperand::CreateNumericReg(
+ Token.getIntVal(), getContext().getRegisterInfo(), S, Token.getLoc(),
+ *this));
+ return MatchOperand_Success;
+ }
-MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseFGR64Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- if (!isFP64())
- return MatchOperand_NoMatch;
- return parseRegs(Operands, (int)MipsOperand::Kind_FGR64Regs);
-}
+ DEBUG(dbgs() << Parser.getTok().getKind() << "\n");
-MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseFGR32Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- return parseRegs(Operands, (int)MipsOperand::Kind_FGR32Regs);
+ return MatchOperand_NoMatch;
}
-MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseFGRH32Regs(
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseAnyRegister(
SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- return parseRegs(Operands, (int)MipsOperand::Kind_FGRH32Regs);
-}
-
-MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- return parseRegs(Operands, (int)MipsOperand::Kind_FCCRegs);
-}
+ DEBUG(dbgs() << "ParseAnyRegister\n");
-MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseACC64DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- return parseRegs(Operands, (int)MipsOperand::Kind_ACC64DSP);
-}
+ auto Token = Parser.getTok();
-MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseLO32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- // If the first token is not '$' we have an error.
- if (Parser.getTok().isNot(AsmToken::Dollar))
- return MatchOperand_NoMatch;
+ SMLoc S = Token.getLoc();
- SMLoc S = Parser.getTok().getLoc();
- Parser.Lex(); // Eat the '$'
-
- const AsmToken &Tok = Parser.getTok(); // Get next token.
-
- if (Tok.isNot(AsmToken::Identifier))
- return MatchOperand_NoMatch;
-
- if (!Tok.getIdentifier().startswith("ac"))
- return MatchOperand_NoMatch;
-
- StringRef NumString = Tok.getIdentifier().substr(2);
-
- unsigned IntVal;
- if (NumString.getAsInteger(10, IntVal))
- return MatchOperand_NoMatch;
-
- unsigned Reg = matchRegisterByNumber(IntVal, Mips::LO32DSPRegClassID);
-
- MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
- Op->setRegKind(MipsOperand::Kind_LO32DSP);
- Operands.push_back(Op);
-
- Parser.Lex(); // Eat the register number.
- return MatchOperand_Success;
-}
-
-MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseHI32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- // If the first token is not '$' we have an error.
- if (Parser.getTok().isNot(AsmToken::Dollar))
- return MatchOperand_NoMatch;
-
- SMLoc S = Parser.getTok().getLoc();
- Parser.Lex(); // Eat the '$'
-
- const AsmToken &Tok = Parser.getTok(); // Get next token.
-
- if (Tok.isNot(AsmToken::Identifier))
- return MatchOperand_NoMatch;
-
- if (!Tok.getIdentifier().startswith("ac"))
- return MatchOperand_NoMatch;
-
- StringRef NumString = Tok.getIdentifier().substr(2);
-
- unsigned IntVal;
- if (NumString.getAsInteger(10, IntVal))
+ if (Token.isNot(AsmToken::Dollar)) {
+ DEBUG(dbgs() << ".. !$ -> try sym aliasing\n");
+ if (Token.is(AsmToken::Identifier)) {
+ if (searchSymbolAlias(Operands))
+ return MatchOperand_Success;
+ }
+ DEBUG(dbgs() << ".. !symalias -> NoMatch\n");
return MatchOperand_NoMatch;
+ }
+ DEBUG(dbgs() << ".. $\n");
- unsigned Reg = matchRegisterByNumber(IntVal, Mips::HI32DSPRegClassID);
-
- MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
- Op->setRegKind(MipsOperand::Kind_HI32DSP);
- Operands.push_back(Op);
-
- Parser.Lex(); // Eat the register number.
- return MatchOperand_Success;
+ OperandMatchResultTy ResTy = MatchAnyRegisterWithoutDollar(Operands, S);
+ if (ResTy == MatchOperand_Success) {
+ Parser.Lex(); // $
+ Parser.Lex(); // identifier
+ }
+ return ResTy;
}
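
The new ParseAnyRegister path funnels every register operand through one name-matching cascade instead of the per-class parseRegs/parseMSARegs entry points it replaces. A minimal standalone sketch of that cascade follows; RegKind and the match callbacks are illustrative stand-ins for the parser's own matchers (same convention: non-negative index on success, -1 on failure), not types from the patch.

// Sketch only: the matcher cascade behind MatchAnyRegisterNameWithoutDollar,
// reduced to standalone functions with hypothetical matcher callbacks.
#include <functional>
#include <string>
#include <utility>
#include <vector>

enum class RegKind { GPR, FPU, FCC, ACC, MSA128, MSACtrl, None };

using Matcher = std::pair<RegKind, std::function<int(const std::string &)>>;

static std::pair<RegKind, int>
matchAnyRegisterName(const std::string &Name, const std::vector<Matcher> &Matchers) {
  for (const Matcher &M : Matchers) {
    int Index = M.second(Name);   // -1 means "not this register class"
    if (Index != -1)
      return {M.first, Index};    // first matcher to recognise the name wins
  }
  return {RegKind::None, -1};     // caller turns this into MatchOperand_NoMatch
}
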
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseCOP2(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- // If the first token is not '$' we have an error.
- if (Parser.getTok().isNot(AsmToken::Dollar))
+MipsAsmParser::ParseImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ switch (getLexer().getKind()) {
+ default:
return MatchOperand_NoMatch;
+ case AsmToken::LParen:
+ case AsmToken::Minus:
+ case AsmToken::Plus:
+ case AsmToken::Integer:
+ case AsmToken::String:
+ break;
+ }
+ const MCExpr *IdVal;
SMLoc S = Parser.getTok().getLoc();
- Parser.Lex(); // Eat the '$'
-
- const AsmToken &Tok = Parser.getTok(); // Get next token.
-
- if (Tok.isNot(AsmToken::Integer))
- return MatchOperand_NoMatch;
-
- unsigned IntVal = Tok.getIntVal();
-
- unsigned Reg = matchRegisterByNumber(IntVal, Mips::COP2RegClassID);
-
- MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
- Op->setRegKind(MipsOperand::Kind_COP2);
- Operands.push_back(Op);
+ if (getParser().parseExpression(IdVal))
+ return MatchOperand_ParseFail;
- Parser.Lex(); // Eat the register number.
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this));
return MatchOperand_Success;
}
-MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128BRegs(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128BRegs);
-}
-
-MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128HRegs(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128HRegs);
-}
-
-MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128WRegs(
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseJumpTarget(
SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128WRegs);
-}
+ DEBUG(dbgs() << "ParseJumpTarget\n");
-MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128DRegs(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128DRegs);
-}
+ SMLoc S = getLexer().getLoc();
-MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128CtrlRegs(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- return parseMSACtrlRegs(Operands, (int)MipsOperand::Kind_MSA128CtrlRegs);
-}
+ // Integers and expressions are acceptable
+ OperandMatchResultTy ResTy = ParseImm(Operands);
+ if (ResTy != MatchOperand_NoMatch)
+ return ResTy;
-bool MipsAsmParser::searchSymbolAlias(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands, unsigned RegKind) {
+ // Registers are a valid target and have priority over symbols.
+ ResTy = ParseAnyRegister(Operands);
+ if (ResTy != MatchOperand_NoMatch)
+ return ResTy;
- MCSymbol *Sym = getContext().LookupSymbol(Parser.getTok().getIdentifier());
- if (Sym) {
- SMLoc S = Parser.getTok().getLoc();
- const MCExpr *Expr;
- if (Sym->isVariable())
- Expr = Sym->getVariableValue();
- else
- return false;
- if (Expr->getKind() == MCExpr::SymbolRef) {
- MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
- const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
- const StringRef DefSymbol = Ref->getSymbol().getName();
- if (DefSymbol.startswith("$")) {
- int RegNum = -1;
- APInt IntVal(32, -1);
- if (!DefSymbol.substr(1).getAsInteger(10, IntVal))
- RegNum = matchRegisterByNumber(IntVal.getZExtValue(),
- isMips64() ? Mips::GPR64RegClassID
- : Mips::GPR32RegClassID);
- else {
- // Lookup for the register with the corresponding name.
- switch (Kind) {
- case MipsOperand::Kind_AFGR64Regs:
- case MipsOperand::Kind_FGR64Regs:
- RegNum = matchFPURegisterName(DefSymbol.substr(1));
- break;
- case MipsOperand::Kind_FGR32Regs:
- RegNum = matchFPURegisterName(DefSymbol.substr(1));
- break;
- case MipsOperand::Kind_GPR64:
- case MipsOperand::Kind_GPR32:
- default:
- RegNum = matchCPURegisterName(DefSymbol.substr(1));
- break;
- }
- if (RegNum > -1)
- RegNum = getReg(regKindToRegClass(Kind), RegNum);
- }
- if (RegNum > -1) {
- Parser.Lex();
- MipsOperand *op =
- MipsOperand::CreateReg(RegNum, S, Parser.getTok().getLoc());
- op->setRegKind(Kind);
- Operands.push_back(op);
- return true;
- }
- }
- } else if (Expr->getKind() == MCExpr::Constant) {
- Parser.Lex();
- const MCConstantExpr *Const = static_cast<const MCConstantExpr *>(Expr);
- MipsOperand *op =
- MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc());
- Operands.push_back(op);
- return true;
- }
+ const MCExpr *Expr = nullptr;
+ if (Parser.parseExpression(Expr)) {
+ // We have no way of knowing if a symbol was consumed so we must ParseFail
+ return MatchOperand_ParseFail;
}
- return false;
-}
-
-MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseHWRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- return parseRegs(Operands, (int)MipsOperand::Kind_HWRegs);
-}
-
-MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- return parseRegs(Operands, (int)MipsOperand::Kind_CCRRegs);
+ Operands.push_back(
+ MipsOperand::CreateImm(Expr, S, getLexer().getLoc(), *this));
+ return MatchOperand_Success;
}
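
The order of the three attempts above determines how a jump-target operand is claimed; a small illustration with assumed operand text (not taken from the patch):

// Illustrative only: which attempt claims each kind of jump-target text.
static const char *const JumpTargetExamples[] = {
    "1024",      // integer literal -> ParseImm
    "$25",       // register target -> ParseAnyRegister (wins over symbols)
    "my_label",  // plain symbol    -> Parser.parseExpression fallback
};
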
MipsAsmParser::OperandMatchResultTy
@@ -2067,12 +1899,12 @@ MipsAsmParser::parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
int64_t Val = MCE->getValue();
SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Operands.push_back(MipsOperand::CreateImm(
- MCConstantExpr::Create(0 - Val, getContext()), S, E));
+ MCConstantExpr::Create(0 - Val, getContext()), S, E, *this));
return MatchOperand_Success;
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+MipsAsmParser::ParseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
switch (getLexer().getKind()) {
default:
return MatchOperand_NoMatch;
@@ -2105,8 +1937,8 @@ MipsAsmParser::parseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
return MatchOperand_ParseFail;
}
- Operands.push_back(MipsOperand::CreateLSAImm(Expr, S,
- Parser.getTok().getLoc()));
+ Operands.push_back(
+ MipsOperand::CreateImm(Expr, S, Parser.getTok().getLoc(), *this));
return MatchOperand_Success;
}
@@ -2131,21 +1963,87 @@ MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
.Case("got_ofst", MCSymbolRefExpr::VK_Mips_GOT_OFST)
.Case("hi(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_HI)
.Case("lo(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_LO)
+ .Case("got_hi", MCSymbolRefExpr::VK_Mips_GOT_HI16)
+ .Case("got_lo", MCSymbolRefExpr::VK_Mips_GOT_LO16)
+ .Case("call_hi", MCSymbolRefExpr::VK_Mips_CALL_HI16)
+ .Case("call_lo", MCSymbolRefExpr::VK_Mips_CALL_LO16)
+ .Case("higher", MCSymbolRefExpr::VK_Mips_HIGHER)
+ .Case("highest", MCSymbolRefExpr::VK_Mips_HIGHEST)
.Default(MCSymbolRefExpr::VK_None);
+ assert(VK != MCSymbolRefExpr::VK_None);
+
return VK;
}
+/// Sometimes (i.e. load/stores) the operand may be followed immediately by
+/// this:
+/// ::= '(', register, ')'
+/// Handle it before we iterate so we don't get tripped up by the lack of
+/// a comma.
+bool MipsAsmParser::ParseParenSuffix(
+ StringRef Name, SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ if (getLexer().is(AsmToken::LParen)) {
+ Operands.push_back(
+ MipsOperand::CreateToken("(", getLexer().getLoc(), *this));
+ Parser.Lex();
+ if (ParseOperand(Operands, Name)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+ if (Parser.getTok().isNot(AsmToken::RParen)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "unexpected token, expected ')'");
+ }
+ Operands.push_back(
+ MipsOperand::CreateToken(")", getLexer().getLoc(), *this));
+ Parser.Lex();
+ }
+ return false;
+}
+
+/// Sometimes (i.e. in MSA) the operand may be followed immediately by
+/// one of these:
+/// ::= '[', register, ']'
+/// ::= '[', integer, ']'
+/// Handle it before we iterate so we don't get tripped up by the lack of
+/// a comma.
+bool MipsAsmParser::ParseBracketSuffix(
+ StringRef Name, SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ if (getLexer().is(AsmToken::LBrac)) {
+ Operands.push_back(
+ MipsOperand::CreateToken("[", getLexer().getLoc(), *this));
+ Parser.Lex();
+ if (ParseOperand(Operands, Name)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+ if (Parser.getTok().isNot(AsmToken::RBrac)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "unexpected token, expected ']'");
+ }
+ Operands.push_back(
+ MipsOperand::CreateToken("]", getLexer().getLoc(), *this));
+ Parser.Lex();
+ }
+ return false;
+}
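
Concretely, the two suffix parsers let memory and MSA element operands be written without a comma before the delimiter. An illustration of the operand lists they produce, with the instruction text assumed for the example; each string stands for one MCParsedAsmOperand pushed onto Operands, in order.

// Illustrative only: operand sequences produced by the suffix parsers above.
#include <string>
#include <vector>

// "lw $2, 16($sp)" -- ParseParenSuffix wraps the base register in '(' ')'
static const std::vector<std::string> LwOperands = {
    "lw", "$2", "16", "(", "$sp", ")"};

// "sld.b $w0, $w1[$1]" -- ParseBracketSuffix wraps the index in '[' ']'
static const std::vector<std::string> SldOperands = {
    "sld.b", "$w0", "$w1", "[", "$1", "]"};
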
+
bool MipsAsmParser::ParseInstruction(
ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ DEBUG(dbgs() << "ParseInstruction\n");
// Check if we have valid mnemonic
if (!mnemonicIsValid(Name, 0)) {
Parser.eatToEndOfStatement();
return Error(NameLoc, "Unknown instruction");
}
// First operand in MCInst is instruction mnemonic.
- Operands.push_back(MipsOperand::CreateToken(Name, NameLoc));
+ Operands.push_back(MipsOperand::CreateToken(Name, NameLoc, *this));
// Read the remaining operands.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
@@ -2155,6 +2053,9 @@ bool MipsAsmParser::ParseInstruction(
Parser.eatToEndOfStatement();
return Error(Loc, "unexpected token in argument list");
}
+ if (getLexer().is(AsmToken::LBrac) && ParseBracketSuffix(Name, Operands))
+ return true;
+ // AFAIK, parenthesis suffixes are never on the first operand
while (getLexer().is(AsmToken::Comma)) {
Parser.Lex(); // Eat the comma.
@@ -2164,6 +2065,13 @@ bool MipsAsmParser::ParseInstruction(
Parser.eatToEndOfStatement();
return Error(Loc, "unexpected token in argument list");
}
+ // Parse bracket and parenthesis suffixes before we iterate
+ if (getLexer().is(AsmToken::LBrac)) {
+ if (ParseBracketSuffix(Name, Operands))
+ return true;
+ } else if (getLexer().is(AsmToken::LParen) &&
+ ParseParenSuffix(Name, Operands))
+ return true;
}
}
if (getLexer().isNot(AsmToken::EndOfStatement)) {
@@ -2222,7 +2130,7 @@ bool MipsAsmParser::parseSetAtDirective() {
return false;
}
- if (AtRegNo < 1 || AtRegNo > 31) {
+ if (AtRegNo < 0 || AtRegNo > 31) {
reportParseError("unexpected token in statement");
return false;
}
@@ -2253,6 +2161,7 @@ bool MipsAsmParser::parseSetReorderDirective() {
return false;
}
Options.setReorder();
+ getTargetStreamer().emitDirectiveSetReorder();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
@@ -2265,6 +2174,7 @@ bool MipsAsmParser::parseSetNoReorderDirective() {
return false;
}
Options.setNoreorder();
+ getTargetStreamer().emitDirectiveSetNoReorder();
Parser.Lex(); // Consume the EndOfStatement.
return false;
}
@@ -2297,6 +2207,18 @@ bool MipsAsmParser::parseSetNoMacroDirective() {
return false;
}
+bool MipsAsmParser::parseSetNoMips16Directive() {
+ Parser.Lex();
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+ // For now do nothing.
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
bool MipsAsmParser::parseSetAssignment() {
StringRef Name;
const MCExpr *Value;
@@ -2308,22 +2230,7 @@ bool MipsAsmParser::parseSetAssignment() {
return reportParseError("unexpected token in .set directive");
Lex(); // Eat comma
- if (getLexer().is(AsmToken::Dollar)) {
- MCSymbol *Symbol;
- SMLoc DollarLoc = getLexer().getLoc();
- // Consume the dollar sign, and check for a following identifier.
- Parser.Lex();
- // We have a '$' followed by something, make sure they are adjacent.
- if (DollarLoc.getPointer() + 1 != getTok().getLoc().getPointer())
- return true;
- StringRef Res =
- StringRef(DollarLoc.getPointer(),
- getTok().getEndLoc().getPointer() - DollarLoc.getPointer());
- Symbol = getContext().GetOrCreateSymbol(Res);
- Parser.Lex();
- Value =
- MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, getContext());
- } else if (Parser.parseExpression(Value))
+ if (Parser.parseExpression(Value))
return reportParseError("expected valid expression after comma");
// Check if the Name already exists as a symbol.
@@ -2336,6 +2243,156 @@ bool MipsAsmParser::parseSetAssignment() {
return false;
}
+bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return reportParseError("unexpected token in .set directive");
+
+ switch (Feature) {
+ default: llvm_unreachable("Unimplemented feature");
+ case Mips::FeatureDSP:
+ setFeatureBits(Mips::FeatureDSP, "dsp");
+ getTargetStreamer().emitDirectiveSetDsp();
+ break;
+ case Mips::FeatureMicroMips:
+ getTargetStreamer().emitDirectiveSetMicroMips();
+ break;
+ case Mips::FeatureMips16:
+ getTargetStreamer().emitDirectiveSetMips16();
+ break;
+ case Mips::FeatureMips32r2:
+ setFeatureBits(Mips::FeatureMips32r2, "mips32r2");
+ getTargetStreamer().emitDirectiveSetMips32R2();
+ break;
+ case Mips::FeatureMips64:
+ setFeatureBits(Mips::FeatureMips64, "mips64");
+ getTargetStreamer().emitDirectiveSetMips64();
+ break;
+ case Mips::FeatureMips64r2:
+ setFeatureBits(Mips::FeatureMips64r2, "mips64r2");
+ getTargetStreamer().emitDirectiveSetMips64R2();
+ break;
+ }
+ return false;
+}
+
+bool MipsAsmParser::parseRegister(unsigned &RegNum) {
+ if (!getLexer().is(AsmToken::Dollar))
+ return false;
+
+ Parser.Lex();
+
+ const AsmToken &Reg = Parser.getTok();
+ if (Reg.is(AsmToken::Identifier)) {
+ RegNum = matchCPURegisterName(Reg.getIdentifier());
+ } else if (Reg.is(AsmToken::Integer)) {
+ RegNum = Reg.getIntVal();
+ } else {
+ return false;
+ }
+
+ Parser.Lex();
+ return true;
+}
+
+bool MipsAsmParser::eatComma(StringRef ErrorStr) {
+ if (getLexer().isNot(AsmToken::Comma)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, ErrorStr);
+ }
+
+ Parser.Lex(); // Eat the comma.
+ return true;
+}
+
+bool MipsAsmParser::parseDirectiveCPSetup() {
+ unsigned FuncReg;
+ unsigned Save;
+ bool SaveIsReg = true;
+
+ if (!parseRegister(FuncReg))
+ return reportParseError("expected register containing function address");
+ FuncReg = getGPR(FuncReg);
+
+ if (!eatComma("expected comma parsing directive"))
+ return true;
+
+ if (!parseRegister(Save)) {
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.is(AsmToken::Integer)) {
+ Save = Tok.getIntVal();
+ SaveIsReg = false;
+ Parser.Lex();
+ } else
+ return reportParseError("expected save register or stack offset");
+ } else
+ Save = getGPR(Save);
+
+ if (!eatComma("expected comma parsing directive"))
+ return true;
+
+ StringRef Name;
+ if (Parser.parseIdentifier(Name))
+ reportParseError("expected identifier");
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+ unsigned GPReg = getGPR(matchCPURegisterName("gp"));
+
+ // FIXME: The code below this point should be in the TargetStreamers.
+ // Only N32 and N64 emit anything for .cpsetup
+ // FIXME: We should only emit something for PIC mode too.
+ if (!isN32() && !isN64())
+ return false;
+
+ MCStreamer &TS = getStreamer();
+ MCInst Inst;
+ // Either store the old $gp in a register or on the stack
+ if (SaveIsReg) {
+ // move $save, $gpreg
+ Inst.setOpcode(Mips::DADDu);
+ Inst.addOperand(MCOperand::CreateReg(Save));
+ Inst.addOperand(MCOperand::CreateReg(GPReg));
+ Inst.addOperand(MCOperand::CreateReg(getGPR(0)));
+ } else {
+ // sd $gpreg, offset($sp)
+ Inst.setOpcode(Mips::SD);
+ Inst.addOperand(MCOperand::CreateReg(GPReg));
+ Inst.addOperand(MCOperand::CreateReg(getGPR(matchCPURegisterName("sp"))));
+ Inst.addOperand(MCOperand::CreateImm(Save));
+ }
+ TS.EmitInstruction(Inst, STI);
+ Inst.clear();
+
+ const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create(
+ Sym->getName(), MCSymbolRefExpr::VK_Mips_GPOFF_HI,
+ getContext());
+ const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::Create(
+ Sym->getName(), MCSymbolRefExpr::VK_Mips_GPOFF_LO,
+ getContext());
+ // lui $gp, %hi(%neg(%gp_rel(funcSym)))
+ Inst.setOpcode(Mips::LUi);
+ Inst.addOperand(MCOperand::CreateReg(GPReg));
+ Inst.addOperand(MCOperand::CreateExpr(HiExpr));
+ TS.EmitInstruction(Inst, STI);
+ Inst.clear();
+
+ // addiu $gp, $gp, %lo(%neg(%gp_rel(funcSym)))
+ Inst.setOpcode(Mips::ADDiu);
+ Inst.addOperand(MCOperand::CreateReg(GPReg));
+ Inst.addOperand(MCOperand::CreateReg(GPReg));
+ Inst.addOperand(MCOperand::CreateExpr(LoExpr));
+ TS.EmitInstruction(Inst, STI);
+ Inst.clear();
+
+ // daddu $gp, $gp, $funcreg
+ Inst.setOpcode(Mips::DADDu);
+ Inst.addOperand(MCOperand::CreateReg(GPReg));
+ Inst.addOperand(MCOperand::CreateReg(GPReg));
+ Inst.addOperand(MCOperand::CreateReg(FuncReg));
+ TS.EmitInstruction(Inst, STI);
+ return false;
+}
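
For reference, with assumed operands ".cpsetup $25, $2, my_func" on an N64 target, the register-save branch of the code above emits roughly this sequence (a sketch derived from the MCInsts built above, not verbatim streamer output):

// Illustrative only: the expansion of ".cpsetup $25, $2, my_func" (N64, register save).
static const char *const CpSetupExpansion[] = {
    "daddu $2, $gp, $zero",                        // save the old $gp in $2
    "lui   $gp, %hi(%neg(%gp_rel(my_func)))",      // high part of the GP offset
    "addiu $gp, $gp, %lo(%neg(%gp_rel(my_func)))", // low part of the GP offset
    "daddu $gp, $gp, $25",                         // add the function-address register
};
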
+
bool MipsAsmParser::parseDirectiveSet() {
// Get the next token.
@@ -2353,14 +2410,24 @@ bool MipsAsmParser::parseDirectiveSet() {
return parseSetMacroDirective();
} else if (Tok.getString() == "nomacro") {
return parseSetNoMacroDirective();
+ } else if (Tok.getString() == "mips16") {
+ return parseSetFeature(Mips::FeatureMips16);
} else if (Tok.getString() == "nomips16") {
- // Ignore this directive for now.
- Parser.eatToEndOfStatement();
- return false;
+ return parseSetNoMips16Directive();
} else if (Tok.getString() == "nomicromips") {
- // Ignore this directive for now.
+ getTargetStreamer().emitDirectiveSetNoMicroMips();
Parser.eatToEndOfStatement();
return false;
+ } else if (Tok.getString() == "micromips") {
+ return parseSetFeature(Mips::FeatureMicroMips);
+ } else if (Tok.getString() == "mips32r2") {
+ return parseSetFeature(Mips::FeatureMips32r2);
+ } else if (Tok.getString() == "mips64") {
+ return parseSetFeature(Mips::FeatureMips64);
+ } else if (Tok.getString() == "mips64r2") {
+ return parseSetFeature(Mips::FeatureMips64r2);
+ } else if (Tok.getString() == "dsp") {
+ return parseSetFeature(Mips::FeatureDSP);
} else {
// It is just an identifier, look for an assignment.
parseSetAssignment();
@@ -2370,37 +2437,9 @@ bool MipsAsmParser::parseDirectiveSet() {
return true;
}
-bool MipsAsmParser::parseDirectiveMipsHackStocg() {
- MCAsmParser &Parser = getParser();
- StringRef Name;
- if (Parser.parseIdentifier(Name))
- reportParseError("expected identifier");
-
- MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token");
- Lex();
-
- int64_t Flags = 0;
- if (Parser.parseAbsoluteExpression(Flags))
- return TokError("unexpected token");
-
- getTargetStreamer().emitMipsHackSTOCG(Sym, Flags);
- return false;
-}
-
-bool MipsAsmParser::parseDirectiveMipsHackELFFlags() {
- int64_t Flags = 0;
- if (Parser.parseAbsoluteExpression(Flags))
- return TokError("unexpected token");
-
- getTargetStreamer().emitMipsHackELFFlags(Flags);
- return false;
-}
-
-/// parseDirectiveWord
+/// parseDataDirective
/// ::= .word [ expression (, expression)* ]
-bool MipsAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
+bool MipsAsmParser::parseDataDirective(unsigned Size, SMLoc L) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
const MCExpr *Value;
@@ -2439,10 +2478,70 @@ bool MipsAsmParser::parseDirectiveGpWord() {
return false;
}
-bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
+/// parseDirectiveGpDWord
+/// ::= .gpdword local_sym
+bool MipsAsmParser::parseDirectiveGpDWord() {
+ const MCExpr *Value;
+ // EmitGPRel64Value requires an expression, so we use the base class
+ // method to evaluate it.
+ if (getParser().parseExpression(Value))
+ return true;
+ getParser().getStreamer().EmitGPRel64Value(Value);
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return Error(getLexer().getLoc(), "unexpected token in directive");
+ Parser.Lex(); // Eat EndOfStatement token.
+ return false;
+}
+
+bool MipsAsmParser::parseDirectiveOption() {
+ // Get the option token.
+ AsmToken Tok = Parser.getTok();
+ // At the moment only identifiers are supported.
+ if (Tok.isNot(AsmToken::Identifier)) {
+ Error(Parser.getTok().getLoc(), "unexpected token in .option directive");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ StringRef Option = Tok.getIdentifier();
+ if (Option == "pic0") {
+ getTargetStreamer().emitDirectiveOptionPic0();
+ Parser.Lex();
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+ Error(Parser.getTok().getLoc(),
+ "unexpected token in .option pic0 directive");
+ Parser.eatToEndOfStatement();
+ }
+ return false;
+ }
+
+ if (Option == "pic2") {
+ getTargetStreamer().emitDirectiveOptionPic2();
+ Parser.Lex();
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+ Error(Parser.getTok().getLoc(),
+ "unexpected token in .option pic2 directive");
+ Parser.eatToEndOfStatement();
+ }
+ return false;
+ }
+
+ // Unknown option.
+ Warning(Parser.getTok().getLoc(), "unknown option in .option directive");
+ Parser.eatToEndOfStatement();
+ return false;
+}
+
+bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
+ if (IDVal == ".dword") {
+ parseDataDirective(8, DirectiveID.getLoc());
+ return false;
+ }
+
if (IDVal == ".ent") {
// Ignore this directive for now.
Parser.Lex();
@@ -2478,21 +2577,35 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
}
if (IDVal == ".gpword") {
- // Ignore this directive for now.
parseDirectiveGpWord();
return false;
}
+ if (IDVal == ".gpdword") {
+ parseDirectiveGpDWord();
+ return false;
+ }
+
if (IDVal == ".word") {
- parseDirectiveWord(4, DirectiveID.getLoc());
+ parseDataDirective(4, DirectiveID.getLoc());
return false;
}
- if (IDVal == ".mips_hack_stocg")
- return parseDirectiveMipsHackStocg();
+ if (IDVal == ".option")
+ return parseDirectiveOption();
+
+ if (IDVal == ".abicalls") {
+ getTargetStreamer().emitDirectiveAbiCalls();
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+ Error(Parser.getTok().getLoc(), "unexpected token in directive");
+ // Clear line
+ Parser.eatToEndOfStatement();
+ }
+ return false;
+ }
- if (IDVal == ".mips_hack_elf_flags")
- return parseDirectiveMipsHackELFFlags();
+ if (IDVal == ".cpsetup")
+ return parseDirectiveCPSetup();
return true;
}
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index 6acc9a8..c304ee3 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -16,6 +16,7 @@ add_public_tablegen_target(MipsCommonTableGen)
add_llvm_target(MipsCodeGen
Mips16FrameLowering.cpp
Mips16HardFloat.cpp
+ Mips16HardFloatInfo.cpp
Mips16InstrInfo.cpp
Mips16ISelDAGToDAG.cpp
Mips16ISelLowering.cpp
@@ -34,6 +35,7 @@ add_llvm_target(MipsCodeGen
MipsMCInstLower.cpp
MipsMachineFunction.cpp
MipsModuleISelDAGToDAG.cpp
+ MipsOptimizePICCall.cpp
MipsOs16.cpp
MipsRegisterInfo.cpp
MipsSEFrameLowering.cpp
@@ -47,8 +49,6 @@ add_llvm_target(MipsCodeGen
MipsSelectionDAGInfo.cpp
)
-add_dependencies(LLVMMipsCodeGen MipsCommonTableGen intrinsics_gen)
-
add_subdirectory(InstPrinter)
add_subdirectory(Disassembler)
add_subdirectory(TargetInfo)
diff --git a/lib/Target/Mips/Disassembler/Android.mk b/lib/Target/Mips/Disassembler/Android.mk
index 868b43a..20fd87a 100644
--- a/lib/Target/Mips/Disassembler/Android.mk
+++ b/lib/Target/Mips/Disassembler/Android.mk
@@ -11,6 +11,7 @@ mips_disassembler_SRC_FILES := \
# For the device
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -26,6 +27,7 @@ TBLGEN_TD_DIR := $(LOCAL_PATH)/..
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
# For the host
# =====================================================
diff --git a/lib/Target/Mips/Disassembler/CMakeLists.txt b/lib/Target/Mips/Disassembler/CMakeLists.txt
index fe1dc75..a64d02c 100644
--- a/lib/Target/Mips/Disassembler/CMakeLists.txt
+++ b/lib/Target/Mips/Disassembler/CMakeLists.txt
@@ -1,15 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMMipsDisassembler
MipsDisassembler.cpp
)
-
-# workaround for hanging compilation on MSVC9 and 10
-if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
-set_property(
- SOURCE MipsDisassembler.cpp
- PROPERTY COMPILE_FLAGS "/Od"
- )
-endif()
-
-add_dependencies(LLVMMipsDisassembler MipsCommonTableGen)
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 60508a8..fc3b922 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -263,6 +263,11 @@ static DecodeStatus DecodeExtSize(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+/// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't
+/// handle.
+template <typename InsnType>
+static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
namespace llvm {
extern Target TheMipselTarget, TheMipsTarget, TheMips64Target,
TheMips64elTarget;
@@ -304,9 +309,54 @@ extern "C" void LLVMInitializeMipsDisassembler() {
createMips64elDisassembler);
}
-
#include "MipsGenDisassemblerTables.inc"
+template <typename InsnType>
+static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder) {
+ typedef DecodeStatus (*DecodeFN)(MCInst &, unsigned, uint64_t, const void *);
+ // The size of the n field depends on the element size
+ // The register class also depends on this.
+ InsnType tmp = fieldFromInstruction(insn, 17, 5);
+ unsigned NSize = 0;
+ DecodeFN RegDecoder = nullptr;
+ if ((tmp & 0x18) == 0x00) { // INSVE_B
+ NSize = 4;
+ RegDecoder = DecodeMSA128BRegisterClass;
+ } else if ((tmp & 0x1c) == 0x10) { // INSVE_H
+ NSize = 3;
+ RegDecoder = DecodeMSA128HRegisterClass;
+ } else if ((tmp & 0x1e) == 0x18) { // INSVE_W
+ NSize = 2;
+ RegDecoder = DecodeMSA128WRegisterClass;
+ } else if ((tmp & 0x1f) == 0x1c) { // INSVE_D
+ NSize = 1;
+ RegDecoder = DecodeMSA128DRegisterClass;
+ } else
+ llvm_unreachable("Invalid encoding");
+
+ assert(NSize != 0 && RegDecoder != nullptr);
+
+ // $wd
+ tmp = fieldFromInstruction(insn, 6, 5);
+ if (RegDecoder(MI, tmp, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ // $wd_in
+ if (RegDecoder(MI, tmp, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ // $n
+ tmp = fieldFromInstruction(insn, 16, NSize);
+ MI.addOperand(MCOperand::CreateImm(tmp));
+ // $ws
+ tmp = fieldFromInstruction(insn, 11, 5);
+ if (RegDecoder(MI, tmp, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ // $n2
+ MI.addOperand(MCOperand::CreateImm(0));
+
+ return MCDisassembler::Success;
+}
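
The masks above encode a prefix scheme: the leading bits of the df/n field at insn[21:16] select the element size (00 for byte, 100 for halfword, 1100 for word, 11100 for doubleword) and the remaining low bits, starting at bit 16, hold the element index n. A small self-contained sketch of that split, assuming the same bit positions as DecodeINSVE_DF:

// Sketch only: the df/n split used by DecodeINSVE_DF above.
#include <cassert>
#include <cstdint>
#include <utility>

enum class MsaDf { B, H, W, D };

static std::pair<MsaDf, unsigned> decodeInsveDfN(uint32_t Insn) {
  uint32_t Sel = (Insn >> 17) & 0x1f;            // same field as 'tmp' above
  if ((Sel & 0x18) == 0x00)                      // insn[21:20] == 0b00 -> byte
    return {MsaDf::B, (Insn >> 16) & 0xf};       // n = insn[19:16]
  if ((Sel & 0x1c) == 0x10)                      // insn[21:19] == 0b100 -> halfword
    return {MsaDf::H, (Insn >> 16) & 0x7};       // n = insn[18:16]
  if ((Sel & 0x1e) == 0x18)                      // insn[21:18] == 0b1100 -> word
    return {MsaDf::W, (Insn >> 16) & 0x3};       // n = insn[17:16]
  assert((Sel & 0x1f) == 0x1c && "invalid INSVE encoding"); // 0b11100 -> doubleword
  return {MsaDf::D, (Insn >> 16) & 0x1};         // n = insn[16]
}
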
+
/// readInstruction - read four bytes from the MemoryObject
/// and return a 32-bit word ordered according to the given endianness
static DecodeStatus readInstruction32(const MemoryObject &region,
@@ -565,7 +615,37 @@ static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
Inst.addOperand(MCOperand::CreateReg(Reg));
Inst.addOperand(MCOperand::CreateReg(Base));
- Inst.addOperand(MCOperand::CreateImm(Offset));
+
+ // The immediate field of an LD/ST instruction is scaled, which means it must
+ // be multiplied (when decoding) by the size (in bytes) of the instruction's
+ // data format.
+ // .b - 1 byte
+ // .h - 2 bytes
+ // .w - 4 bytes
+ // .d - 8 bytes
+ switch (Inst.getOpcode()) {
+ default:
+ assert(0 && "Unexpected instruction");
+ return MCDisassembler::Fail;
+ break;
+ case Mips::LD_B:
+ case Mips::ST_B:
+ Inst.addOperand(MCOperand::CreateImm(Offset));
+ break;
+ case Mips::LD_H:
+ case Mips::ST_H:
+ Inst.addOperand(MCOperand::CreateImm(Offset << 1));
+ break;
+ case Mips::LD_W:
+ case Mips::ST_W:
+ Inst.addOperand(MCOperand::CreateImm(Offset << 2));
+ break;
+ case Mips::LD_D:
+ case Mips::ST_D:
+ Inst.addOperand(MCOperand::CreateImm(Offset << 3));
+ break;
+ }
return MCDisassembler::Success;
}
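
A short worked example of the scaling, with illustrative values rather than real instruction encodings:

// Illustrative only: undoing the element-size scaling of the MSA s10 offset field.
#include <cassert>
#include <cstdint>

static int64_t scaleMsaMemOffset(int64_t EncodedOffset, unsigned EltBytes) {
  // EltBytes is 1, 2, 4 or 8 for .b, .h, .w and .d respectively.
  switch (EltBytes) {
  case 1: return EncodedOffset;        // LD_B / ST_B
  case 2: return EncodedOffset << 1;   // LD_H / ST_H
  case 4: return EncodedOffset << 2;   // LD_W / ST_W
  case 8: return EncodedOffset << 3;   // LD_D / ST_D
  }
  assert(false && "unexpected element size");
  return 0;
}

// e.g. a .w access at byte offset 48 encodes the field as 12; decoding gives 12 << 2 == 48.
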
@@ -581,6 +661,9 @@ static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+ if (Inst.getOpcode() == Mips::SC_MM)
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+
Inst.addOperand(MCOperand::CreateReg(Reg));
Inst.addOperand(MCOperand::CreateReg(Base));
Inst.addOperand(MCOperand::CreateImm(Offset));
diff --git a/lib/Target/Mips/InstPrinter/Android.mk b/lib/Target/Mips/InstPrinter/Android.mk
index fc256cd..f4f3a4f 100644
--- a/lib/Target/Mips/InstPrinter/Android.mk
+++ b/lib/Target/Mips/InstPrinter/Android.mk
@@ -29,6 +29,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device only
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -44,3 +45,4 @@ LOCAL_C_INCLUDES += $(LOCAL_PATH)/..
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/Mips/InstPrinter/CMakeLists.txt b/lib/Target/Mips/InstPrinter/CMakeLists.txt
index 3e9fbf1..2a67fba 100644
--- a/lib/Target/Mips/InstPrinter/CMakeLists.txt
+++ b/lib/Target/Mips/InstPrinter/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMMipsAsmPrinter
MipsInstPrinter.cpp
)
-
-add_dependencies(LLVMMipsAsmPrinter MipsCommonTableGen)
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index 7884589..c8f08f1 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -13,6 +13,7 @@
#define DEBUG_TYPE "asm-printer"
#include "MipsInstPrinter.h"
+#include "MCTargetDesc/MipsMCExpr.h"
#include "MipsInstrInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCExpr.h"
@@ -83,6 +84,27 @@ void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
case Mips::RDHWR64:
O << "\t.set\tpush\n";
O << "\t.set\tmips32r2\n";
+ break;
+ case Mips::Save16:
+ O << "\tsave\t";
+ printSaveRestore(MI, O);
+ O << " # 16 bit inst\n";
+ return;
+ case Mips::SaveX16:
+ O << "\tsave\t";
+ printSaveRestore(MI, O);
+ O << "\n";
+ return;
+ case Mips::Restore16:
+ O << "\trestore\t";
+ printSaveRestore(MI, O);
+ O << " # 16 bit inst\n";
+ return;
+ case Mips::RestoreX16:
+ O << "\trestore\t";
+ printSaveRestore(MI, O);
+ O << "\n";
+ return;
}
// Try to print any aliases first.
@@ -108,8 +130,10 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS());
assert(SRE && CE && "Binary expression must be sym+const.");
Offset = CE->getValue();
- }
- else if (!(SRE = dyn_cast<MCSymbolRefExpr>(Expr)))
+ } else if (const MipsMCExpr *ME = dyn_cast<MipsMCExpr>(Expr)) {
+ ME->print(OS);
+ return;
+ } else if (!(SRE = dyn_cast<MCSymbolRefExpr>(Expr)))
assert(false && "Unexpected MCExpr type.");
MCSymbolRefExpr::VariantKind Kind = SRE->getKind();
@@ -286,3 +310,14 @@ bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) {
default: return false;
}
}
+
+void MipsInstPrinter::printSaveRestore(const MCInst *MI, raw_ostream &O) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ if (i != 0) O << ", ";
+ if (MI->getOperand(i).isReg())
+ printRegName(O, MI->getOperand(i).getReg());
+ else
+ printUnsignedImm(MI, i, O);
+ }
+}
+
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index f75ae24..2b745f0 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -104,6 +104,7 @@ private:
bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo0,
unsigned OpNo1, raw_ostream &OS);
bool printAlias(const MCInst &MI, raw_ostream &OS);
+ void printSaveRestore(const MCInst *MI, raw_ostream &O);
};
} // end namespace llvm
diff --git a/lib/Target/Mips/LLVMBuild.txt b/lib/Target/Mips/LLVMBuild.txt
index a95d6bc..e6d3a42 100644
--- a/lib/Target/Mips/LLVMBuild.txt
+++ b/lib/Target/Mips/LLVMBuild.txt
@@ -31,5 +31,5 @@ has_jit = 1
type = Library
name = MipsCodeGen
parent = Mips
-required_libraries = AsmPrinter CodeGen Core MC MipsAsmPrinter MipsDesc MipsInfo SelectionDAG Support Target
+required_libraries = Analysis AsmPrinter CodeGen Core MC MipsAsmPrinter MipsDesc MipsInfo Scalar SelectionDAG Support Target
add_to_library_groups = Mips
diff --git a/lib/Target/Mips/MCTargetDesc/Android.mk b/lib/Target/Mips/MCTargetDesc/Android.mk
index 3da6944..7ee11a1 100644
--- a/lib/Target/Mips/MCTargetDesc/Android.mk
+++ b/lib/Target/Mips/MCTargetDesc/Android.mk
@@ -9,10 +9,12 @@ mips_mc_desc_TBLGEN_TABLES := \
mips_mc_desc_SRC_FILES := \
MipsAsmBackend.cpp \
MipsELFObjectWriter.cpp \
+ MipsELFStreamer.cpp \
MipsMCAsmInfo.cpp \
MipsMCCodeEmitter.cpp \
+ MipsMCExpr.cpp \
MipsMCTargetDesc.cpp \
- MipsReginfo.cpp \
+ MipsNaClELFStreamer.cpp \
MipsTargetStreamer.cpp
# For the host
@@ -36,6 +38,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device only
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -52,3 +55,4 @@ include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(LLVM_GEN_INTRINSICS_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index 9116748..d3e2fd7 100644
--- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -1,11 +1,11 @@
add_llvm_library(LLVMMipsDesc
MipsAsmBackend.cpp
+ MipsELFObjectWriter.cpp
+ MipsELFStreamer.cpp
MipsMCAsmInfo.cpp
MipsMCCodeEmitter.cpp
+ MipsMCExpr.cpp
MipsMCTargetDesc.cpp
- MipsELFObjectWriter.cpp
- MipsReginfo.cpp
+ MipsNaClELFStreamer.cpp
MipsTargetStreamer.cpp
)
-
-add_dependencies(LLVMMipsDesc MipsCommonTableGen)
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 3e70b23..0f99ecc 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -1,4 +1,4 @@
-//===-- MipsASMBackend.cpp - Mips Asm Backend ----------------------------===//
+//===-- MipsAsmBackend.cpp - Mips Asm Backend ----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,32 +7,39 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the MipsAsmBackend and MipsELFObjectWriter classes.
+// This file implements the MipsAsmBackend class.
//
//===----------------------------------------------------------------------===//
//
-#include "MipsFixupKinds.h"
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsAsmBackend.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
// Prepare the value for the target space.
-static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+ MCContext *Ctx = NULL) {
+
+ unsigned Kind = Fixup.getKind();
// Add/subtract and shift
switch (Kind) {
default:
return 0;
+ case FK_Data_2:
case FK_GPRel_4:
case FK_Data_4:
case FK_Data_8:
@@ -56,8 +63,11 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
// so the displacement will be one instruction size less.
Value -= 4;
// The displacement is then divided by 4 to give us an 18 bit
- // address range.
- Value >>= 2;
+ // address range. Forcing a signed division because Value can be negative.
+ Value = (int64_t)Value / 4;
+ // We now check if Value can be encoded as a 16-bit signed immediate.
+ if (!isIntN(16, Value) && Ctx)
+ Ctx->FatalError(Fixup.getLoc(), "out of range PC16 fixup");
break;
case Mips::fixup_Mips_26:
// So far we are only using this type for jumps.
@@ -86,196 +96,196 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
break;
case Mips::fixup_MICROMIPS_PC16_S1:
Value -= 4;
- Value >>= 1;
+ // Forcing a signed division because Value can be negative.
+ Value = (int64_t)Value / 2;
+ // We now check if Value can be encoded as a 16-bit signed immediate.
+ if (!isIntN(16, Value) && Ctx)
+ Ctx->FatalError(Fixup.getLoc(), "out of range PC16 fixup");
break;
}
return Value;
}
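
A worked example of the PC16 arithmetic above, with assumed displacement values; isIntN(16, Value) is the signed 16-bit range check the code performs:

// Illustrative only: the PC16 fixup arithmetic with sample byte displacements.
#include <cstdint>

static int64_t encodePc16(int64_t ByteDisplacement, bool &InRange) {
  int64_t Value = ByteDisplacement - 4;           // relative to the following instruction
  Value = Value / 4;                              // word-sized units, signed division
  InRange = Value >= -32768 && Value <= 32767;    // what isIntN(16, Value) checks
  return Value;
}

// encodePc16(288, ok)    -> 71,    ok == true   (short forward branch)
// encodePc16(-64, ok)    -> -17,   ok == true   (backward branch)
// encodePc16(262144, ok) -> 65535, ok == false  -> "out of range PC16 fixup"
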
-namespace {
-class MipsAsmBackend : public MCAsmBackend {
- Triple::OSType OSType;
- bool IsLittle; // Big or little endian
- bool Is64Bit; // 32 or 64 bit words
-
-public:
- MipsAsmBackend(const Target &T, Triple::OSType _OSType,
- bool _isLittle, bool _is64Bit)
- :MCAsmBackend(), OSType(_OSType), IsLittle(_isLittle), Is64Bit(_is64Bit) {}
-
- MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
- return createMipsELFObjectWriter(OS,
- MCELFObjectTargetWriter::getOSABI(OSType), IsLittle, Is64Bit);
- }
-
- /// ApplyFixup - Apply the \p Value for given \p Fixup into the provided
- /// data fragment, at the offset specified by the fixup and following the
- /// fixup kind as appropriate.
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value) const {
- MCFixupKind Kind = Fixup.getKind();
- Value = adjustFixupValue((unsigned)Kind, Value);
-
- if (!Value)
- return; // Doesn't change encoding.
-
- // Where do we start in the object
- unsigned Offset = Fixup.getOffset();
- // Number of bytes we need to fixup
- unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
- // Used to point to big endian bytes
- unsigned FullSize;
+MCObjectWriter *MipsAsmBackend::createObjectWriter(raw_ostream &OS) const {
+ return createMipsELFObjectWriter(OS,
+ MCELFObjectTargetWriter::getOSABI(OSType), IsLittle, Is64Bit);
+}
- switch ((unsigned)Kind) {
- case Mips::fixup_Mips_16:
- FullSize = 2;
- break;
- case Mips::fixup_Mips_64:
- FullSize = 8;
- break;
- default:
- FullSize = 4;
- break;
- }
+// Little-endian fixup data byte ordering:
+// mips32r2: a | b | x | x
+// microMIPS: x | x | a | b
- // Grab current value, if any, from bits.
- uint64_t CurVal = 0;
+static bool needsMMLEByteOrder(unsigned Kind) {
+ return Kind >= Mips::fixup_MICROMIPS_26_S1 &&
+ Kind < Mips::LastTargetFixupKind;
+}
- for (unsigned i = 0; i != NumBytes; ++i) {
- unsigned Idx = IsLittle ? i : (FullSize - 1 - i);
- CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8);
- }
+// Calculate index for microMIPS specific little endian byte order
+static unsigned calculateMMLEIndex(unsigned i) {
+ assert(i <= 3 && "Index out of range!");
- uint64_t Mask = ((uint64_t)(-1) >>
- (64 - getFixupKindInfo(Kind).TargetSize));
- CurVal |= Value & Mask;
+ return (1 - i / 2) * 2 + i % 2;
+}
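
The formula can be sanity-checked against the byte-order comment above ("a | b | x | x" for mips32r2 versus "x | x | a | b" for microMIPS); a tiny self-contained check:

// Illustrative only: the byte-index mapping produced by calculateMMLEIndex.
#include <cassert>

static unsigned mmleIndex(unsigned i) { return (1 - i / 2) * 2 + i % 2; }

int main() {
  assert(mmleIndex(0) == 2);  // value byte 0 -> instruction byte 2
  assert(mmleIndex(1) == 3);  // value byte 1 -> instruction byte 3
  assert(mmleIndex(2) == 0);  // value byte 2 -> instruction byte 0
  assert(mmleIndex(3) == 1);  // value byte 3 -> instruction byte 1
  return 0;
}
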
- // Write out the fixed up bytes back to the code/data bits.
- for (unsigned i = 0; i != NumBytes; ++i) {
- unsigned Idx = IsLittle ? i : (FullSize - 1 - i);
- Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff);
- }
+/// ApplyFixup - Apply the \p Value for given \p Fixup into the provided
+/// data fragment, at the offset specified by the fixup and following the
+/// fixup kind as appropriate.
+void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+ unsigned DataSize, uint64_t Value,
+ bool IsPCRel) const {
+ MCFixupKind Kind = Fixup.getKind();
+ Value = adjustFixupValue(Fixup, Value);
+
+ if (!Value)
+ return; // Doesn't change encoding.
+
+ // Where do we start in the object
+ unsigned Offset = Fixup.getOffset();
+ // Number of bytes we need to fixup
+ unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
+ // Used to point to big endian bytes
+ unsigned FullSize;
+
+ switch ((unsigned)Kind) {
+ case FK_Data_2:
+ case Mips::fixup_Mips_16:
+ FullSize = 2;
+ break;
+ case FK_Data_8:
+ case Mips::fixup_Mips_64:
+ FullSize = 8;
+ break;
+ case FK_Data_4:
+ default:
+ FullSize = 4;
+ break;
}
- unsigned getNumFixupKinds() const { return Mips::NumTargetFixupKinds; }
-
- const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
- const static MCFixupKindInfo Infos[Mips::NumTargetFixupKinds] = {
- // This table *must* be in same the order of fixup_* kinds in
- // MipsFixupKinds.h.
- //
- // name offset bits flags
- { "fixup_Mips_16", 0, 16, 0 },
- { "fixup_Mips_32", 0, 32, 0 },
- { "fixup_Mips_REL32", 0, 32, 0 },
- { "fixup_Mips_26", 0, 26, 0 },
- { "fixup_Mips_HI16", 0, 16, 0 },
- { "fixup_Mips_LO16", 0, 16, 0 },
- { "fixup_Mips_GPREL16", 0, 16, 0 },
- { "fixup_Mips_LITERAL", 0, 16, 0 },
- { "fixup_Mips_GOT_Global", 0, 16, 0 },
- { "fixup_Mips_GOT_Local", 0, 16, 0 },
- { "fixup_Mips_PC16", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Mips_CALL16", 0, 16, 0 },
- { "fixup_Mips_GPREL32", 0, 32, 0 },
- { "fixup_Mips_SHIFT5", 6, 5, 0 },
- { "fixup_Mips_SHIFT6", 6, 5, 0 },
- { "fixup_Mips_64", 0, 64, 0 },
- { "fixup_Mips_TLSGD", 0, 16, 0 },
- { "fixup_Mips_GOTTPREL", 0, 16, 0 },
- { "fixup_Mips_TPREL_HI", 0, 16, 0 },
- { "fixup_Mips_TPREL_LO", 0, 16, 0 },
- { "fixup_Mips_TLSLDM", 0, 16, 0 },
- { "fixup_Mips_DTPREL_HI", 0, 16, 0 },
- { "fixup_Mips_DTPREL_LO", 0, 16, 0 },
- { "fixup_Mips_Branch_PCRel", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Mips_GPOFF_HI", 0, 16, 0 },
- { "fixup_Mips_GPOFF_LO", 0, 16, 0 },
- { "fixup_Mips_GOT_PAGE", 0, 16, 0 },
- { "fixup_Mips_GOT_OFST", 0, 16, 0 },
- { "fixup_Mips_GOT_DISP", 0, 16, 0 },
- { "fixup_Mips_HIGHER", 0, 16, 0 },
- { "fixup_Mips_HIGHEST", 0, 16, 0 },
- { "fixup_Mips_GOT_HI16", 0, 16, 0 },
- { "fixup_Mips_GOT_LO16", 0, 16, 0 },
- { "fixup_Mips_CALL_HI16", 0, 16, 0 },
- { "fixup_Mips_CALL_LO16", 0, 16, 0 },
- { "fixup_MICROMIPS_26_S1", 0, 26, 0 },
- { "fixup_MICROMIPS_HI16", 0, 16, 0 },
- { "fixup_MICROMIPS_LO16", 0, 16, 0 },
- { "fixup_MICROMIPS_GOT16", 0, 16, 0 },
- { "fixup_MICROMIPS_PC16_S1", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_MICROMIPS_CALL16", 0, 16, 0 },
- { "fixup_MICROMIPS_GOT_DISP", 0, 16, 0 },
- { "fixup_MICROMIPS_GOT_PAGE", 0, 16, 0 },
- { "fixup_MICROMIPS_GOT_OFST", 0, 16, 0 },
- { "fixup_MICROMIPS_TLS_DTPREL_HI16", 0, 16, 0 },
- { "fixup_MICROMIPS_TLS_DTPREL_LO16", 0, 16, 0 },
- { "fixup_MICROMIPS_TLS_TPREL_HI16", 0, 16, 0 },
- { "fixup_MICROMIPS_TLS_TPREL_LO16", 0, 16, 0 }
- };
+ // Grab current value, if any, from bits.
+ uint64_t CurVal = 0;
- if (Kind < FirstTargetFixupKind)
- return MCAsmBackend::getFixupKindInfo(Kind);
+ bool microMipsLEByteOrder = needsMMLEByteOrder((unsigned) Kind);
- assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
- "Invalid kind!");
- return Infos[Kind - FirstTargetFixupKind];
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ unsigned Idx = IsLittle ? (microMipsLEByteOrder ? calculateMMLEIndex(i)
+ : i)
+ : (FullSize - 1 - i);
+ CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8);
}
- /// @name Target Relaxation Interfaces
- /// @{
-
- /// MayNeedRelaxation - Check whether the given instruction may need
- /// relaxation.
- ///
- /// \param Inst - The instruction to test.
- bool mayNeedRelaxation(const MCInst &Inst) const {
- return false;
- }
-
- /// fixupNeedsRelaxation - Target specific predicate for whether a given
- /// fixup requires the associated instruction to be relaxed.
- bool fixupNeedsRelaxation(const MCFixup &Fixup,
- uint64_t Value,
- const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const {
- // FIXME.
- assert(0 && "RelaxInstruction() unimplemented");
- return false;
- }
+ uint64_t Mask = ((uint64_t)(-1) >>
+ (64 - getFixupKindInfo(Kind).TargetSize));
+ CurVal |= Value & Mask;
- /// RelaxInstruction - Relax the instruction in the given fragment
- /// to the next wider instruction.
- ///
- /// \param Inst - The instruction to relax, which may be the same
- /// as the output.
- /// \param [out] Res On return, the relaxed instruction.
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+ // Write out the fixed up bytes back to the code/data bits.
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ unsigned Idx = IsLittle ? (microMipsLEByteOrder ? calculateMMLEIndex(i)
+ : i)
+ : (FullSize - 1 - i);
+ Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff);
}
+}
- /// @}
-
- /// WriteNopData - Write an (optimal) nop sequence of Count bytes
- /// to the given output. If the target cannot generate such a sequence,
- /// it should return an error.
- ///
- /// \return - True on success.
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
- // Check for a less than instruction size number of bytes
- // FIXME: 16 bit instructions are not handled yet here.
- // We shouldn't be using a hard coded number for instruction size.
- if (Count % 4) return false;
+const MCFixupKindInfo &MipsAsmBackend::
+getFixupKindInfo(MCFixupKind Kind) const {
+ const static MCFixupKindInfo Infos[Mips::NumTargetFixupKinds] = {
+ // This table *must* be in same the order of fixup_* kinds in
+ // MipsFixupKinds.h.
+ //
+ // name offset bits flags
+ { "fixup_Mips_16", 0, 16, 0 },
+ { "fixup_Mips_32", 0, 32, 0 },
+ { "fixup_Mips_REL32", 0, 32, 0 },
+ { "fixup_Mips_26", 0, 26, 0 },
+ { "fixup_Mips_HI16", 0, 16, 0 },
+ { "fixup_Mips_LO16", 0, 16, 0 },
+ { "fixup_Mips_GPREL16", 0, 16, 0 },
+ { "fixup_Mips_LITERAL", 0, 16, 0 },
+ { "fixup_Mips_GOT_Global", 0, 16, 0 },
+ { "fixup_Mips_GOT_Local", 0, 16, 0 },
+ { "fixup_Mips_PC16", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Mips_CALL16", 0, 16, 0 },
+ { "fixup_Mips_GPREL32", 0, 32, 0 },
+ { "fixup_Mips_SHIFT5", 6, 5, 0 },
+ { "fixup_Mips_SHIFT6", 6, 5, 0 },
+ { "fixup_Mips_64", 0, 64, 0 },
+ { "fixup_Mips_TLSGD", 0, 16, 0 },
+ { "fixup_Mips_GOTTPREL", 0, 16, 0 },
+ { "fixup_Mips_TPREL_HI", 0, 16, 0 },
+ { "fixup_Mips_TPREL_LO", 0, 16, 0 },
+ { "fixup_Mips_TLSLDM", 0, 16, 0 },
+ { "fixup_Mips_DTPREL_HI", 0, 16, 0 },
+ { "fixup_Mips_DTPREL_LO", 0, 16, 0 },
+ { "fixup_Mips_Branch_PCRel", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Mips_GPOFF_HI", 0, 16, 0 },
+ { "fixup_Mips_GPOFF_LO", 0, 16, 0 },
+ { "fixup_Mips_GOT_PAGE", 0, 16, 0 },
+ { "fixup_Mips_GOT_OFST", 0, 16, 0 },
+ { "fixup_Mips_GOT_DISP", 0, 16, 0 },
+ { "fixup_Mips_HIGHER", 0, 16, 0 },
+ { "fixup_Mips_HIGHEST", 0, 16, 0 },
+ { "fixup_Mips_GOT_HI16", 0, 16, 0 },
+ { "fixup_Mips_GOT_LO16", 0, 16, 0 },
+ { "fixup_Mips_CALL_HI16", 0, 16, 0 },
+ { "fixup_Mips_CALL_LO16", 0, 16, 0 },
+ { "fixup_MICROMIPS_26_S1", 0, 26, 0 },
+ { "fixup_MICROMIPS_HI16", 0, 16, 0 },
+ { "fixup_MICROMIPS_LO16", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOT16", 0, 16, 0 },
+ { "fixup_MICROMIPS_PC16_S1", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_CALL16", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOT_DISP", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOT_PAGE", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOT_OFST", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_GD", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_LDM", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_DTPREL_HI16", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_DTPREL_LO16", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_TPREL_HI16", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_TPREL_LO16", 0, 16, 0 }
+ };
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+}
- uint64_t NumNops = Count / 4;
- for (uint64_t i = 0; i != NumNops; ++i)
- OW->Write32(0);
- return true;
- }
-}; // class MipsAsmBackend
+/// WriteNopData - Write an (optimal) nop sequence of Count bytes
+/// to the given output. If the target cannot generate such a sequence,
+/// it should return an error.
+///
+/// \return - True on success.
+bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ // Check for a less than instruction size number of bytes
+ // FIXME: 16 bit instructions are not handled yet here.
+ // We shouldn't be using a hard coded number for instruction size.
+ if (Count % 4) return false;
+
+ uint64_t NumNops = Count / 4;
+ for (uint64_t i = 0; i != NumNops; ++i)
+ OW->Write32(0);
+ return true;
+}
-} // namespace
+/// processFixupValue - Target hook to process the literal value of a fixup
+/// if necessary.
+void MipsAsmBackend::processFixupValue(const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFixup &Fixup,
+ const MCFragment *DF,
+ const MCValue &Target,
+ uint64_t &Value,
+ bool &IsResolved) {
+ // At this point we'll ignore the value returned by adjustFixupValue as
+ // we are only checking if the fixup can be applied correctly. We have
+ // access to MCContext from here which allows us to report a fatal error
+ // with *possibly* a source code location.
+ (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
+}
// MCAsmBackend
MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T,
@@ -309,4 +319,3 @@ MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T,
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/false, /*Is64Bit*/true);
}
-
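The writeNopData change above pads only whole 4-byte instruction slots with the all-zero word (the MIPS nop encoding). A minimal standalone sketch of the same padding rule, using a plain byte buffer as a hypothetical stand-in for MCObjectWriter (not code from this patch):

    // Sketch only: mirrors the Count % 4 check and the nop loop above.
    // 'Sink' is a hypothetical byte buffer standing in for MCObjectWriter.
    #include <cstdint>
    #include <vector>

    bool writeMipsNops(uint64_t Count, std::vector<uint8_t> &Sink) {
      if (Count % 4)                            // only whole 4-byte nops are handled
        return false;
      for (uint64_t i = 0, e = Count / 4; i != e; ++i)
        Sink.insert(Sink.end(), {0, 0, 0, 0});  // 0x00000000 encodes nop
      return true;
    }

For example, a 12-byte gap produces three nop words, while a 6-byte gap is rejected, matching the patch's behaviour.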
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
new file mode 100644
index 0000000..cc5207a
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -0,0 +1,93 @@
+//===-- MipsAsmBackend.h - Mips Asm Backend ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MipsAsmBackend class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef MIPSASMBACKEND_H
+#define MIPSASMBACKEND_H
+
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/ADT/Triple.h"
+
+namespace llvm {
+
+class MCAssembler;
+struct MCFixupKindInfo;
+class Target;
+class MCObjectWriter;
+
+class MipsAsmBackend : public MCAsmBackend {
+ Triple::OSType OSType;
+ bool IsLittle; // Big or little endian
+ bool Is64Bit; // 32 or 64 bit words
+
+public:
+ MipsAsmBackend(const Target &T, Triple::OSType _OSType, bool _isLittle,
+ bool _is64Bit)
+ : MCAsmBackend(), OSType(_OSType), IsLittle(_isLittle),
+ Is64Bit(_is64Bit) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const;
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const;
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const;
+
+ unsigned getNumFixupKinds() const {
+ return Mips::NumTargetFixupKinds;
+ }
+
+ /// @name Target Relaxation Interfaces
+ /// @{
+
+ /// MayNeedRelaxation - Check whether the given instruction may need
+ /// relaxation.
+ ///
+ /// \param Inst - The instruction to test.
+ bool mayNeedRelaxation(const MCInst &Inst) const {
+ return false;
+ }
+
+ /// fixupNeedsRelaxation - Target specific predicate for whether a given
+ /// fixup requires the associated instruction to be relaxed.
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const {
+ // FIXME.
+ assert(0 && "RelaxInstruction() unimplemented");
+ return false;
+ }
+
+ /// RelaxInstruction - Relax the instruction in the given fragment
+ /// to the next wider instruction.
+ ///
+ /// \param Inst - The instruction to relax, which may be the same
+ /// as the output.
+ /// \param [out] Res On return, the relaxed instruction.
+ void relaxInstruction(const MCInst &Inst, MCInst &Res) const {}
+
+ /// @}
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
+
+ void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved);
+
+}; // class MipsAsmBackend
+
+} // namespace
+
+#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
index 7a55efd..d2323dc 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -120,34 +120,6 @@ namespace MipsII {
FormMask = 15
};
}
-
-inline static std::pair<const MCSymbolRefExpr*, int64_t>
-MipsGetSymAndOffset(const MCFixup &Fixup) {
- MCFixupKind FixupKind = Fixup.getKind();
-
- if ((FixupKind < FirstTargetFixupKind) ||
- (FixupKind >= MCFixupKind(Mips::LastTargetFixupKind)))
- return std::make_pair((const MCSymbolRefExpr*)0, (int64_t)0);
-
- const MCExpr *Expr = Fixup.getValue();
- MCExpr::ExprKind Kind = Expr->getKind();
-
- if (Kind == MCExpr::Binary) {
- const MCBinaryExpr *BE = static_cast<const MCBinaryExpr*>(Expr);
- const MCExpr *LHS = BE->getLHS();
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS());
-
- if ((LHS->getKind() != MCExpr::SymbolRef) || !CE)
- return std::make_pair((const MCSymbolRefExpr*)0, (int64_t)0);
-
- return std::make_pair(cast<MCSymbolRefExpr>(LHS), CE->getValue());
- }
-
- if (Kind != MCExpr::SymbolRef)
- return std::make_pair((const MCSymbolRefExpr*)0, (int64_t)0);
-
- return std::make_pair(cast<MCSymbolRefExpr>(Expr), 0);
-}
}
#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 83c7d4b..794978b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -21,17 +21,6 @@
using namespace llvm;
namespace {
- struct RelEntry {
- RelEntry(const ELFRelocationEntry &R, const MCSymbol *S, int64_t O) :
- Reloc(R), Sym(S), Offset(O) {}
- ELFRelocationEntry Reloc;
- const MCSymbol *Sym;
- int64_t Offset;
- };
-
- typedef std::list<RelEntry> RelLs;
- typedef RelLs::iterator RelLsIter;
-
class MipsELFObjectWriter : public MCELFObjectTargetWriter {
public:
MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
@@ -39,16 +28,9 @@ namespace {
virtual ~MipsELFObjectWriter();
- virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel, bool IsRelocWithSymbol,
- int64_t Addend) const;
- virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
- const MCValue &Target,
- const MCFragment &F,
- const MCFixup &Fixup,
- bool IsPCRel) const;
- virtual void sortRelocs(const MCAssembler &Asm,
- std::vector<ELFRelocationEntry> &Relocs);
+ unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel) const override;
+ bool needsRelocateWithSymbol(unsigned Type) const override;
};
}
@@ -60,26 +42,9 @@ MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
MipsELFObjectWriter::~MipsELFObjectWriter() {}
-const MCSymbol *MipsELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm,
- const MCValue &Target,
- const MCFragment &F,
- const MCFixup &Fixup,
- bool IsPCRel) const {
- assert(Target.getSymA() && "SymA cannot be 0.");
- const MCSymbol &Sym = Target.getSymA()->getSymbol().AliasedSymbol();
-
- if (Sym.getSection().getKind().isMergeableCString() ||
- Sym.getSection().getKind().isMergeableConst())
- return &Sym;
-
- return NULL;
-}
-
unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
const MCFixup &Fixup,
- bool IsPCRel,
- bool IsRelocWithSymbol,
- int64_t Addend) const {
+ bool IsPCRel) const {
// determine the type of the relocation
unsigned Type = (unsigned)ELF::R_MIPS_NONE;
unsigned Kind = (unsigned)Fixup.getKind();
@@ -210,6 +175,12 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
case Mips::fixup_MICROMIPS_GOT_OFST:
Type = ELF::R_MICROMIPS_GOT_OFST;
break;
+ case Mips::fixup_MICROMIPS_TLS_GD:
+ Type = ELF::R_MICROMIPS_TLS_GD;
+ break;
+ case Mips::fixup_MICROMIPS_TLS_LDM:
+ Type = ELF::R_MICROMIPS_TLS_LDM;
+ break;
case Mips::fixup_MICROMIPS_TLS_DTPREL_HI16:
Type = ELF::R_MICROMIPS_TLS_DTPREL_HI16;
break;
@@ -226,91 +197,39 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
return Type;
}
-// Return true if R is either a GOT16 against a local symbol or HI16.
-static bool NeedsMatchingLo(const MCAssembler &Asm, const RelEntry &R) {
- if (!R.Sym)
- return false;
-
- MCSymbolData &SD = Asm.getSymbolData(R.Sym->AliasedSymbol());
-
- return ((R.Reloc.Type == ELF::R_MIPS_GOT16) && !SD.isExternal()) ||
- (R.Reloc.Type == ELF::R_MIPS_HI16);
-}
-
-static bool HasMatchingLo(const MCAssembler &Asm, RelLsIter I, RelLsIter Last) {
- if (I == Last)
+bool
+MipsELFObjectWriter::needsRelocateWithSymbol(unsigned Type) const {
+ // FIXME: This is extremely conservative. This really needs to use a
+ // whitelist with a clear explanation for why each relocation needs to
+ // point to the symbol, not to the section.
+ switch (Type) {
+ default:
+ return true;
+
+ case ELF::R_MIPS_GOT16:
+ case ELF::R_MIPS16_GOT16:
+ case ELF::R_MICROMIPS_GOT16:
+ llvm_unreachable("Should have been handled already");
+
+ // These relocations might be paired with another relocation. The pairing is
+ // done by the static linker by matching the symbol. Since we only see one
+ // relocation at a time, we have to force them to relocate with a symbol to
+ // avoid ending up with a pair where one points to a section and another
+ // points to a symbol.
+ case ELF::R_MIPS_HI16:
+ case ELF::R_MIPS16_HI16:
+ case ELF::R_MICROMIPS_HI16:
+ case ELF::R_MIPS_LO16:
+ case ELF::R_MIPS16_LO16:
+ case ELF::R_MICROMIPS_LO16:
+ return true;
+
+ case ELF::R_MIPS_26:
+ case ELF::R_MIPS_32:
+ case ELF::R_MIPS_64:
+ case ELF::R_MIPS_GPREL16:
return false;
-
- RelLsIter Hi = I++;
-
- return (I->Reloc.Type == ELF::R_MIPS_LO16) && (Hi->Sym == I->Sym) &&
- (Hi->Offset == I->Offset);
-}
-
-static bool HasSameSymbol(const RelEntry &R0, const RelEntry &R1) {
- return R0.Sym == R1.Sym;
-}
-
-static int CompareOffset(const RelEntry &R0, const RelEntry &R1) {
- return (R0.Offset > R1.Offset) ? 1 : ((R0.Offset == R1.Offset) ? 0 : -1);
-}
-
-void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
- std::vector<ELFRelocationEntry> &Relocs) {
- // Call the default function first. Relocations are sorted in descending
- // order of r_offset.
- MCELFObjectTargetWriter::sortRelocs(Asm, Relocs);
-
- RelLs RelocLs;
- std::vector<RelLsIter> Unmatched;
-
- // Fill RelocLs. Traverse Relocs backwards so that relocations in RelocLs
- // are in ascending order of r_offset.
- for (std::vector<ELFRelocationEntry>::reverse_iterator R = Relocs.rbegin();
- R != Relocs.rend(); ++R) {
- std::pair<const MCSymbolRefExpr*, int64_t> P =
- MipsGetSymAndOffset(*R->Fixup);
- RelocLs.push_back(RelEntry(*R, P.first ? &P.first->getSymbol() : 0,
- P.second));
- }
-
- // Get list of unmatched HI16 and GOT16.
- for (RelLsIter R = RelocLs.begin(); R != RelocLs.end(); ++R)
- if (NeedsMatchingLo(Asm, *R) && !HasMatchingLo(Asm, R, --RelocLs.end()))
- Unmatched.push_back(R);
-
- // Insert unmatched HI16 and GOT16 immediately before their matching LO16.
- for (std::vector<RelLsIter>::iterator U = Unmatched.begin();
- U != Unmatched.end(); ++U) {
- RelLsIter LoPos = RelocLs.end(), HiPos = *U;
- bool MatchedLo = false;
-
- for (RelLsIter R = RelocLs.begin(); R != RelocLs.end(); ++R) {
- if ((R->Reloc.Type == ELF::R_MIPS_LO16) && HasSameSymbol(*HiPos, *R) &&
- (CompareOffset(*R, *HiPos) >= 0) &&
- ((LoPos == RelocLs.end()) || ((CompareOffset(*R, *LoPos) < 0)) ||
- (!MatchedLo && !CompareOffset(*R, *LoPos))))
- LoPos = R;
-
- MatchedLo = NeedsMatchingLo(Asm, *R) &&
- HasMatchingLo(Asm, R, --RelocLs.end());
- }
-
- // If a matching LoPos was found, move HiPos and insert it before LoPos.
- // Make the offsets of HiPos and LoPos match.
- if (LoPos != RelocLs.end()) {
- HiPos->Offset = LoPos->Offset;
- RelocLs.insert(LoPos, *HiPos);
- RelocLs.erase(HiPos);
- }
}
-
- // Put the sorted list back in reverse order.
- assert(Relocs.size() == RelocLs.size());
- unsigned I = RelocLs.size();
-
- for (RelLsIter R = RelocLs.begin(); R != RelocLs.end(); ++R)
- Relocs[--I] = R->Reloc;
}
MCObjectWriter *llvm::createMipsELFObjectWriter(raw_ostream &OS,
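The HI16/LO16 pairing rule above follows from how MIPS materializes a 32-bit address in two halves: the %lo part is sign-extended when it is added back, so the %hi part carries a rounding term, and the two relocations only make sense when resolved against the same symbol. A small sketch of the standard split (general MIPS background, not part of this patch):

    // Standard %hi/%lo split: Hi * 0x10000 + (int16_t)Lo == Addr, because the
    // low half is sign-extended by the consuming addiu/load/store.
    #include <cstdint>

    void splitHiLo(uint32_t Addr, uint16_t &Hi, uint16_t &Lo) {
      Lo = static_cast<uint16_t>(Addr & 0xFFFF);
      Hi = static_cast<uint16_t>((Addr + 0x8000) >> 16);
    }

For Addr = 0x12348000 this yields Hi = 0x1235 and Lo = 0x8000 (which sign-extends to -0x8000), so 0x12350000 - 0x8000 recovers the original address; resolving the two halves against different symbols would break this relationship.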
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
new file mode 100644
index 0000000..fe37829
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -0,0 +1,19 @@
+//===-------- MipsELFStreamer.cpp - ELF Object Output ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsELFStreamer.h"
+
+namespace llvm {
+MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_ostream &OS, MCCodeEmitter *Emitter,
+ const MCSubtargetInfo &STI, bool RelaxAll,
+ bool NoExecStack) {
+ return new MipsELFStreamer(Context, MAB, OS, Emitter, STI);
+}
+}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
new file mode 100644
index 0000000..641f8cf
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -0,0 +1,42 @@
+//===-------- MipsELFStreamer.h - ELF Object Output -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCELFStreamer which allows us to insert some hooks before
+// emitting data into an actual object file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSELFSTREAMER_H
+#define MIPSELFSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCSubtargetInfo;
+
+class MipsELFStreamer : public MCELFStreamer {
+
+public:
+ MipsELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS,
+ MCCodeEmitter *Emitter, const MCSubtargetInfo &STI)
+ : MCELFStreamer(Context, MAB, OS, Emitter) {}
+
+ virtual ~MipsELFStreamer() {}
+};
+
+MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_ostream &OS, MCCodeEmitter *Emitter,
+ const MCSubtargetInfo &STI, bool RelaxAll,
+ bool NoExecStack);
+} // namespace llvm.
+#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index 6ed44b7..dc6192c 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -155,6 +155,12 @@ namespace Mips {
// resulting in - R_MICROMIPS_GOT_OFST
fixup_MICROMIPS_GOT_OFST,
+ // resulting in - R_MICROMIPS_TLS_GD
+ fixup_MICROMIPS_TLS_GD,
+
+ // resulting in - R_MICROMIPS_TLS_LDM
+ fixup_MICROMIPS_TLS_LDM,
+
// resulting in - R_MICROMIPS_TLS_DTPREL_HI16
fixup_MICROMIPS_TLS_DTPREL_HI16,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 66428bd..edd2146 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -11,137 +11,42 @@
//
//===----------------------------------------------------------------------===//
//
+
#define DEBUG_TYPE "mccodeemitter"
-#include "MCTargetDesc/MipsBaseInfo.h"
+
+#include "MipsMCCodeEmitter.h"
#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/raw_ostream.h"
#define GET_INSTRMAP_INFO
#include "MipsGenInstrInfo.inc"
-
-using namespace llvm;
-
-namespace {
-class MipsMCCodeEmitter : public MCCodeEmitter {
- MipsMCCodeEmitter(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
- void operator=(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
- const MCInstrInfo &MCII;
- MCContext &Ctx;
- const MCSubtargetInfo &STI;
- bool IsLittleEndian;
- bool IsMicroMips;
-
-public:
- MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_,
- const MCSubtargetInfo &sti, bool IsLittle) :
- MCII(mcii), Ctx(Ctx_), STI (sti), IsLittleEndian(IsLittle) {
- IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
- }
-
- ~MipsMCCodeEmitter() {}
-
- void EmitByte(unsigned char C, raw_ostream &OS) const {
- OS << (char)C;
- }
-
- void EmitInstruction(uint64_t Val, unsigned Size, raw_ostream &OS) const {
- // Output the instruction encoding in little endian byte order.
- // Little-endian byte ordering:
- // mips32r2: 4 | 3 | 2 | 1
- // microMIPS: 2 | 1 | 4 | 3
- if (IsLittleEndian && Size == 4 && IsMicroMips) {
- EmitInstruction(Val>>16, 2, OS);
- EmitInstruction(Val, 2, OS);
- } else {
- for (unsigned i = 0; i < Size; ++i) {
- unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
- EmitByte((Val >> Shift) & 0xff, OS);
- }
- }
- }
-
- void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const;
-
- // getBinaryCodeForInstr - TableGen'erated function for getting the
- // binary encoding for an instruction.
- uint64_t getBinaryCodeForInstr(const MCInst &MI,
- SmallVectorImpl<MCFixup> &Fixups) const;
-
- // getBranchJumpOpValue - Return binary encoding of the jump
- // target operand. If the machine operand requires relocation,
- // record the relocation and return zero.
- unsigned getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
-
- // getBranchJumpOpValueMM - Return binary encoding of the microMIPS jump
- // target operand. If the machine operand requires relocation,
- // record the relocation and return zero.
- unsigned getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
-
- // getBranchTargetOpValue - Return binary encoding of the branch
- // target operand. If the machine operand requires relocation,
- // record the relocation and return zero.
- unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
-
- // getBranchTargetOpValue - Return binary encoding of the microMIPS branch
- // target operand. If the machine operand requires relocation,
- // record the relocation and return zero.
- unsigned getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
-
- // getMachineOpValue - Return binary encoding of operand. If the machin
- // operand requires relocation, record the relocation and return zero.
- unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const;
-
- unsigned getMemEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
- unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
- unsigned getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
- unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
-
- // getLSAImmEncoding - Return binary encoding of LSA immediate.
- unsigned getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
-
- unsigned
- getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const;
-
-}; // class MipsMCCodeEmitter
-} // namespace
-
-MCCodeEmitter *llvm::createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI,
- MCContext &Ctx)
-{
- return new MipsMCCodeEmitter(MCII, Ctx, STI, false);
+#undef GET_INSTRMAP_INFO
+
+namespace llvm {
+MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new MipsMCCodeEmitter(MCII, Ctx, false);
}
-MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI,
- MCContext &Ctx)
-{
- return new MipsMCCodeEmitter(MCII, Ctx, STI, true);
+MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new MipsMCCodeEmitter(MCII, Ctx, true);
}
-
+} // End of namespace llvm.
// If the D<shift> instruction has a shift amount that is greater
// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
@@ -208,11 +113,38 @@ static void LowerDextDins(MCInst& InstIn) {
return;
}
+bool MipsMCCodeEmitter::isMicroMips(const MCSubtargetInfo &STI) const {
+ return STI.getFeatureBits() & Mips::FeatureMicroMips;
+}
+
+void MipsMCCodeEmitter::EmitByte(unsigned char C, raw_ostream &OS) const {
+ OS << (char)C;
+}
+
+void MipsMCCodeEmitter::EmitInstruction(uint64_t Val, unsigned Size,
+ const MCSubtargetInfo &STI,
+ raw_ostream &OS) const {
+ // Output the instruction encoding in little endian byte order.
+ // Little-endian byte ordering:
+ // mips32r2: 4 | 3 | 2 | 1
+ // microMIPS: 2 | 1 | 4 | 3
+ if (IsLittleEndian && Size == 4 && isMicroMips(STI)) {
+ EmitInstruction(Val >> 16, 2, STI, OS);
+ EmitInstruction(Val, 2, STI, OS);
+ } else {
+ for (unsigned i = 0; i < Size; ++i) {
+ unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
+ EmitByte((Val >> Shift) & 0xff, OS);
+ }
+ }
+}
+
/// EncodeInstruction - Emit the instruction.
/// Size the instruction with Desc.getSize().
void MipsMCCodeEmitter::
EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const
{
// Non-pseudo instructions that get changed for direct object
@@ -235,7 +167,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
}
unsigned long N = Fixups.size();
- uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups);
+ uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
// Check for unimplemented opcodes.
// Unfortunately in MIPS both NOP and SLL will come in with Binary == 0
@@ -251,7 +183,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
Fixups.pop_back();
Opcode = NewOpcode;
TmpInst.setOpcode (NewOpcode);
- Binary = getBinaryCodeForInstr(TmpInst, Fixups);
+ Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
}
}
@@ -262,7 +194,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
if (!Size)
llvm_unreachable("Desc.getSize() returns 0");
- EmitInstruction(Binary, Size, OS);
+ EmitInstruction(Binary, Size, STI, OS);
}
/// getBranchTargetOpValue - Return binary encoding of the branch
@@ -270,7 +202,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
/// record the relocation and return zero.
unsigned MipsMCCodeEmitter::
getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
@@ -291,7 +224,8 @@ getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
/// record the relocation and return zero.
unsigned MipsMCCodeEmitter::
getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
@@ -313,7 +247,8 @@ getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
/// record the relocation and return zero.
unsigned MipsMCCodeEmitter::
getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
@@ -330,7 +265,8 @@ getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
unsigned MipsMCCodeEmitter::
getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -346,7 +282,8 @@ getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
}
unsigned MipsMCCodeEmitter::
-getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const {
+getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
int64_t Res;
if (Expr->EvaluateAsAbsolute(Res))
@@ -358,101 +295,129 @@ getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const {
}
if (Kind == MCExpr::Binary) {
- unsigned Res = getExprOpValue(cast<MCBinaryExpr>(Expr)->getLHS(), Fixups);
- Res += getExprOpValue(cast<MCBinaryExpr>(Expr)->getRHS(), Fixups);
+ unsigned Res = getExprOpValue(cast<MCBinaryExpr>(Expr)->getLHS(), Fixups, STI);
+ Res += getExprOpValue(cast<MCBinaryExpr>(Expr)->getRHS(), Fixups, STI);
return Res;
}
- if (Kind == MCExpr::SymbolRef) {
- Mips::Fixups FixupKind = Mips::Fixups(0);
- switch(cast<MCSymbolRefExpr>(Expr)->getKind()) {
- default: llvm_unreachable("Unknown fixup kind!");
- break;
- case MCSymbolRefExpr::VK_Mips_GPOFF_HI :
- FixupKind = Mips::fixup_Mips_GPOFF_HI;
- break;
- case MCSymbolRefExpr::VK_Mips_GPOFF_LO :
- FixupKind = Mips::fixup_Mips_GPOFF_LO;
- break;
- case MCSymbolRefExpr::VK_Mips_GOT_PAGE :
- FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT_PAGE
- : Mips::fixup_Mips_GOT_PAGE;
- break;
- case MCSymbolRefExpr::VK_Mips_GOT_OFST :
- FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT_OFST
- : Mips::fixup_Mips_GOT_OFST;
- break;
- case MCSymbolRefExpr::VK_Mips_GOT_DISP :
- FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT_DISP
- : Mips::fixup_Mips_GOT_DISP;
- break;
- case MCSymbolRefExpr::VK_Mips_GPREL:
- FixupKind = Mips::fixup_Mips_GPREL16;
- break;
- case MCSymbolRefExpr::VK_Mips_GOT_CALL:
- FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_CALL16
- : Mips::fixup_Mips_CALL16;
- break;
- case MCSymbolRefExpr::VK_Mips_GOT16:
- FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT16
- : Mips::fixup_Mips_GOT_Global;
- break;
- case MCSymbolRefExpr::VK_Mips_GOT:
- FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT16
- : Mips::fixup_Mips_GOT_Local;
- break;
- case MCSymbolRefExpr::VK_Mips_ABS_HI:
- FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_HI16
- : Mips::fixup_Mips_HI16;
- break;
- case MCSymbolRefExpr::VK_Mips_ABS_LO:
- FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_LO16
- : Mips::fixup_Mips_LO16;
- break;
- case MCSymbolRefExpr::VK_Mips_TLSGD:
- FixupKind = Mips::fixup_Mips_TLSGD;
- break;
- case MCSymbolRefExpr::VK_Mips_TLSLDM:
- FixupKind = Mips::fixup_Mips_TLSLDM;
- break;
- case MCSymbolRefExpr::VK_Mips_DTPREL_HI:
- FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_DTPREL_HI16
- : Mips::fixup_Mips_DTPREL_HI;
- break;
- case MCSymbolRefExpr::VK_Mips_DTPREL_LO:
- FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_DTPREL_LO16
- : Mips::fixup_Mips_DTPREL_LO;
- break;
- case MCSymbolRefExpr::VK_Mips_GOTTPREL:
- FixupKind = Mips::fixup_Mips_GOTTPREL;
- break;
- case MCSymbolRefExpr::VK_Mips_TPREL_HI:
- FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_TPREL_HI16
- : Mips::fixup_Mips_TPREL_HI;
- break;
- case MCSymbolRefExpr::VK_Mips_TPREL_LO:
- FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_TPREL_LO16
- : Mips::fixup_Mips_TPREL_LO;
- break;
- case MCSymbolRefExpr::VK_Mips_HIGHER:
- FixupKind = Mips::fixup_Mips_HIGHER;
- break;
- case MCSymbolRefExpr::VK_Mips_HIGHEST:
- FixupKind = Mips::fixup_Mips_HIGHEST;
- break;
- case MCSymbolRefExpr::VK_Mips_GOT_HI16:
- FixupKind = Mips::fixup_Mips_GOT_HI16;
- break;
- case MCSymbolRefExpr::VK_Mips_GOT_LO16:
- FixupKind = Mips::fixup_Mips_GOT_LO16;
- break;
- case MCSymbolRefExpr::VK_Mips_CALL_HI16:
- FixupKind = Mips::fixup_Mips_CALL_HI16;
- break;
- case MCSymbolRefExpr::VK_Mips_CALL_LO16:
- FixupKind = Mips::fixup_Mips_CALL_LO16;
- break;
- } // switch
+ if (Kind == MCExpr::Target) {
+ const MipsMCExpr *MipsExpr = cast<MipsMCExpr>(Expr);
+
+ Mips::Fixups FixupKind = Mips::Fixups(0);
+ switch (MipsExpr->getKind()) {
+ default: llvm_unreachable("Unsupported fixup kind for target expression!");
+ case MipsMCExpr::VK_Mips_HIGHEST:
+ FixupKind = Mips::fixup_Mips_HIGHEST;
+ break;
+ case MipsMCExpr::VK_Mips_HIGHER:
+ FixupKind = Mips::fixup_Mips_HIGHER;
+ break;
+ case MipsMCExpr::VK_Mips_HI:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
+ : Mips::fixup_Mips_HI16;
+ break;
+ case MipsMCExpr::VK_Mips_LO:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
+ : Mips::fixup_Mips_LO16;
+ break;
+ }
+ Fixups.push_back(MCFixup::Create(0, MipsExpr, MCFixupKind(FixupKind)));
+ return 0;
+ }
+
+ if (Kind == MCExpr::SymbolRef) {
+ Mips::Fixups FixupKind = Mips::Fixups(0);
+
+ switch(cast<MCSymbolRefExpr>(Expr)->getKind()) {
+ default: llvm_unreachable("Unknown fixup kind!");
+ break;
+ case MCSymbolRefExpr::VK_Mips_GPOFF_HI :
+ FixupKind = Mips::fixup_Mips_GPOFF_HI;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GPOFF_LO :
+ FixupKind = Mips::fixup_Mips_GPOFF_LO;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GOT_PAGE :
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_PAGE
+ : Mips::fixup_Mips_GOT_PAGE;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GOT_OFST :
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_OFST
+ : Mips::fixup_Mips_GOT_OFST;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GOT_DISP :
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_DISP
+ : Mips::fixup_Mips_GOT_DISP;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GPREL:
+ FixupKind = Mips::fixup_Mips_GPREL16;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GOT_CALL:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_CALL16
+ : Mips::fixup_Mips_CALL16;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GOT16:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT16
+ : Mips::fixup_Mips_GOT_Global;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GOT:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT16
+ : Mips::fixup_Mips_GOT_Local;
+ break;
+ case MCSymbolRefExpr::VK_Mips_ABS_HI:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
+ : Mips::fixup_Mips_HI16;
+ break;
+ case MCSymbolRefExpr::VK_Mips_ABS_LO:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
+ : Mips::fixup_Mips_LO16;
+ break;
+ case MCSymbolRefExpr::VK_Mips_TLSGD:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_GD
+ : Mips::fixup_Mips_TLSGD;
+ break;
+ case MCSymbolRefExpr::VK_Mips_TLSLDM:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_LDM
+ : Mips::fixup_Mips_TLSLDM;
+ break;
+ case MCSymbolRefExpr::VK_Mips_DTPREL_HI:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_HI16
+ : Mips::fixup_Mips_DTPREL_HI;
+ break;
+ case MCSymbolRefExpr::VK_Mips_DTPREL_LO:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_LO16
+ : Mips::fixup_Mips_DTPREL_LO;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GOTTPREL:
+ FixupKind = Mips::fixup_Mips_GOTTPREL;
+ break;
+ case MCSymbolRefExpr::VK_Mips_TPREL_HI:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_HI16
+ : Mips::fixup_Mips_TPREL_HI;
+ break;
+ case MCSymbolRefExpr::VK_Mips_TPREL_LO:
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_LO16
+ : Mips::fixup_Mips_TPREL_LO;
+ break;
+ case MCSymbolRefExpr::VK_Mips_HIGHER:
+ FixupKind = Mips::fixup_Mips_HIGHER;
+ break;
+ case MCSymbolRefExpr::VK_Mips_HIGHEST:
+ FixupKind = Mips::fixup_Mips_HIGHEST;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GOT_HI16:
+ FixupKind = Mips::fixup_Mips_GOT_HI16;
+ break;
+ case MCSymbolRefExpr::VK_Mips_GOT_LO16:
+ FixupKind = Mips::fixup_Mips_GOT_LO16;
+ break;
+ case MCSymbolRefExpr::VK_Mips_CALL_HI16:
+ FixupKind = Mips::fixup_Mips_CALL_HI16;
+ break;
+ case MCSymbolRefExpr::VK_Mips_CALL_LO16:
+ FixupKind = Mips::fixup_Mips_CALL_LO16;
+ break;
+ } // switch
Fixups.push_back(MCFixup::Create(0, Expr, MCFixupKind(FixupKind)));
return 0;
@@ -464,7 +429,8 @@ getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const {
/// operand requires relocation, record the relocation and return zero.
unsigned MipsMCCodeEmitter::
getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
if (MO.isReg()) {
unsigned Reg = MO.getReg();
unsigned RegNo = Ctx.getRegisterInfo()->getEncodingValue(Reg);
@@ -477,38 +443,85 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
}
// MO must be an Expr.
assert(MO.isExpr());
- return getExprOpValue(MO.getExpr(),Fixups);
+ return getExprOpValue(MO.getExpr(),Fixups, STI);
+}
+
+/// getMSAMemEncoding - Return binary encoding of memory operand for LD/ST
+/// instructions.
+unsigned
+MipsMCCodeEmitter::getMSAMemEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
+ assert(MI.getOperand(OpNo).isReg());
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),Fixups, STI) << 16;
+ unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+
+ // The immediate field of an LD/ST instruction is scaled, which means it
+ // must be divided (when encoding) by the size (in bytes) of the
+ // instruction's data format.
+ // .b - 1 byte
+ // .h - 2 bytes
+ // .w - 4 bytes
+ // .d - 8 bytes
+ switch(MI.getOpcode())
+ {
+ default:
+ assert (0 && "Unexpected instruction");
+ break;
+ case Mips::LD_B:
+ case Mips::ST_B:
+ // We don't need to scale the offset in this case
+ break;
+ case Mips::LD_H:
+ case Mips::ST_H:
+ OffBits >>= 1;
+ break;
+ case Mips::LD_W:
+ case Mips::ST_W:
+ OffBits >>= 2;
+ break;
+ case Mips::LD_D:
+ case Mips::ST_D:
+ OffBits >>= 3;
+ break;
+ }
+
+ return (OffBits & 0xFFFF) | RegBits;
}
/// getMemEncoding - Return binary encoding of memory related operand.
/// If the offset operand requires relocation, record the relocation.
unsigned
MipsMCCodeEmitter::getMemEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
assert(MI.getOperand(OpNo).isReg());
- unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),Fixups) << 16;
- unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups);
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),Fixups, STI) << 16;
+ unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
return (OffBits & 0xFFFF) | RegBits;
}
unsigned MipsMCCodeEmitter::
getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// Base register is encoded in bits 20-16, offset is encoded in bits 11-0.
assert(MI.getOperand(OpNo).isReg());
- unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups) << 16;
- unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups);
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) << 16;
+ unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
return (OffBits & 0x0FFF) | RegBits;
}
unsigned
MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
assert(MI.getOperand(OpNo).isImm());
- unsigned SizeEncoding = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups);
+ unsigned SizeEncoding = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
return SizeEncoding - 1;
}
@@ -516,21 +529,23 @@ MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
//
unsigned
MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
assert(MI.getOperand(OpNo-1).isImm());
assert(MI.getOperand(OpNo).isImm());
- unsigned Position = getMachineOpValue(MI, MI.getOperand(OpNo-1), Fixups);
- unsigned Size = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups);
+ unsigned Position = getMachineOpValue(MI, MI.getOperand(OpNo-1), Fixups, STI);
+ unsigned Size = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
return Position + Size - 1;
}
unsigned
MipsMCCodeEmitter::getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
assert(MI.getOperand(OpNo).isImm());
// The immediate is encoded as 'immediate - 1'.
- return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups) - 1;
+ return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) - 1;
}
#include "MipsGenMCCodeEmitter.inc"
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
new file mode 100644
index 0000000..49a2490
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -0,0 +1,125 @@
+//===-- MipsMCCodeEmitter.h - Convert Mips Code to Machine Code -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MipsMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef MIPS_MC_CODE_EMITTER_H
+#define MIPS_MC_CODE_EMITTER_H
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/Support/DataTypes.h"
+
+using namespace llvm;
+
+namespace llvm {
+class MCContext;
+class MCExpr;
+class MCInst;
+class MCInstrInfo;
+class MCFixup;
+class MCOperand;
+class MCSubtargetInfo;
+class raw_ostream;
+
+class MipsMCCodeEmitter : public MCCodeEmitter {
+ MipsMCCodeEmitter(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ const MCInstrInfo &MCII;
+ MCContext &Ctx;
+ bool IsLittleEndian;
+
+ bool isMicroMips(const MCSubtargetInfo &STI) const;
+
+public:
+ MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_, bool IsLittle)
+ : MCII(mcii), Ctx(Ctx_), IsLittleEndian(IsLittle) {}
+
+ ~MipsMCCodeEmitter() {}
+
+ void EmitByte(unsigned char C, raw_ostream &OS) const;
+
+ void EmitInstruction(uint64_t Val, unsigned Size, const MCSubtargetInfo &STI,
+ raw_ostream &OS) const;
+
+ void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBinaryCodeForInstr - TableGen'erated function for getting the
+ // binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchJumpOpValue - Return binary encoding of the jump
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchJumpOpValueMM - Return binary encoding of the microMIPS jump
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTargetOpValue - Return binary encoding of the branch
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBranchTargetOpValue - Return binary encoding of the microMIPS branch
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getMachineOpValue - Return binary encoding of operand. If the machine
+ // operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getMSAMemEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getMemEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getLSAImmEncoding - Return binary encoding of LSA immediate.
+ unsigned getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+}; // class MipsMCCodeEmitter
+} // namespace llvm.
+
+#endif
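getMSAMemEncoding, declared above and defined in MipsMCCodeEmitter.cpp, stores the memory offset in element-size units rather than bytes. As a worked example, an ld.w with a byte offset of 32 encodes 32 >> 2 = 8 in the immediate field. A hedged sketch of just the scaling step (helper name is illustrative, not from the patch):

    // Sketch: scale a byte offset into the element-sized immediate used by
    // MSA ld/st encodings (.b/.h/.w/.d -> 1/2/4/8-byte elements).
    #include <cassert>

    unsigned scaleMSAOffset(unsigned ByteOffset, unsigned ElemBytes) {
      unsigned Shift;
      switch (ElemBytes) {
      case 1: Shift = 0; break;   // ld.b / st.b
      case 2: Shift = 1; break;   // ld.h / st.h
      case 4: Shift = 2; break;   // ld.w / st.w
      case 8: Shift = 3; break;   // ld.d / st.d
      default: assert(false && "unexpected element size"); Shift = 0;
      }
      return (ByteOffset >> Shift) & 0xFFFF;
    }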
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
new file mode 100644
index 0000000..c7ba12d
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -0,0 +1,114 @@
+//===-- MipsMCExpr.cpp - Mips specific MC expression classes --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mipsmcexpr"
+#include "MipsMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+
+using namespace llvm;
+
+bool MipsMCExpr::isSupportedBinaryExpr(MCSymbolRefExpr::VariantKind VK,
+ const MCBinaryExpr *BE) {
+ switch (VK) {
+ case MCSymbolRefExpr::VK_Mips_ABS_LO:
+ case MCSymbolRefExpr::VK_Mips_ABS_HI:
+ case MCSymbolRefExpr::VK_Mips_HIGHER:
+ case MCSymbolRefExpr::VK_Mips_HIGHEST:
+ break;
+ default:
+ return false;
+ }
+
+ // We support expressions of the form "(sym1 binop1 sym2) binop2 const",
+ // where "binop2 const" is optional.
+ if (isa<MCBinaryExpr>(BE->getLHS())) {
+ if (!isa<MCConstantExpr>(BE->getRHS()))
+ return false;
+ BE = cast<MCBinaryExpr>(BE->getLHS());
+ }
+ return (isa<MCSymbolRefExpr>(BE->getLHS())
+ && isa<MCSymbolRefExpr>(BE->getRHS()));
+}
+
+const MipsMCExpr*
+MipsMCExpr::Create(MCSymbolRefExpr::VariantKind VK, const MCExpr *Expr,
+ MCContext &Ctx) {
+ VariantKind Kind;
+ switch (VK) {
+ case MCSymbolRefExpr::VK_Mips_ABS_LO:
+ Kind = VK_Mips_LO;
+ break;
+ case MCSymbolRefExpr::VK_Mips_ABS_HI:
+ Kind = VK_Mips_HI;
+ break;
+ case MCSymbolRefExpr::VK_Mips_HIGHER:
+ Kind = VK_Mips_HIGHER;
+ break;
+ case MCSymbolRefExpr::VK_Mips_HIGHEST:
+ Kind = VK_Mips_HIGHEST;
+ break;
+ default:
+ llvm_unreachable("Invalid kind!");
+ }
+
+ return new (Ctx) MipsMCExpr(Kind, Expr);
+}
+
+void MipsMCExpr::PrintImpl(raw_ostream &OS) const {
+ switch (Kind) {
+ default: llvm_unreachable("Invalid kind!");
+ case VK_Mips_LO: OS << "%lo"; break;
+ case VK_Mips_HI: OS << "%hi"; break;
+ case VK_Mips_HIGHER: OS << "%higher"; break;
+ case VK_Mips_HIGHEST: OS << "%highest"; break;
+ }
+
+ OS << '(';
+ Expr->print(OS);
+ OS << ')';
+}
+
+bool
+MipsMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout) const {
+ return getSubExpr()->EvaluateAsRelocatable(Res, Layout);
+}
+
+// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
+// that method should be made public?
+static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
+ switch (Value->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Can't handle nested target expr!");
+
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+ AddValueSymbolsImpl(BE->getLHS(), Asm);
+ AddValueSymbolsImpl(BE->getRHS(), Asm);
+ break;
+ }
+
+ case MCExpr::SymbolRef:
+ Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
+ break;
+
+ case MCExpr::Unary:
+ AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void MipsMCExpr::AddValueSymbols(MCAssembler *Asm) const {
+ AddValueSymbolsImpl(getSubExpr(), Asm);
+}
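A short usage sketch for the Create factory above (names Ref, Ctx, and wrapHi are assumptions for illustration, not code from the patch): wrapping an expression this way makes the result report MCExpr::Target, so MipsMCCodeEmitter::getExprOpValue takes the new MipsMCExpr path and selects the %hi fixup.

    // Hedged sketch: 'Ref' is an already-built expression for the symbol
    // operand and 'Ctx' is the MCContext; both are assumed to exist.
    #include "MCTargetDesc/MipsMCExpr.h"

    const llvm::MipsMCExpr *wrapHi(const llvm::MCExpr *Ref, llvm::MCContext &Ctx) {
      // getKind() on the result is MipsMCExpr::VK_Mips_HI; the code emitter
      // maps that to fixup_Mips_HI16 (or fixup_MICROMIPS_HI16 on microMIPS).
      return llvm::MipsMCExpr::Create(llvm::MCSymbolRefExpr::VK_Mips_ABS_HI,
                                      Ref, Ctx);
    }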
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
new file mode 100644
index 0000000..722bba7
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -0,0 +1,66 @@
+//===-- MipsMCExpr.h - Mips specific MC expression classes ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSMCEXPR_H
+#define MIPSMCEXPR_H
+
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+namespace llvm {
+
+class MipsMCExpr : public MCTargetExpr {
+public:
+ enum VariantKind {
+ VK_Mips_None,
+ VK_Mips_LO,
+ VK_Mips_HI,
+ VK_Mips_HIGHER,
+ VK_Mips_HIGHEST
+ };
+
+private:
+ const VariantKind Kind;
+ const MCExpr *Expr;
+
+ explicit MipsMCExpr(VariantKind Kind, const MCExpr *Expr)
+ : Kind(Kind), Expr(Expr) {}
+
+public:
+ static bool isSupportedBinaryExpr(MCSymbolRefExpr::VariantKind VK,
+ const MCBinaryExpr *BE);
+
+ static const MipsMCExpr *Create(MCSymbolRefExpr::VariantKind VK,
+ const MCExpr *Expr, MCContext &Ctx);
+
+ /// getKind - Get the kind of this expression.
+ VariantKind getKind() const { return Kind; }
+
+ /// getSubExpr - Get the child of this expression.
+ const MCExpr *getSubExpr() const { return Expr; }
+
+ void PrintImpl(raw_ostream &OS) const;
+ bool EvaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout) const;
+ void AddValueSymbols(MCAssembler *) const;
+ const MCSection *FindAssociatedSection() const {
+ return getSubExpr()->FindAssociatedSection();
+ }
+
+ // There are no TLS MipsMCExprs at the moment.
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
new file mode 100644
index 0000000..6992d06
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
@@ -0,0 +1,33 @@
+//===-- MipsMCNaCl.h - NaCl-related declarations --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSMCNACL_H
+#define MIPSMCNACL_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+
+// Log2 of the NaCl MIPS sandbox's instruction bundle size.
+static const unsigned MIPS_NACL_BUNDLE_ALIGN = 4u;
+
+bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
+ bool *IsStore = NULL);
+bool baseRegNeedsLoadStoreMask(unsigned Reg);
+
+// This function creates an MCELFStreamer for Mips NaCl.
+MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_ostream &OS,
+ MCCodeEmitter *Emitter,
+ const MCSubtargetInfo &STI,
+ bool RelaxAll, bool NoExecStack);
+
+}
+
+#endif
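Since MIPS_NACL_BUNDLE_ALIGN above is a log2 value, the bundle size follows directly from it: 1 << 4 = 16 bytes, i.e. four 4-byte MIPS instructions per bundle. Stated as code (assuming the header above is included, inside namespace llvm or with llvm:: qualification):

    // Derived from the constant above.
    static const unsigned MipsNaClBundleBytes = 1u << MIPS_NACL_BUNDLE_ALIGN;  // 16 bytes
    static const unsigned MipsNaClBundleInsts = MipsNaClBundleBytes / 4;       // 4 instructions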
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 5548aaa..eecca68 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -11,12 +11,14 @@
//
//===----------------------------------------------------------------------===//
-#include "MipsMCTargetDesc.h"
#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsELFStreamer.h"
#include "MipsMCAsmInfo.h"
+#include "MipsMCNaCl.h"
+#include "MipsMCTargetDesc.h"
#include "MipsTargetStreamer.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCCodeGenInfo.h"
-#include "llvm/MC/MCELF.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -39,36 +41,18 @@
using namespace llvm;
-static std::string ParseMipsTriple(StringRef TT, StringRef CPU) {
- std::string MipsArchFeature;
- size_t DashPosition = 0;
- StringRef TheTriple;
-
- // Let's see if there is a dash, like mips-unknown-linux.
- DashPosition = TT.find('-');
-
- if (DashPosition == StringRef::npos) {
- // No dash, we check the string size.
- TheTriple = TT.substr(0);
- } else {
- // We are only interested in substring before dash.
- TheTriple = TT.substr(0,DashPosition);
- }
-
- if (TheTriple == "mips" || TheTriple == "mipsel") {
- if (CPU.empty() || CPU == "mips32") {
- MipsArchFeature = "+mips32";
- } else if (CPU == "mips32r2") {
- MipsArchFeature = "+mips32r2";
- }
- } else {
- if (CPU.empty() || CPU == "mips64") {
- MipsArchFeature = "+mips64";
- } else if (CPU == "mips64r2") {
- MipsArchFeature = "+mips64r2";
- }
+/// Select the Mips CPU for the given triple and cpu name.
+/// FIXME: Merge with the copy in MipsSubtarget.cpp
+static inline StringRef selectMipsCPU(StringRef TT, StringRef CPU) {
+ if (CPU.empty() || CPU == "generic") {
+ Triple TheTriple(TT);
+ if (TheTriple.getArch() == Triple::mips ||
+ TheTriple.getArch() == Triple::mipsel)
+ CPU = "mips32";
+ else
+ CPU = "mips64";
}
- return MipsArchFeature;
+ return CPU;
}
static MCInstrInfo *createMipsMCInstrInfo() {
@@ -85,15 +69,9 @@ static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) {
static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU,
StringRef FS) {
- std::string ArchFS = ParseMipsTriple(TT,CPU);
- if (!FS.empty()) {
- if (!ArchFS.empty())
- ArchFS = ArchFS + "," + FS.str();
- else
- ArchFS = FS;
- }
+ CPU = selectMipsCPU(TT, CPU);
MCSubtargetInfo *X = new MCSubtargetInfo();
- InitMipsMCSubtargetInfo(X, TT, CPU, ArchFS);
+ InitMipsMCSubtargetInfo(X, TT, CPU, FS);
return X;
}
@@ -131,21 +109,29 @@ static MCInstPrinter *createMipsMCInstPrinter(const Target &T,
static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
MCContext &Context, MCAsmBackend &MAB,
raw_ostream &OS, MCCodeEmitter *Emitter,
+ const MCSubtargetInfo &STI,
bool RelaxAll, bool NoExecStack) {
- MipsTargetELFStreamer *S = new MipsTargetELFStreamer();
- return createELFStreamer(Context, S, MAB, OS, Emitter, RelaxAll, NoExecStack);
+ MCStreamer *S;
+ if (!Triple(TT).isOSNaCl())
+ S = createMipsELFStreamer(Context, MAB, OS, Emitter, STI, RelaxAll,
+ NoExecStack);
+ else
+ S = createMipsNaClELFStreamer(Context, MAB, OS, Emitter, STI, RelaxAll,
+ NoExecStack);
+ new MipsTargetELFStreamer(*S, STI);
+ return S;
}
static MCStreamer *
createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
- bool isVerboseAsm, bool useLoc, bool useCFI,
- bool useDwarfDirectory, MCInstPrinter *InstPrint,
- MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) {
- MipsTargetAsmStreamer *S = new MipsTargetAsmStreamer(OS);
-
- return llvm::createAsmStreamer(Ctx, S, OS, isVerboseAsm, useLoc, useCFI,
- useDwarfDirectory, InstPrint, CE, TAB,
- ShowInst);
+ bool isVerboseAsm, bool useCFI, bool useDwarfDirectory,
+ MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+ MCAsmBackend *TAB, bool ShowInst) {
+ MCStreamer *S =
+ llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory,
+ InstPrint, CE, TAB, ShowInst);
+ new MipsTargetAsmStreamer(*S, OS);
+ return S;
}
extern "C" void LLVMInitializeMipsTargetMC() {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index eabebfe..161d1ea 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -42,14 +42,18 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createMipsAsmBackendEB32(const Target &T, const MCRegisterInfo &MRI,
- StringRef TT, StringRef CPU);
-MCAsmBackend *createMipsAsmBackendEL32(const Target &T, const MCRegisterInfo &MRI,
- StringRef TT, StringRef CPU);
-MCAsmBackend *createMipsAsmBackendEB64(const Target &T, const MCRegisterInfo &MRI,
- StringRef TT, StringRef CPU);
-MCAsmBackend *createMipsAsmBackendEL64(const Target &T, const MCRegisterInfo &MRI,
- StringRef TT, StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEB32(const Target &T,
+ const MCRegisterInfo &MRI, StringRef TT,
+ StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL32(const Target &T,
+ const MCRegisterInfo &MRI, StringRef TT,
+ StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEB64(const Target &T,
+ const MCRegisterInfo &MRI, StringRef TT,
+ StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL64(const Target &T,
+ const MCRegisterInfo &MRI, StringRef TT,
+ StringRef CPU);
MCObjectWriter *createMipsELFObjectWriter(raw_ostream &OS,
uint8_t OSABI,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
new file mode 100644
index 0000000..639a058
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -0,0 +1,255 @@
+//===-- MipsNaClELFStreamer.cpp - ELF Object Output for Mips NaCl ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements MCELFStreamer for Mips NaCl. It emits .o object files
+// as required by NaCl's SFI sandbox. It inserts address-masking instructions
+// before dangerous control-flow and memory access instructions. It inserts
+// address-masking instructions after instructions that change the stack
+// pointer. It ensures that the mask and the dangerous instruction are always
+// emitted in the same bundle. It aligns call + branch delay to the bundle end,
+// so that return address is always aligned to the start of next bundle.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-mc-nacl"
+
+#include "Mips.h"
+#include "MipsELFStreamer.h"
+#include "MipsMCNaCl.h"
+#include "llvm/MC/MCELFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+
+const unsigned IndirectBranchMaskReg = Mips::T6;
+const unsigned LoadStoreStackMaskReg = Mips::T7;
+
+/// Extend the generic MCELFStreamer class so that it can mask dangerous
+/// instructions.
+
+class MipsNaClELFStreamer : public MipsELFStreamer {
+public:
+ MipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
+ MCCodeEmitter *Emitter, const MCSubtargetInfo &STI)
+ : MipsELFStreamer(Context, TAB, OS, Emitter, STI), PendingCall(false) {}
+
+ ~MipsNaClELFStreamer() {}
+
+private:
+ // Whether we started the sandboxing sequence for calls. Calls are bundled
+ // with branch delays and aligned to the bundle end.
+ bool PendingCall;
+
+ bool isIndirectJump(const MCInst &MI) {
+ return MI.getOpcode() == Mips::JR || MI.getOpcode() == Mips::RET;
+ }
+
+ bool isStackPointerFirstOperand(const MCInst &MI) {
+ return (MI.getNumOperands() > 0 && MI.getOperand(0).isReg()
+ && MI.getOperand(0).getReg() == Mips::SP);
+ }
+
+ bool isCall(unsigned Opcode, bool *IsIndirectCall) {
+ *IsIndirectCall = false;
+
+ switch (Opcode) {
+ default:
+ return false;
+
+ case Mips::JAL:
+ case Mips::BAL_BR:
+ case Mips::BLTZAL:
+ case Mips::BGEZAL:
+ return true;
+
+ case Mips::JALR:
+ *IsIndirectCall = true;
+ return true;
+ }
+ }
+
+ void emitMask(unsigned AddrReg, unsigned MaskReg,
+ const MCSubtargetInfo &STI) {
+ MCInst MaskInst;
+ MaskInst.setOpcode(Mips::AND);
+ MaskInst.addOperand(MCOperand::CreateReg(AddrReg));
+ MaskInst.addOperand(MCOperand::CreateReg(AddrReg));
+ MaskInst.addOperand(MCOperand::CreateReg(MaskReg));
+ MipsELFStreamer::EmitInstruction(MaskInst, STI);
+ }
+
+ // Sandbox indirect branch or return instruction by inserting mask operation
+ // before it.
+ void sandboxIndirectJump(const MCInst &MI, const MCSubtargetInfo &STI) {
+ unsigned AddrReg = MI.getOperand(0).getReg();
+
+ EmitBundleLock(false);
+ emitMask(AddrReg, IndirectBranchMaskReg, STI);
+ MipsELFStreamer::EmitInstruction(MI, STI);
+ EmitBundleUnlock();
+ }
+
+ // Sandbox memory access or SP change. Insert mask operation before and/or
+ // after the instruction.
+ void sandboxLoadStoreStackChange(const MCInst &MI, unsigned AddrIdx,
+ const MCSubtargetInfo &STI, bool MaskBefore,
+ bool MaskAfter) {
+ EmitBundleLock(false);
+ if (MaskBefore) {
+ // Sandbox memory access.
+ unsigned BaseReg = MI.getOperand(AddrIdx).getReg();
+ emitMask(BaseReg, LoadStoreStackMaskReg, STI);
+ }
+ MipsELFStreamer::EmitInstruction(MI, STI);
+ if (MaskAfter) {
+ // Sandbox SP change.
+ unsigned SPReg = MI.getOperand(0).getReg();
+ assert((Mips::SP == SPReg) && "Unexpected stack-pointer register.");
+ emitMask(SPReg, LoadStoreStackMaskReg, STI);
+ }
+ EmitBundleUnlock();
+ }
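(Illustration only: with example registers, a masked store through $a0 and a
re-masked stack-pointer update come out roughly as the two bundles below; $t7
is the load/store/SP mask register defined above.)

    .bundle_lock
    and   $a0, $a0, $t7    # MaskBefore: sandbox the base register
    sw    $v0, 0($a0)
    .bundle_unlock

    .bundle_lock
    addiu $sp, $sp, -32
    and   $sp, $sp, $t7    # MaskAfter: re-sandbox SP after the change
    .bundle_unlock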
+
+public:
+ /// This function is the one used to emit instruction data into the ELF
+ /// streamer. We override it to mask dangerous instructions.
+ virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) {
+ // Sandbox indirect jumps.
+ if (isIndirectJump(Inst)) {
+ if (PendingCall)
+ report_fatal_error("Dangerous instruction in branch delay slot!");
+ sandboxIndirectJump(Inst, STI);
+ return;
+ }
+
+ // Sandbox loads, stores and SP changes.
+ unsigned AddrIdx;
+ bool IsStore;
+ bool IsMemAccess = isBasePlusOffsetMemoryAccess(Inst.getOpcode(), &AddrIdx,
+ &IsStore);
+ bool IsSPFirstOperand = isStackPointerFirstOperand(Inst);
+ if (IsMemAccess || IsSPFirstOperand) {
+ if (PendingCall)
+ report_fatal_error("Dangerous instruction in branch delay slot!");
+
+ bool MaskBefore = (IsMemAccess
+ && baseRegNeedsLoadStoreMask(Inst.getOperand(AddrIdx)
+ .getReg()));
+ bool MaskAfter = IsSPFirstOperand && !IsStore;
+ if (MaskBefore || MaskAfter)
+ sandboxLoadStoreStackChange(Inst, AddrIdx, STI, MaskBefore, MaskAfter);
+ else
+ MipsELFStreamer::EmitInstruction(Inst, STI);
+ return;
+ }
+
+ // Sandbox calls by aligning call and branch delay to the bundle end.
+ // For indirect calls, emit the mask before the call.
+ bool IsIndirectCall;
+ if (isCall(Inst.getOpcode(), &IsIndirectCall)) {
+ if (PendingCall)
+ report_fatal_error("Dangerous instruction in branch delay slot!");
+
+ // Start the sandboxing sequence by emitting call.
+ EmitBundleLock(true);
+ if (IsIndirectCall) {
+ unsigned TargetReg = Inst.getOperand(1).getReg();
+ emitMask(TargetReg, IndirectBranchMaskReg, STI);
+ }
+ MipsELFStreamer::EmitInstruction(Inst, STI);
+ PendingCall = true;
+ return;
+ }
+ if (PendingCall) {
+ // Finish the sandboxing sequence by emitting branch delay.
+ MipsELFStreamer::EmitInstruction(Inst, STI);
+ EmitBundleUnlock();
+ PendingCall = false;
+ return;
+ }
+
+ // None of the sandboxing applies, just emit the instruction.
+ MipsELFStreamer::EmitInstruction(Inst, STI);
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
+ bool *IsStore) {
+ if (IsStore)
+ *IsStore = false;
+
+ switch (Opcode) {
+ default:
+ return false;
+
+ // Load instructions with base address register in position 1.
+ case Mips::LB:
+ case Mips::LBu:
+ case Mips::LH:
+ case Mips::LHu:
+ case Mips::LW:
+ case Mips::LWC1:
+ case Mips::LDC1:
+ case Mips::LL:
+ case Mips::LWL:
+ case Mips::LWR:
+ *AddrIdx = 1;
+ return true;
+
+ // Store instructions with base address register in position 1.
+ case Mips::SB:
+ case Mips::SH:
+ case Mips::SW:
+ case Mips::SWC1:
+ case Mips::SDC1:
+ case Mips::SWL:
+ case Mips::SWR:
+ *AddrIdx = 1;
+ if (IsStore)
+ *IsStore = true;
+ return true;
+
+ // Store instructions with base address register in position 2.
+ case Mips::SC:
+ *AddrIdx = 2;
+ if (IsStore)
+ *IsStore = true;
+ return true;
+ }
+}
+
+bool baseRegNeedsLoadStoreMask(unsigned Reg) {
+ // The contents of SP and thread pointer register do not require masking.
+ return Reg != Mips::SP && Reg != Mips::T8;
+}
+
+MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_ostream &OS,
+ MCCodeEmitter *Emitter,
+ const MCSubtargetInfo &STI,
+ bool RelaxAll, bool NoExecStack) {
+ MipsNaClELFStreamer *S = new MipsNaClELFStreamer(Context, TAB, OS, Emitter,
+ STI);
+ if (RelaxAll)
+ S->getAssembler().setRelaxAll(true);
+ if (NoExecStack)
+ S->getAssembler().setNoExecStack(true);
+
+ // Set bundle-alignment as required by the NaCl ABI for the target.
+ S->EmitBundleAlignMode(MIPS_NACL_BUNDLE_ALIGN);
+
+ return S;
+}
+
+}
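(Sketch only, not part of the patch: a caller in the MC layer would construct
the NaCl streamer through the factory above. All names in this sketch except
createMipsNaClELFStreamer are assumptions; only its signature comes from this
file.)

    static MCStreamer *createSandboxedStreamer(MCContext &Ctx, MCAsmBackend &MAB,
                                               raw_ostream &OS, MCCodeEmitter *CE,
                                               const MCSubtargetInfo &STI) {
      // Factory from this file; NaCl bundle alignment is set up inside it.
      return createMipsNaClELFStreamer(Ctx, MAB, OS, CE, STI,
                                       /*RelaxAll=*/false,
                                       /*NoExecStack=*/false);
    }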
diff --git a/lib/Target/Mips/MCTargetDesc/MipsReginfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsReginfo.cpp
deleted file mode 100644
index 1dc9bcb..0000000
--- a/lib/Target/Mips/MCTargetDesc/MipsReginfo.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-//===-- MipsReginfo.cpp - Registerinfo handling --------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-// .reginfo
-// Elf32_Word ri_gprmask
-// Elf32_Word ri_cprmask[4]
-// Elf32_Word ri_gp_value
-//
-// .MIPS.options - N64
-// Elf64_Byte kind (ODK_REGINFO)
-// Elf64_Byte size (40 bytes)
-// Elf64_Section section (0)
-// Elf64_Word info (unused)
-// Elf64_Word ri_gprmask ()
-// Elf64_Word ri_pad ()
-// Elf64_Word[4] ri_cprmask ()
-// Elf64_Addr ri_gp_value ()
-//
-// .MIPS.options - N32
-// Elf32_Byte kind (ODK_REGINFO)
-// Elf32_Byte size (36 bytes)
-// Elf32_Section section (0)
-// Elf32_Word info (unused)
-// Elf32_Word ri_gprmask ()
-// Elf32_Word ri_pad ()
-// Elf32_Word[4] ri_cprmask ()
-// Elf32_Addr ri_gp_value ()
-//
-//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/MipsReginfo.h"
-#include "MipsSubtarget.h"
-#include "MipsTargetObjectFile.h"
-#include "llvm/MC/MCStreamer.h"
-
-using namespace llvm;
-
-// Integrated assembler version
-void
-MipsReginfo::emitMipsReginfoSectionCG(MCStreamer &OS,
- const TargetLoweringObjectFile &TLOF,
- const MipsSubtarget &MST) const
-{
-
- if (OS.hasRawTextSupport())
- return;
-
- const MipsTargetObjectFile &TLOFELF =
- static_cast<const MipsTargetObjectFile &>(TLOF);
- OS.SwitchSection(TLOFELF.getReginfoSection());
-
- // .reginfo
- if (MST.isABI_O32()) {
- OS.EmitIntValue(0, 4); // ri_gprmask
- OS.EmitIntValue(0, 4); // ri_cpr[0]mask
- OS.EmitIntValue(0, 4); // ri_cpr[1]mask
- OS.EmitIntValue(0, 4); // ri_cpr[2]mask
- OS.EmitIntValue(0, 4); // ri_cpr[3]mask
- OS.EmitIntValue(0, 4); // ri_gp_value
- }
- // .MIPS.options
- else if (MST.isABI_N64()) {
- OS.EmitIntValue(1, 1); // kind
- OS.EmitIntValue(40, 1); // size
- OS.EmitIntValue(0, 2); // section
- OS.EmitIntValue(0, 4); // info
- OS.EmitIntValue(0, 4); // ri_gprmask
- OS.EmitIntValue(0, 4); // pad
- OS.EmitIntValue(0, 4); // ri_cpr[0]mask
- OS.EmitIntValue(0, 4); // ri_cpr[1]mask
- OS.EmitIntValue(0, 4); // ri_cpr[2]mask
- OS.EmitIntValue(0, 4); // ri_cpr[3]mask
- OS.EmitIntValue(0, 8); // ri_gp_value
- }
- else llvm_unreachable("Unsupported abi for reginfo");
-}
-
diff --git a/lib/Target/Mips/MCTargetDesc/MipsReginfo.h b/lib/Target/Mips/MCTargetDesc/MipsReginfo.h
deleted file mode 100644
index 039b8ea..0000000
--- a/lib/Target/Mips/MCTargetDesc/MipsReginfo.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//=== MipsReginfo.h - MipsReginfo -----------------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENCE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MIPSREGINFO_H
-#define MIPSREGINFO_H
-
-namespace llvm {
- class MCStreamer;
- class TargetLoweringObjectFile;
- class MipsSubtarget;
-
- class MipsReginfo {
- void anchor();
- public:
- MipsReginfo() {}
-
- void emitMipsReginfoSectionCG(MCStreamer &OS,
- const TargetLoweringObjectFile &TLOF,
- const MipsSubtarget &MST) const;
- };
-
-} // namespace llvm
-
-#endif
-
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 5e90bbc..fb6aff2 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -11,57 +11,368 @@
//
//===----------------------------------------------------------------------===//
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsMCTargetDesc.h"
+#include "MipsTargetObjectFile.h"
#include "MipsTargetStreamer.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
using namespace llvm;
-static cl::opt<bool> PrintHackDirectives("print-hack-directives",
- cl::init(false), cl::Hidden);
-
-// pin vtable to this file
+// Pin vtable to this file.
void MipsTargetStreamer::anchor() {}
-MipsTargetAsmStreamer::MipsTargetAsmStreamer(formatted_raw_ostream &OS)
- : OS(OS) {}
+MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS)
+ : MipsTargetStreamer(S), OS(OS) {}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMicroMips() {
+ OS << "\t.set\tmicromips\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMicroMips() {
+ OS << "\t.set\tnomicromips\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips16() {
+ OS << "\t.set\tmips16\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMips16() {
+ OS << "\t.set\tnomips16\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetReorder() {
+ OS << "\t.set\treorder\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoReorder() {
+ OS << "\t.set\tnoreorder\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMacro() {
+ OS << "\t.set\tmacro\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMacro() {
+ OS << "\t.set\tnomacro\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetAt() {
+ OS << "\t.set\tat\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoAt() {
+ OS << "\t.set\tnoat\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveEnd(StringRef Name) {
+ OS << "\t.end\t" << Name << '\n';
+}
+
+void MipsTargetAsmStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
+ OS << "\t.ent\t" << Symbol.getName() << '\n';
+}
+
+void MipsTargetAsmStreamer::emitDirectiveAbiCalls() { OS << "\t.abicalls\n"; }
+void MipsTargetAsmStreamer::emitDirectiveOptionPic0() {
+ OS << "\t.option\tpic0\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveOptionPic2() {
+ OS << "\t.option\tpic2\n";
+}
+
+void MipsTargetAsmStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
+ unsigned ReturnReg) {
+ OS << "\t.frame\t$"
+ << StringRef(MipsInstPrinter::getRegisterName(StackReg)).lower() << ","
+ << StackSize << ",$"
+ << StringRef(MipsInstPrinter::getRegisterName(ReturnReg)).lower() << '\n';
+}
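(Illustrative output only: emitFrame(Mips::SP, 40, Mips::RA) prints
"\t.frame\t$sp,40,$ra", with the register names lowered and prefixed with '$'.)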
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips32R2() {
+ OS << "\t.set\tmips32r2\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips64() {
+ OS << "\t.set\tmips64\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips64R2() {
+ OS << "\t.set\tmips64r2\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetDsp() {
+ OS << "\t.set\tdsp\n";
+}
+// Print a 32-bit hex number, including leading zeros.
+static void printHex32(unsigned Value, raw_ostream &OS) {
+ OS << "0x";
+ for (int i = 7; i >= 0; i--)
+ OS.write_hex((Value & (0xF << (i*4))) >> (i*4));
+}
+
+void MipsTargetAsmStreamer::emitMask(unsigned CPUBitmask,
+ int CPUTopSavedRegOff) {
+ OS << "\t.mask \t";
+ printHex32(CPUBitmask, OS);
+ OS << ',' << CPUTopSavedRegOff << '\n';
+}
+
+void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask,
+ int FPUTopSavedRegOff) {
+ OS << "\t.fmask\t";
+ printHex32(FPUBitmask, OS);
+ OS << "," << FPUTopSavedRegOff << '\n';
+}
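(Illustrative output only: emitMask(0x80000000, -4) prints
"\t.mask \t0x80000000,-4" and emitFMask(0, 0) prints "\t.fmask\t0x00000000,0",
with printHex32 supplying the fixed-width hex value.)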
+
+// This part is for ELF object output.
+MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
+ const MCSubtargetInfo &STI)
+ : MipsTargetStreamer(S), MicroMipsEnabled(false), STI(STI) {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ uint64_t Features = STI.getFeatureBits();
+ Triple T(STI.getTargetTriple());
+ Pic = (MCA.getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_)
+ ? true
+ : false;
+
+ // Update e_header flags
+ unsigned EFlags = 0;
+
+ // Architecture
+ if (Features & Mips::FeatureMips64r2)
+ EFlags |= ELF::EF_MIPS_ARCH_64R2;
+ else if (Features & Mips::FeatureMips64)
+ EFlags |= ELF::EF_MIPS_ARCH_64;
+ else if (Features & Mips::FeatureMips4)
+ EFlags |= ELF::EF_MIPS_ARCH_4;
+ else if (Features & Mips::FeatureMips32r2)
+ EFlags |= ELF::EF_MIPS_ARCH_32R2;
+ else if (Features & Mips::FeatureMips32)
+ EFlags |= ELF::EF_MIPS_ARCH_32;
+
+ if (T.isArch64Bit()) {
+ if (Features & Mips::FeatureN32)
+ EFlags |= ELF::EF_MIPS_ABI2;
+ else if (Features & Mips::FeatureO32) {
+ EFlags |= ELF::EF_MIPS_ABI_O32;
+ EFlags |= ELF::EF_MIPS_32BITMODE; /* Compatibility Mode */
+ }
+ // No need to set any bit for N64 which is the default ABI at the moment
+ // for 64-bit Mips architectures.
+ } else {
+ if (Features & Mips::FeatureMips64r2 || Features & Mips::FeatureMips64)
+ EFlags |= ELF::EF_MIPS_32BITMODE;
+
+ // ABI
+ EFlags |= ELF::EF_MIPS_ABI_O32;
+ }
+
+ MCA.setELFHeaderEFlags(EFlags);
+}
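(Worked example of the flag computation above: a mips64r2 target using the
default N64 ABI ends up with just ELF::EF_MIPS_ARCH_64R2 set here, while a
32-bit mips32r2/O32 target gets EF_MIPS_ARCH_32R2 | EF_MIPS_ABI_O32. PIC and
ASE bits are ORed in later by the directive handlers below.)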
-void MipsTargetAsmStreamer::emitMipsHackELFFlags(unsigned Flags) {
- if (!PrintHackDirectives)
+void MipsTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
+ if (!isMicroMipsEnabled())
return;
+ MCSymbolData &Data = getStreamer().getOrCreateSymbolData(Symbol);
+ uint8_t Type = MCELF::GetType(Data);
+ if (Type != ELF::STT_FUNC)
+ return;
+
+  // The "other" values are stored in the last 6 bits of the second byte.
+ // The traditional defines for STO values assume the full byte and thus
+ // the shift to pack it.
+ MCELF::setOther(Data, ELF::STO_MIPS_MICROMIPS >> 2);
+}
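(Worked example, assuming the usual ELF value STO_MIPS_MICROMIPS == 0x80: the
streamer stores 0x80 >> 2 == 0x20 in the 6-bit "other" field, and the object
writer shifts it back up so the symbol's st_other byte ends up with the
microMIPS bit 0x80 set.)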
+
+void MipsTargetELFStreamer::finish() {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ MCContext &Context = MCA.getContext();
+ MCStreamer &OS = getStreamer();
+ Triple T(STI.getTargetTriple());
+ uint64_t Features = STI.getFeatureBits();
+
+ if (T.isArch64Bit() && (Features & Mips::FeatureN64)) {
+ const MCSectionELF *Sec = Context.getELFSection(
+ ".MIPS.options", ELF::SHT_MIPS_OPTIONS,
+ ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, SectionKind::getMetadata());
+ OS.SwitchSection(Sec);
- OS << "\t.mips_hack_elf_flags 0x";
- OS.write_hex(Flags);
- OS << '\n';
+ OS.EmitIntValue(1, 1); // kind
+ OS.EmitIntValue(40, 1); // size
+ OS.EmitIntValue(0, 2); // section
+ OS.EmitIntValue(0, 4); // info
+ OS.EmitIntValue(0, 4); // ri_gprmask
+ OS.EmitIntValue(0, 4); // pad
+ OS.EmitIntValue(0, 4); // ri_cpr[0]mask
+ OS.EmitIntValue(0, 4); // ri_cpr[1]mask
+ OS.EmitIntValue(0, 4); // ri_cpr[2]mask
+ OS.EmitIntValue(0, 4); // ri_cpr[3]mask
+ OS.EmitIntValue(0, 8); // ri_gp_value
+ } else {
+ const MCSectionELF *Sec =
+ Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC,
+ SectionKind::getMetadata());
+ OS.SwitchSection(Sec);
+
+ OS.EmitIntValue(0, 4); // ri_gprmask
+ OS.EmitIntValue(0, 4); // ri_cpr[0]mask
+ OS.EmitIntValue(0, 4); // ri_cpr[1]mask
+ OS.EmitIntValue(0, 4); // ri_cpr[2]mask
+ OS.EmitIntValue(0, 4); // ri_cpr[3]mask
+ OS.EmitIntValue(0, 4); // ri_gp_value
+ }
}
-void MipsTargetAsmStreamer::emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val) {
- if (!PrintHackDirectives)
+
+void MipsTargetELFStreamer::emitAssignment(MCSymbol *Symbol,
+ const MCExpr *Value) {
+  // If the rhs is a microMips symbol, mark Symbol as microMips too.
+ if (Value->getKind() != MCExpr::SymbolRef)
+ return;
+ const MCSymbol &RhsSym =
+ static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
+ MCSymbolData &Data = getStreamer().getOrCreateSymbolData(&RhsSym);
+ uint8_t Type = MCELF::GetType(Data);
+ if ((Type != ELF::STT_FUNC)
+ || !(MCELF::getOther(Data) & (ELF::STO_MIPS_MICROMIPS >> 2)))
return;
- OS << "\t.mips_hack_stocg ";
- OS << Sym->getName();
- OS << ", ";
- OS << Val;
- OS << '\n';
+ MCSymbolData &SymbolData = getStreamer().getOrCreateSymbolData(Symbol);
+ // The "other" values are stored in the last 6 bits of the second byte.
+ // The traditional defines for STO values assume the full byte and thus
+ // the shift to pack it.
+ MCELF::setOther(SymbolData, ELF::STO_MIPS_MICROMIPS >> 2);
}
MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
- return static_cast<MCELFStreamer &>(*Streamer);
+ return static_cast<MCELFStreamer &>(Streamer);
}
-void MipsTargetELFStreamer::emitMipsHackELFFlags(unsigned Flags) {
+void MipsTargetELFStreamer::emitDirectiveSetMicroMips() {
+ MicroMipsEnabled = true;
+
MCAssembler &MCA = getStreamer().getAssembler();
+ unsigned Flags = MCA.getELFHeaderEFlags();
+ Flags |= ELF::EF_MIPS_MICROMIPS;
MCA.setELFHeaderEFlags(Flags);
}
-// Set a symbol's STO flags
-void MipsTargetELFStreamer::emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val) {
- MCSymbolData &Data = getStreamer().getOrCreateSymbolData(Sym);
- // The "other" values are stored in the last 6 bits of the second byte
- // The traditional defines for STO values assume the full byte and thus
- // the shift to pack it.
- MCELF::setOther(Data, Val >> 2);
+void MipsTargetELFStreamer::emitDirectiveSetNoMicroMips() {
+ MicroMipsEnabled = false;
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMips16() {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ unsigned Flags = MCA.getELFHeaderEFlags();
+ Flags |= ELF::EF_MIPS_ARCH_ASE_M16;
+ MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetNoMips16() {
+ // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetReorder() {
+ // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetNoReorder() {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ unsigned Flags = MCA.getELFHeaderEFlags();
+ Flags |= ELF::EF_MIPS_NOREORDER;
+ MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMacro() {
+ // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetNoMacro() {
+ // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetAt() {
+ // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetNoAt() {
+ // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
+ // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
+ // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitDirectiveAbiCalls() {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ unsigned Flags = MCA.getELFHeaderEFlags();
+ Flags |= ELF::EF_MIPS_CPIC | ELF::EF_MIPS_PIC;
+ MCA.setELFHeaderEFlags(Flags);
+}
+void MipsTargetELFStreamer::emitDirectiveOptionPic0() {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ unsigned Flags = MCA.getELFHeaderEFlags();
+ // This option overrides other PIC options like -KPIC.
+ Pic = false;
+ Flags &= ~ELF::EF_MIPS_PIC;
+ MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveOptionPic2() {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ unsigned Flags = MCA.getELFHeaderEFlags();
+ Pic = true;
+ // NOTE: We are following the GAS behaviour here which means the directive
+ // 'pic2' also sets the CPIC bit in the ELF header. This is different from
+  // what is stated in the SYSV ABI, which considers the bits EF_MIPS_PIC and
+ // EF_MIPS_CPIC to be mutually exclusive.
+ Flags |= ELF::EF_MIPS_PIC | ELF::EF_MIPS_CPIC;
+ MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
+ unsigned ReturnReg) {
+ // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitMask(unsigned CPUBitmask,
+ int CPUTopSavedRegOff) {
+ // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitFMask(unsigned FPUBitmask,
+ int FPUTopSavedRegOff) {
+ // FIXME: implement.
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMips32R2() {
+ // No action required for ELF output.
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMips64() {
+ // No action required for ELF output.
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMips64R2() {
+ // No action required for ELF output.
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetDsp() {
+ // No action required for ELF output.
}
diff --git a/lib/Target/Mips/MSA.txt b/lib/Target/Mips/MSA.txt
index d1c4193..113375f 100644
--- a/lib/Target/Mips/MSA.txt
+++ b/lib/Target/Mips/MSA.txt
@@ -62,11 +62,16 @@ binsri.[bhwd], binsli.[bhwd]:
bmnz.v, bmz.v, bsel.v:
These three operations differ only in the operand that is tied to the
- result.
+ result and the order of the operands.
It is (currently) not possible to emit bmz.v, or bsel.v since bmnz.v is
the same operation and will be emitted instead.
In future, the compiler may choose between these three instructions
according to register allocation.
+  These three operations can be very confusing, so here is a mapping
+ between the instructions and the vselect node in one place:
+ bmz.v wd, ws, wt/i8 -> (vselect wt/i8, wd, ws)
+ bmnz.v wd, ws, wt/i8 -> (vselect wt/i8, ws, wd)
+ bsel.v wd, ws, wt/i8 -> (vselect wd, wt/i8, ws)
bmnzi.b, bmzi.b:
Like their non-immediate counterparts, bmnzi.v and bmzi.v are the same
diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td
new file mode 100644
index 0000000..91d447a
--- /dev/null
+++ b/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -0,0 +1,148 @@
+let isCodeGenOnly = 1, Predicates = [InMicroMips] in {
+def FADD_S_MM : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
+ ADDS_FM_MM<0, 0x30>;
+def FDIV_S_MM : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
+ ADDS_FM_MM<0, 0xf0>;
+def FMUL_S_MM : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
+ ADDS_FM_MM<0, 0xb0>;
+def FSUB_S_MM : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
+ ADDS_FM_MM<0, 0x70>;
+
+def FADD_MM : MMRel, ADDS_FT<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>,
+ ADDS_FM_MM<1, 0x30>;
+def FDIV_MM : MMRel, ADDS_FT<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>,
+ ADDS_FM_MM<1, 0xf0>;
+def FMUL_MM : MMRel, ADDS_FT<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>,
+ ADDS_FM_MM<1, 0xb0>;
+def FSUB_MM : MMRel, ADDS_FT<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>,
+ ADDS_FM_MM<1, 0x70>;
+
+def LWC1_MM : MMRel, LW_FT<"lwc1", FGR32Opnd, II_LWC1, load>, LW_FM_MM<0x27>;
+def SWC1_MM : MMRel, SW_FT<"swc1", FGR32Opnd, II_SWC1, store>,
+ LW_FM_MM<0x26>;
+def LDC1_MM : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM_MM<0x2f>;
+def SDC1_MM : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>,
+ LW_FM_MM<0x2e>;
+def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>,
+ LWXC1_FM_MM<0x48>;
+def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>,
+ SWXC1_FM_MM<0x88>;
+def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>,
+ LWXC1_FM_MM<0x148>;
+def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>,
+ SWXC1_FM_MM<0x188>;
+
+def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>,
+ CEQS_FM_MM<0>;
+def FCMP_D32_MM : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>,
+ CEQS_FM_MM<1>;
+
+def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, IIBranch, MIPS_BRANCH_F>,
+ BC1F_FM_MM<0x1c>;
+def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, IIBranch, MIPS_BRANCH_T>,
+ BC1F_FM_MM<0x1d>;
+
+def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+ ROUND_W_FM_MM<0, 0x6c>;
+def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ROUND_W_FM_MM<0, 0x24>;
+def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
+ ROUND_W_FM_MM<0, 0x2c>;
+def ROUND_W_S_MM : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+ ROUND_W_FM_MM<0, 0xec>;
+def TRUNC_W_S_MM : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
+ ROUND_W_FM_MM<0, 0xac>;
+def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S,
+ fsqrt>, ROUND_W_FM_MM<0, 0x28>;
+
+def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>,
+ ROUND_W_FM_MM<1, 0x6c>;
+def CVT_W_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+ ROUND_W_FM_MM<1, 0x24>;
+def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>,
+ ROUND_W_FM_MM<1, 0x2c>;
+def ROUND_W_MM : MMRel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>,
+ ROUND_W_FM_MM<1, 0xec>;
+def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>,
+ ROUND_W_FM_MM<1, 0xac>;
+
+def FSQRT_MM : MMRel, ABSS_FT<"sqrt.d", AFGR64Opnd, AFGR64Opnd, II_SQRT_D,
+ fsqrt>, ROUND_W_FM_MM<1, 0x28>;
+
+def CVT_L_S_MM : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+ ROUND_W_FM_MM<0, 0x4>;
+def CVT_L_D64_MM : MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
+ ROUND_W_FM_MM<1, 0x4>;
+
+def FABS_S_MM : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
+ ABS_FM_MM<0, 0xd>;
+def FMOV_S_MM : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
+ ABS_FM_MM<0, 0x1>;
+def FNEG_S_MM : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
+ ABS_FM_MM<0, 0x2d>;
+def CVT_D_S_MM : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<0, 0x4d>;
+def CVT_D32_W_MM : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<1, 0x4d>;
+def CVT_S_D32_MM : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+ ABS_FM_MM<0, 0x6d>;
+def CVT_S_W_MM : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
+ ABS_FM_MM<1, 0x6d>;
+
+def FABS_MM : MMRel, ABSS_FT<"abs.d", AFGR64Opnd, AFGR64Opnd, II_ABS, fabs>,
+ ABS_FM_MM<1, 0xd>;
+def FNEG_MM : MMRel, ABSS_FT<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>,
+ ABS_FM_MM<1, 0x2d>;
+
+def FMOV_D32_MM : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
+ ABS_FM_MM<1, 0x1>, Requires<[NotFP64bit, HasStdEnc]>;
+
+def MOVZ_I_S_MM : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd,
+ II_MOVZ_S>, CMov_I_F_FM_MM<0x78, 0>;
+def MOVN_I_S_MM : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd,
+ II_MOVN_S>, CMov_I_F_FM_MM<0x38, 0>;
+def MOVZ_I_D32_MM : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVZ_D>, CMov_I_F_FM_MM<0x78, 1>;
+def MOVN_I_D32_MM : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVN_D>, CMov_I_F_FM_MM<0x38, 1>;
+
+def MOVT_S_MM : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S,
+ MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 0>;
+def MOVF_S_MM : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S,
+ MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 0>;
+def MOVT_D32_MM : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+ MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 1>;
+def MOVF_D32_MM : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+ MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 1>;
+
+def CFC1_MM : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>,
+ MFC1_FM_MM<0x40>;
+def CTC1_MM : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>,
+ MFC1_FM_MM<0x60>;
+def MFC1_MM : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd,
+ II_MFC1, bitconvert>, MFC1_FM_MM<0x80>;
+def MTC1_MM : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd,
+ II_MTC1, bitconvert>, MFC1_FM_MM<0xa0>;
+def MFHC1_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>,
+ MFC1_FM_MM<3>;
+def MTHC1_MM : MMRel, MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, II_MTHC1>,
+ MFC1_FM_MM<7>;
+
+def MADD_S_MM : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
+ MADDS_FM_MM<0x1>;
+def MSUB_S_MM : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
+ MADDS_FM_MM<0x21>;
+def NMADD_S_MM : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
+ MADDS_FM_MM<0x2>;
+def NMSUB_S_MM : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
+ MADDS_FM_MM<0x22>;
+
+def MADD_D32_MM : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
+ MADDS_FM_MM<0x9>;
+def MSUB_D32_MM : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
+ MADDS_FM_MM<0x29>;
+def NMADD_D32_MM : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
+ MADDS_FM_MM<0xa>;
+def NMSUB_D32_MM : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
+ MADDS_FM_MM<0x2a>;
+}
diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td
index c12a32e..15b951d 100644
--- a/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -1,3 +1,81 @@
+//===----------------------------------------------------------------------===//
+// MicroMIPS Base Classes
+//===----------------------------------------------------------------------===//
+
+//
+// Base class for MicroMips instructions.
+// This class does not depend on the instruction size.
+//
+class MicroMipsInstBase<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin, Format f> : Instruction
+{
+ let Namespace = "Mips";
+ let DecoderNamespace = "MicroMips";
+
+ let OutOperandList = outs;
+ let InOperandList = ins;
+
+ let AsmString = asmstr;
+ let Pattern = pattern;
+ let Itinerary = itin;
+
+ let Predicates = [InMicroMips];
+
+ Format Form = f;
+}
+
+//
+// Base class for MicroMIPS 16-bit instructions.
+//
+class MicroMipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern,
+ InstrItinClass itin, Format f> :
+ MicroMipsInstBase<outs, ins, asmstr, pattern, itin, f>
+{
+ let Size = 2;
+ field bits<16> Inst;
+ field bits<16> SoftFail = 0;
+ bits<6> Opcode = 0x0;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroMIPS 16-bit Instruction Formats
+//===----------------------------------------------------------------------===//
+
+class MOVE_FM_MM16<bits<6> funct> {
+ bits<5> rs;
+ bits<5> rd;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = funct;
+ let Inst{9-5} = rd;
+ let Inst{4-0} = rs;
+}
+
+class JALR_FM_MM16<bits<5> op> {
+ bits<5> rs;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-5} = op;
+ let Inst{4-0} = rs;
+}
+
+class MFHILO_FM_MM16<bits<5> funct> {
+ bits<5> rd;
+
+ bits<16> Inst;
+
+ let Inst{15-10} = 0x11;
+ let Inst{9-5} = funct;
+ let Inst{4-0} = rd;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroMIPS 32-bit Instruction Formats
+//===----------------------------------------------------------------------===//
+
class MMArch {
string Arch = "micromips";
list<dag> Pattern = [];
@@ -226,7 +304,7 @@ class JR_FM_MM<bits<8> funct> : MMArch {
let Inst{5-0} = 0x3c;
}
-class JALR_FM_MM<bits<10> funct> : MMArch {
+class JALR_FM_MM<bits<10> funct> {
bits<5> rs;
bits<5> rd;
@@ -276,6 +354,67 @@ class BGEZAL_FM_MM<bits<5> funct> : MMArch {
let Inst{15-0} = offset;
}
+class SYNC_FM_MM : MMArch {
+ bits<5> stype;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = 0x0;
+ let Inst{20-16} = stype;
+ let Inst{15-6} = 0x1ad;
+ let Inst{5-0} = 0x3c;
+}
+
+class BRK_FM_MM : MMArch {
+ bits<10> code_1;
+ bits<10> code_2;
+ bits<32> Inst;
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = code_1;
+ let Inst{15-6} = code_2;
+ let Inst{5-0} = 0x07;
+}
+
+class SYS_FM_MM : MMArch {
+ bits<10> code_;
+ bits<32> Inst;
+ let Inst{31-26} = 0x0;
+ let Inst{25-16} = code_;
+ let Inst{15-6} = 0x22d;
+ let Inst{5-0} = 0x3c;
+}
+
+class WAIT_FM_MM {
+ bits<10> code_;
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-16} = code_;
+ let Inst{15-6} = 0x24d;
+ let Inst{5-0} = 0x3c;
+}
+
+class ER_FM_MM<bits<10> funct> : MMArch {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-16} = 0x00;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+class EI_FM_MM<bits<10> funct> : MMArch {
+ bits<32> Inst;
+ bits<5> rt;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = 0x00;
+ let Inst{20-16} = rt;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
class TEQ_FM_MM<bits<6> funct> : MMArch {
bits<5> rs;
bits<5> rt;
@@ -302,3 +441,183 @@ class TEQI_FM_MM<bits<5> funct> : MMArch {
let Inst{20-16} = rs;
let Inst{15-0} = imm16;
}
+
+class LL_FM_MM<bits<4> funct> {
+ bits<5> rt;
+ bits<21> addr;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x18;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = addr{20-16};
+ let Inst{15-12} = funct;
+ let Inst{11-0} = addr{11-0};
+}
+
+class ADDS_FM_MM<bits<2> fmt, bits<8> funct> : MMArch {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10} = 0;
+ let Inst{9-8} = fmt;
+ let Inst{7-0} = funct;
+
+ list<dag> Pattern = [];
+}
+
+class LWXC1_FM_MM<bits<9> funct> : MMArch {
+ bits<5> fd;
+ bits<5> base;
+ bits<5> index;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = index;
+ let Inst{20-16} = base;
+ let Inst{15-11} = fd;
+ let Inst{10-9} = 0x0;
+ let Inst{8-0} = funct;
+}
+
+class SWXC1_FM_MM<bits<9> funct> : MMArch {
+ bits<5> fs;
+ bits<5> base;
+ bits<5> index;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = index;
+ let Inst{20-16} = base;
+ let Inst{15-11} = fs;
+ let Inst{10-9} = 0x0;
+ let Inst{8-0} = funct;
+}
+
+class CEQS_FM_MM<bits<2> fmt> : MMArch {
+ bits<5> fs;
+ bits<5> ft;
+ bits<4> cond;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-13} = 0x0; // cc
+ let Inst{12} = 0;
+ let Inst{11-10} = fmt;
+ let Inst{9-6} = cond;
+ let Inst{5-0} = 0x3c;
+}
+
+class BC1F_FM_MM<bits<5> tf> : MMArch {
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10;
+ let Inst{25-21} = tf;
+ let Inst{20-18} = 0x0; // cc
+ let Inst{17-16} = 0x0;
+ let Inst{15-0} = offset;
+}
+
+class ROUND_W_FM_MM<bits<1> fmt, bits<8> funct> : MMArch {
+ bits<5> fd;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = fd;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14} = fmt;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0x3b;
+}
+
+class ABS_FM_MM<bits<2> fmt, bits<7> funct> : MMArch {
+ bits<5> fd;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = fd;
+ let Inst{20-16} = fs;
+ let Inst{15} = 0;
+ let Inst{14-13} = fmt;
+ let Inst{12-6} = funct;
+ let Inst{5-0} = 0x3b;
+}
+
+class CMov_F_F_FM_MM<bits<9> func, bits<2> fmt> : MMArch {
+ bits<5> fd;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = fd;
+ let Inst{20-16} = fs;
+ let Inst{15-13} = 0x0; //cc
+ let Inst{12-11} = 0x0;
+ let Inst{10-9} = fmt;
+ let Inst{8-0} = func;
+}
+
+class CMov_I_F_FM_MM<bits<8> funct, bits<2> fmt> : MMArch {
+ bits<5> fd;
+ bits<5> fs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{9-8} = fmt;
+ let Inst{7-0} = funct;
+}
+
+class MFC1_FM_MM<bits<8> funct> : MMArch {
+ bits<5> rt;
+ bits<5> fs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = fs;
+ let Inst{15-14} = 0x0;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0x3b;
+}
+
+class MADDS_FM_MM<bits<6> funct>: MMArch {
+ bits<5> ft;
+ bits<5> fs;
+ bits<5> fd;
+ bits<5> fr;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = ft;
+ let Inst{20-16} = fs;
+ let Inst{15-11} = fd;
+ let Inst{10-6} = fr;
+ let Inst{5-0} = funct;
+}
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index d9507fa..3f13e83 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -45,6 +45,64 @@ class StoreLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
let DecoderMethod = "DecodeMemMMImm12";
}
+class LLBaseMM<string opstr, RegisterOperand RO> :
+ InstSE<(outs RO:$rt), (ins mem_mm_12:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm12";
+ let mayLoad = 1;
+}
+
+class SCBaseMM<string opstr, RegisterOperand RO> :
+ InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr),
+ !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm12";
+ let mayStore = 1;
+ let Constraints = "$rt = $dst";
+}
+
+class LoadMM<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+ InstrItinClass Itin = NoItinerary> :
+ InstSE<(outs RO:$rt), (ins mem_mm_12:$addr),
+ !strconcat(opstr, "\t$rt, $addr"),
+ [(set RO:$rt, (OpNode addrimm12:$addr))], Itin, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm12";
+ let canFoldAsLoad = 1;
+ let mayLoad = 1;
+}
+
+class MoveFromHILOMM<string opstr, RegisterOperand RO, Register UseReg> :
+ MicroMipsInst16<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"),
+ [], II_MFHI_MFLO, FrmR> {
+ let Uses = [UseReg];
+ let hasSideEffects = 0;
+}
+
+class MoveMM16<string opstr, RegisterOperand RO, bit isComm = 0,
+ InstrItinClass Itin = NoItinerary> :
+ MicroMipsInst16<(outs RO:$rd), (ins RO:$rs),
+ !strconcat(opstr, "\t$rd, $rs"), [], Itin, FrmR> {
+ let isCommutable = isComm;
+ let isReMaterializable = 1;
+}
+
+// 16-bit Jump and Link (Call)
+class JumpLinkRegMM16<string opstr, RegisterOperand RO> :
+ MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+ [(MipsJmpLink RO:$rs)], IIBranch, FrmR> {
+ let isCall = 1;
+ let hasDelaySlot = 1;
+ let Defs = [RA];
+}
+
+def MFHI16_MM : MoveFromHILOMM<"mfhi", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x10>;
+def MFLO16_MM : MoveFromHILOMM<"mflo", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x12>;
+def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>;
+def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>;
+
+class WaitMM<string opstr> :
+ InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [],
+ NoItinerary, FrmOther, opstr>;
+
let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
/// Arithmetic Instructions (ALU Immediate)
def ADDiu_MM : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd>,
@@ -63,6 +121,9 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
ADDI_FM_MM<0x1c>;
def LUi_MM : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16>, LUI_FM_MM;
+ def LEA_ADDiu_MM : MMRel, EffectiveAddress<"addiu", GPR32Opnd>,
+ LW_FM_MM<0xc>;
+
/// Arithmetic Instructions (3-Operand, R-Type)
def ADDu_MM : MMRel, ArithLogicR<"addu", GPR32Opnd>, ADD_FM_MM<0, 0x150>;
def SUBu_MM : MMRel, ArithLogicR<"subu", GPR32Opnd>, ADD_FM_MM<0, 0x1d0>;
@@ -72,38 +133,38 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def SLT_MM : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM_MM<0, 0x350>;
def SLTu_MM : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>,
ADD_FM_MM<0, 0x390>;
- def AND_MM : MMRel, ArithLogicR<"and", GPR32Opnd, 1, IIAlu, and>,
+ def AND_MM : MMRel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
ADD_FM_MM<0, 0x250>;
- def OR_MM : MMRel, ArithLogicR<"or", GPR32Opnd, 1, IIAlu, or>,
+ def OR_MM : MMRel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
ADD_FM_MM<0, 0x290>;
- def XOR_MM : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, IIAlu, xor>,
+ def XOR_MM : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
ADD_FM_MM<0, 0x310>;
def NOR_MM : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM_MM<0, 0x2d0>;
- def MULT_MM : MMRel, Mult<"mult", IIImul, GPR32Opnd, [HI0, LO0]>,
+ def MULT_MM : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
MULT_FM_MM<0x22c>;
- def MULTu_MM : MMRel, Mult<"multu", IIImul, GPR32Opnd, [HI0, LO0]>,
+ def MULTu_MM : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
MULT_FM_MM<0x26c>;
- def SDIV_MM : MMRel, Div<"div", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+ def SDIV_MM : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
MULT_FM_MM<0x2ac>;
- def UDIV_MM : MMRel, Div<"divu", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+ def UDIV_MM : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
MULT_FM_MM<0x2ec>;
/// Shift Instructions
- def SLL_MM : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd>,
+ def SLL_MM : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>,
SRA_FM_MM<0, 0>;
- def SRL_MM : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd>,
+ def SRL_MM : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL>,
SRA_FM_MM<0x40, 0>;
- def SRA_MM : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd>,
+ def SRA_MM : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA>,
SRA_FM_MM<0x80, 0>;
- def SLLV_MM : MMRel, shift_rotate_reg<"sllv", GPR32Opnd>,
+ def SLLV_MM : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV>,
SRLV_FM_MM<0x10, 0>;
- def SRLV_MM : MMRel, shift_rotate_reg<"srlv", GPR32Opnd>,
+ def SRLV_MM : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV>,
SRLV_FM_MM<0x50, 0>;
- def SRAV_MM : MMRel, shift_rotate_reg<"srav", GPR32Opnd>,
+ def SRAV_MM : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV>,
SRLV_FM_MM<0x90, 0>;
- def ROTR_MM : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd>,
+ def ROTR_MM : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR>,
SRA_FM_MM<0xc0, 0>;
- def ROTRV_MM : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd>,
+ def ROTRV_MM : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV>,
SRLV_FM_MM<0xd0, 0>;
/// Load and Store Instructions - aligned
@@ -118,6 +179,8 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def SW_MM : Store<"sw", GPR32Opnd>, MMRel, LW_FM_MM<0x3e>;
}
+ def LWU_MM : LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU>, LL_FM_MM<0xe>;
+
/// Load and Store Instructions - unaligned
def LWL_MM : LoadLeftRightMM<"lwl", MipsLWL, GPR32Opnd, mem_mm_12>,
LWL_FM_MM<0x0>;
@@ -133,9 +196,9 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
NoItinerary>, ADD_FM_MM<0, 0x58>;
def MOVN_I_MM : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd,
NoItinerary>, ADD_FM_MM<0, 0x18>;
- def MOVT_I_MM : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, IIAlu>,
+ def MOVT_I_MM : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT>,
CMov_F_I_FM_MM<0x25>;
- def MOVF_I_MM : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, IIAlu>,
+ def MOVF_I_MM : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF>,
CMov_F_I_FM_MM<0x5>;
/// Move to/from HI/LO
@@ -149,18 +212,18 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
MFLO_FM_MM<0x075>;
/// Multiply Add/Sub Instructions
- def MADD_MM : MMRel, MArithR<"madd", 1>, MULT_FM_MM<0x32c>;
- def MADDU_MM : MMRel, MArithR<"maddu", 1>, MULT_FM_MM<0x36c>;
- def MSUB_MM : MMRel, MArithR<"msub">, MULT_FM_MM<0x3ac>;
- def MSUBU_MM : MMRel, MArithR<"msubu">, MULT_FM_MM<0x3ec>;
+ def MADD_MM : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM_MM<0x32c>;
+ def MADDU_MM : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM_MM<0x36c>;
+ def MSUB_MM : MMRel, MArithR<"msub", II_MSUB>, MULT_FM_MM<0x3ac>;
+ def MSUBU_MM : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM_MM<0x3ec>;
/// Count Leading
def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM_MM<0x16c>;
def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM_MM<0x12c>;
/// Sign Ext In Register Instructions.
- def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd>, SEB_FM_MM<0x0ac>;
- def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd>, SEB_FM_MM<0x0ec>;
+ def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>, SEB_FM_MM<0x0ac>;
+ def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>, SEB_FM_MM<0x0ec>;
/// Word Swap Bytes Within Halfwords
def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>;
@@ -175,13 +238,9 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def J_MM : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
J_FM_MM<0x35>;
def JAL_MM : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>;
- def TAILCALL_MM : MMRel, JumpFJ<calltarget_mm, "j", MipsTailCall, imm,
- "tcall">, J_FM_MM<0x3d>, IsTailCall;
}
def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>;
- def JALR_MM : MMRel, JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
- def TAILCALL_R_MM : MMRel, JumpFR<"tcallr", GPR32Opnd, MipsTailCall>,
- JR_FM_MM<0x3c>, IsTailCall;
+ def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
def RET_MM : MMRel, RetBase<"ret", GPR32Opnd>, JR_FM_MM<0x3c>;
/// Branch Instructions
@@ -202,6 +261,16 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def BLTZAL_MM : MMRel, BGEZAL_FT<"bltzal", brtarget_mm, GPR32Opnd>,
BGEZAL_FM_MM<0x01>;
+ /// Control Instructions
+ def SYNC_MM : MMRel, SYNC_FT<"sync">, SYNC_FM_MM;
+ def BREAK_MM : MMRel, BRK_FT<"break">, BRK_FM_MM;
+ def SYSCALL_MM : MMRel, SYS_FT<"syscall">, SYS_FM_MM;
+ def WAIT_MM : WaitMM<"wait">, WAIT_FM_MM;
+ def ERET_MM : MMRel, ER_FT<"eret">, ER_FM_MM<0x3cd>;
+ def DERET_MM : MMRel, ER_FT<"deret">, ER_FM_MM<0x38d>;
+ def EI_MM : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM_MM<0x15d>;
+ def DI_MM : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM_MM<0x11d>;
+
/// Trap Instructions
def TEQ_MM : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM_MM<0x0>;
def TGE_MM : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM_MM<0x08>;
@@ -216,4 +285,16 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
def TLTI_MM : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM_MM<0x08>;
def TLTIU_MM : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM_MM<0x0a>;
def TNEI_MM : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM_MM<0x0c>;
+
+ /// Load-linked, Store-conditional
+ def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>;
+ def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroMips instruction aliases
+//===----------------------------------------------------------------------===//
+
+let Predicates = [InMicroMips] in {
+ def : InstAlias<"wait", (WAIT_MM 0x0), 1>;
}
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index e796deb..d512d65 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -23,6 +23,7 @@ namespace llvm {
class FunctionPass;
FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
+ FunctionPass *createMipsOptimizePICCallPass(MipsTargetMachine &TM);
FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM);
FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index b8e3f39..10a4699 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -63,10 +63,13 @@ def FeatureMips32r2 : SubtargetFeature<"mips32r2", "MipsArchVersion",
"Mips32r2", "Mips32r2 ISA Support",
[FeatureMips32, FeatureSEInReg, FeatureSwap,
FeatureFPIdx]>;
+def FeatureMips4 : SubtargetFeature<"mips4", "MipsArchVersion",
+ "Mips4", "MIPS IV ISA Support",
+ [FeatureGP64Bit, FeatureFP64Bit,
+ FeatureCondMov]>;
def FeatureMips64 : SubtargetFeature<"mips64", "MipsArchVersion",
"Mips64", "Mips64 ISA Support",
- [FeatureGP64Bit, FeatureFP64Bit,
- FeatureMips32, FeatureFPIdx]>;
+ [FeatureMips4, FeatureMips32, FeatureFPIdx]>;
def FeatureMips64r2 : SubtargetFeature<"mips64r2", "MipsArchVersion",
"Mips64r2", "Mips64r2 ISA Support",
[FeatureMips64, FeatureMips32r2]>;
@@ -83,6 +86,10 @@ def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">;
def FeatureMicroMips : SubtargetFeature<"micromips", "InMicroMipsMode", "true",
"microMips mode">;
+def FeatureCnMips : SubtargetFeature<"cnmips", "HasCnMips",
+ "true", "Octeon cnMIPS Support",
+ [FeatureMips64r2]>;
+
//===----------------------------------------------------------------------===//
// Mips processors supported.
//===----------------------------------------------------------------------===//
@@ -90,16 +97,13 @@ def FeatureMicroMips : SubtargetFeature<"micromips", "InMicroMipsMode", "true",
class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, MipsGenericItineraries, Features>;
-def : Proc<"mips32", [FeatureMips32]>;
-def : Proc<"mips32r2", [FeatureMips32r2]>;
-def : Proc<"mips64", [FeatureMips64]>;
-def : Proc<"mips64r2", [FeatureMips64r2]>;
-def : Proc<"mips16", [FeatureMips16]>;
-
-def MipsAsmWriter : AsmWriter {
- string AsmWriterClassName = "InstPrinter";
- bit isMCAsmWriter = 1;
-}
+def : Proc<"mips32", [FeatureMips32, FeatureO32]>;
+def : Proc<"mips32r2", [FeatureMips32r2, FeatureO32]>;
+def : Proc<"mips4", [FeatureMips4, FeatureN64]>;
+def : Proc<"mips64", [FeatureMips64, FeatureN64]>;
+def : Proc<"mips64r2", [FeatureMips64r2, FeatureN64]>;
+def : Proc<"mips16", [FeatureMips16, FeatureO32]>;
+def : Proc<"octeon", [FeatureMips64r2, FeatureN64, FeatureCnMips]>;
def MipsAsmParser : AsmParser {
let ShouldEmitMatchRegisterName = 0;
@@ -116,6 +120,5 @@ def MipsAsmParserVariant : AsmParserVariant {
def Mips : Target {
let InstructionSet = MipsInstrInfo;
let AssemblyParsers = [MipsAsmParser];
- let AssemblyWriters = [MipsAsmWriter];
let AssemblyParserVariants = [MipsAsmParserVariant];
}
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 6655ff9..028b049 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -15,6 +15,7 @@
#include "MCTargetDesc/MipsBaseInfo.h"
#include "Mips16InstrInfo.h"
#include "MipsInstrInfo.h"
+#include "MipsRegisterInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -47,27 +48,27 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const {
TII.makeFrame(Mips::SP, StackSize, MBB, MBBI);
// emit ".cfi_def_cfa_offset StackSize"
- MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl,
- TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel);
- MMI.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(AdjustSPLabel, -StackSize));
-
- MCSymbol *CSLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl,
- TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel);
- unsigned S2 = MRI->getDwarfRegNum(Mips::S2, true);
- MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, S2, -8));
-
- unsigned S1 = MRI->getDwarfRegNum(Mips::S1, true);
- MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, S1, -12));
-
- unsigned S0 = MRI->getDwarfRegNum(Mips::S0, true);
- MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, S0, -16));
-
- unsigned RA = MRI->getDwarfRegNum(Mips::RA, true);
- MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, RA, -4));
-
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+ if (CSI.size()) {
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+ for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+ E = CSI.end(); I != E; ++I) {
+ int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+ unsigned Reg = I->getReg();
+ unsigned DReg = MRI->getDwarfRegNum(Reg, true);
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, DReg, Offset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
if (hasFP(MF))
BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0)
.addReg(Mips::SP);
@@ -168,10 +169,15 @@ Mips16FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
void Mips16FrameLowering::
processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS) const {
- MF.getRegInfo().setPhysRegUsed(Mips::RA);
- MF.getRegInfo().setPhysRegUsed(Mips::S0);
- MF.getRegInfo().setPhysRegUsed(Mips::S1);
- MF.getRegInfo().setPhysRegUsed(Mips::S2);
+ const Mips16InstrInfo &TII =
+ *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo());
+ const MipsRegisterInfo &RI = TII.getRegisterInfo();
+ const BitVector Reserved = RI.getReservedRegs(MF);
+ bool SaveS2 = Reserved[Mips::S2];
+ if (SaveS2)
+ MF.getRegInfo().setPhysRegUsed(Mips::S2);
+ if (hasFP(MF))
+ MF.getRegInfo().setPhysRegUsed(Mips::S0);
}
const MipsFrameLowering *
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index 81bf18c..d321e21 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -14,6 +14,7 @@
#define DEBUG_TYPE "mips16-hard-float"
#include "Mips16HardFloat.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -167,6 +168,11 @@ static bool needsFPReturnHelper(Function &F) {
return whichFPReturnVariant(RetType) != NoFPRet;
}
+static bool needsFPReturnHelper(const FunctionType &FT) {
+ Type* RetType = FT.getReturnType();
+ return whichFPReturnVariant(RetType) != NoFPRet;
+}
+
static bool needsFPHelperFromSig(Function &F) {
return needsFPStubFromParams(F) || needsFPReturnHelper(F);
}
@@ -239,8 +245,8 @@ static void swapFPIntParams
// Make sure that we know we already need a stub for this function.
// Having called needsFPHelperFromSig
//
-static void assureFPCallStub(Function &F, Module *M,
- const MipsSubtarget &Subtarget){
+static void assureFPCallStub(Function &F, Module *M,
+ const MipsSubtarget &Subtarget) {
// for now we only need them for static relocation
if (Subtarget.getRelocationModel() == Reloc::PIC_)
return;
@@ -400,13 +406,31 @@ static bool fixupFPReturnAndCall
Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T, NULL));
CallInst::Create(F, Params, "", &Inst );
} else if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+ const Value* V = CI->getCalledValue();
+ const Type* T = 0;
+ if (V) T = V->getType();
+ const PointerType *PFT=0;
+ if (T) PFT = dyn_cast<PointerType>(T);
+ const FunctionType *FT=0;
+ if (PFT) FT = dyn_cast<FunctionType>(PFT->getElementType());
+ Function *F_ = CI->getCalledFunction();
+ if (FT && needsFPReturnHelper(*FT) &&
+ !(F_ && isIntrinsicInline(F_))) {
+ Modified=true;
+ F.addFnAttr("saveS2");
+ }
+ if (F_ && !isIntrinsicInline(F_)) {
// pic mode calls are handled by already defined
// helper functions
- if (Subtarget.getRelocationModel() != Reloc::PIC_ ) {
- Function *F_ = CI->getCalledFunction();
- if (F_ && !isIntrinsicInline(F_) && needsFPHelperFromSig(*F_)) {
- assureFPCallStub(*F_, M, Subtarget);
+ if (needsFPReturnHelper(*F_)) {
Modified=true;
+ F.addFnAttr("saveS2");
+ }
+ if (Subtarget.getRelocationModel() != Reloc::PIC_ ) {
+ if (needsFPHelperFromSig(*F_)) {
+ assureFPCallStub(*F_, M, Subtarget);
+ Modified=true;
+ }
}
}
}
@@ -476,8 +500,9 @@ namespace llvm {
// declared via attributes as nomips16, we must:
// 1) fixup all returns of float, double, single and double complex
// by calling a helper function before the actual return.
-// 2) generate helper functions (stubs) that can be called by mips32 functions
-// that will move parameters passed normally passed in floating point
+// 2) generate helper functions (stubs) that can be called by mips32
+// functions that will move parameters normally passed in
+// floating point
// registers to the soft float equivalents.
// 3) in the case of static relocation, generate helper functions so that
// mips16 functions can call extern functions of unknown type (mips16 or
diff --git a/lib/Target/Mips/Mips16HardFloatInfo.cpp b/lib/Target/Mips/Mips16HardFloatInfo.cpp
new file mode 100644
index 0000000..d8b685e
--- /dev/null
+++ b/lib/Target/Mips/Mips16HardFloatInfo.cpp
@@ -0,0 +1,50 @@
+//===---- Mips16HardFloatInfo.cpp for Mips16 Hard Float -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the Mips16HardFloatInfo
+// namespace.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16HardFloatInfo.h"
+#include <string.h>
+
+namespace llvm {
+
+namespace Mips16HardFloatInfo {
+
+const FuncNameSignature PredefinedFuncs[] = {
+ { "__floatdidf", { NoSig, DRet } },
+ { "__floatdisf", { NoSig, FRet } },
+ { "__floatundidf", { NoSig, DRet } },
+ { "__fixsfdi", { FSig, NoFPRet } },
+ { "__fixunsdfsi", { DSig, NoFPRet } },
+ { "__fixunsdfdi", { DSig, NoFPRet } },
+ { "__fixdfdi", { DSig, NoFPRet } },
+ { "__fixunssfsi", { FSig, NoFPRet } },
+ { "__fixunssfdi", { FSig, NoFPRet } },
+ { "__floatundisf", { NoSig, FRet } },
+ { 0, { NoSig, NoFPRet } }
+};
+
+// Just do a linear search for now; there are very few of these special cases.
+//
+extern FuncSignature const *findFuncSignature(const char *name) {
+ const char *name_;
+ int i = 0;
+ while (PredefinedFuncs[i].Name) {
+ name_ = PredefinedFuncs[i].Name;
+ if (strcmp(name, name_) == 0)
+ return &PredefinedFuncs[i].Signature;
+ i++;
+ }
+ return 0;
+}
+}
+}
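(Sketch only, not part of the patch: a lookup against the table above. The
entry for "__fixsfdi" is { FSig, NoFPRet }, so the check below succeeds; an
unknown name makes findFuncSignature return 0. The helper name is hypothetical.)

    #include "Mips16HardFloatInfo.h"
    using namespace llvm::Mips16HardFloatInfo;

    static bool stubNeedsNoFPReturn(const char *Name) {
      const FuncSignature *Sig = findFuncSignature(Name);
      return Sig && Sig->RetSig == NoFPRet;
    }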
diff --git a/lib/Target/Mips/Mips16HardFloatInfo.h b/lib/Target/Mips/Mips16HardFloatInfo.h
new file mode 100644
index 0000000..02444d9
--- /dev/null
+++ b/lib/Target/Mips/Mips16HardFloatInfo.h
@@ -0,0 +1,50 @@
+//===---- Mips16HardFloatInfo.h for Mips16 Hard Float --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some data structures relevant to the implementation of
+// Mips16 hard float.
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16HARDFLOATINFO_H
+#define MIPS16HARDFLOATINFO_H
+
+namespace llvm {
+
+namespace Mips16HardFloatInfo {
+
+// Return types that matter for hard float are:
+// float, double, complex float, and complex double
+//
+enum FPReturnVariant { FRet, DRet, CFRet, CDRet, NoFPRet };
+
+//
+// Parameter type that matter are float, (float, float), (float, double),
+// double, (double, double), (double, float)
+//
+enum FPParamVariant { FSig, FFSig, FDSig, DSig, DDSig, DFSig, NoSig };
+
+struct FuncSignature {
+ FPParamVariant ParamSig;
+ FPReturnVariant RetSig;
+};
+
+struct FuncNameSignature {
+ const char *Name;
+ FuncSignature Signature;
+};
+
+extern const FuncNameSignature PredefinedFuncs[];
+
+extern FuncSignature const *findFuncSignature(const char *name);
+}
+}
+
+#endif
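As a rough illustration of how these variants are consumed, the sketch below mirrors the switches that MipsAsmPrinter::EmitFPCallStub uses later in this patch to describe a stub's return and parameter types. The describe helper is hypothetical and for illustration only.

#include "Mips16HardFloatInfo.h"
#include <string>

using namespace llvm::Mips16HardFloatInfo;

// Render a FuncSignature as "rettype (params)", matching the strings used
// in the stub comments emitted by MipsAsmPrinter::EmitFPCallStub.
static std::string describe(const FuncSignature &Sig) {
  const char *Ret = "";
  switch (Sig.RetSig) {
  case FRet:    Ret = "float";          break;
  case DRet:    Ret = "double";         break;
  case CFRet:   Ret = "complex";        break;
  case CDRet:   Ret = "double complex"; break;
  case NoFPRet: Ret = "";               break;
  }
  const char *Parms = "";
  switch (Sig.ParamSig) {
  case FSig:  Parms = "float";          break;
  case FFSig: Parms = "float, float";   break;
  case FDSig: Parms = "float, double";  break;
  case DSig:  Parms = "double";         break;
  case DDSig: Parms = "double, double"; break;
  case DFSig: Parms = "double, float";  break;
  case NoSig: Parms = "";               break;
  }
  return std::string(Ret) + " (" + Parms + ")";
}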
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index 4948f40..9e36546 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -13,8 +13,8 @@
#define DEBUG_TYPE "mips-isel"
#include "Mips16ISelDAGToDAG.h"
-#include "Mips.h"
#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
#include "MipsAnalyzeImmediate.h"
#include "MipsMachineFunction.h"
#include "MipsRegisterInfo.h"
@@ -24,11 +24,11 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
-#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index 61d8bb8..5c6f302 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -12,12 +12,14 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "mips-lower"
#include "Mips16ISelLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
#include "MipsRegisterInfo.h"
#include "MipsTargetMachine.h"
-#include "MCTargetDesc/MipsBaseInfo.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include <string>
using namespace llvm;
@@ -159,7 +161,9 @@ llvm::createMips16TargetLowering(MipsTargetMachine &TM) {
}
bool
-Mips16TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+Mips16TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+ unsigned,
+ bool *Fast) const {
return false;
}
@@ -429,8 +433,7 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
const char* Mips16HelperFunction = 0;
bool NeedMips16Helper = false;
- if (getTargetMachine().Options.UseSoftFloat &&
- Subtarget->inMips16HardFloat()) {
+ if (Subtarget->inMips16HardFloat()) {
//
// currently we don't have symbols tagged with the mips16 or mips32
// qualifier so we will assume that we don't know what kind it is.
@@ -444,7 +447,29 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
Find))
LookupHelper = false;
else {
- Mips16IntrinsicHelperType IntrinsicFind = {S->getSymbol(), ""};
+ const char *Symbol = S->getSymbol();
+ Mips16IntrinsicHelperType IntrinsicFind = { Symbol, "" };
+ const Mips16HardFloatInfo::FuncSignature *Signature =
+ Mips16HardFloatInfo::findFuncSignature(Symbol);
+ if (!IsPICCall && (Signature && (FuncInfo->StubsNeeded.find(Symbol) ==
+ FuncInfo->StubsNeeded.end()))) {
+ FuncInfo->StubsNeeded[Symbol] = Signature;
+ //
+          // S2 is normally saved only if the stub is for a function which
+          // returns a float or double value, and is not saved otherwise. This
+          // is because more work is required after the function the stub is
+          // calling completes, so the stub cannot return directly; since the
+          // stub has no stack space in which to store the return address, S2
+          // is used for that purpose.
+ // In order to take advantage of not saving S2, we need to also
+ // optimize the call in the stub and this requires some further
+ // functionality in MipsAsmPrinter which we don't have yet.
+ // So for now we always save S2. The optimization will be done
+ // in a follow-on patch.
+ //
+ if (1 || (Signature->RetSig != Mips16HardFloatInfo::NoFPRet))
+ FuncInfo->setSaveS2();
+ }
// one more look at list of intrinsics
if (std::binary_search(Mips16IntrinsicHelper,
array_endof(Mips16IntrinsicHelper),
@@ -524,8 +549,7 @@ emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const {
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Next, add the true and fallthrough blocks as its successors.
@@ -587,8 +611,7 @@ MachineBasicBlock *Mips16TargetLowering::emitSelT16
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Next, add the true and fallthrough blocks as its successors.
@@ -652,8 +675,7 @@ MachineBasicBlock *Mips16TargetLowering::emitSeliT16
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Next, add the true and fallthrough blocks as its successors.
@@ -747,8 +769,8 @@ MachineBasicBlock *Mips16TargetLowering::emitFEXT_CCRX16_ins(
unsigned CC = MI->getOperand(0).getReg();
unsigned regX = MI->getOperand(1).getReg();
unsigned regY = MI->getOperand(2).getReg();
- BuildMI(*BB, MI, MI->getDebugLoc(),
- TII->get(SltOpc)).addReg(regX).addReg(regY);
+ BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(SltOpc)).addReg(regX).addReg(
+ regY);
BuildMI(*BB, MI, MI->getDebugLoc(),
TII->get(Mips::MoveR3216), CC).addReg(Mips::T8);
MI->eraseFromParent(); // The pseudo instruction is gone now.
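The stub bookkeeping added in getOpndList above can be summarized in a small self-contained sketch. The StubState struct is hypothetical; in the backend the StubsNeeded map and the save-S2 flag live on MipsFunctionInfo, and the map is keyed by uniqued const char * symbol names (the same map type MipsAsmPrinter.h declares later in this patch), so pointer comparison suffices.

#include "Mips16HardFloatInfo.h"
#include <map>

using namespace llvm::Mips16HardFloatInfo;

struct StubState {
  std::map<const char *, const FuncSignature *> StubsNeeded;
  bool SaveS2 = false;
};

// Record that a hard-float call stub is needed for Symbol (non-PIC only).
static void recordStubIfNeeded(StubState &S, const char *Symbol,
                               bool IsPICCall) {
  const FuncSignature *Signature = findFuncSignature(Symbol);
  if (IsPICCall || !Signature || S.StubsNeeded.count(Symbol))
    return;
  S.StubsNeeded[Symbol] = Signature;
  // For now S2 is always saved; strictly it is only required when the
  // callee returns a float or double (see the comment in the hunk above).
  S.SaveS2 = true;
}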
diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h
index 33b953f..618ec90 100644
--- a/lib/Target/Mips/Mips16ISelLowering.h
+++ b/lib/Target/Mips/Mips16ISelLowering.h
@@ -21,7 +21,8 @@ namespace llvm {
public:
explicit Mips16TargetLowering(MipsTargetMachine &TM);
- virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+ virtual bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+ bool *Fast) const;
virtual MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index 000ea28..43c2fbd 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -1,3 +1,4 @@
+
//===-- Mips16InstrInfo.cpp - Mips16 Instruction Information --------------===//
//
// The LLVM Compiler Infrastructure
@@ -28,13 +29,6 @@
using namespace llvm;
-static cl::opt<bool> NeverUseSaveRestore(
- "mips16-never-use-save-restore",
- cl::init(false),
- cl::desc("For testing ability to adjust stack pointer "
- "without save/restore instruction"),
- cl::Hidden);
-
Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm)
: MipsInstrInfo(tm, Mips::Bimm16),
@@ -175,45 +169,56 @@ unsigned Mips16InstrInfo::getOppositeBranchOpc(unsigned Opc) const {
return 0;
}
+static void addSaveRestoreRegs(MachineInstrBuilder &MIB,
+ const std::vector<CalleeSavedInfo> &CSI, unsigned Flags=0) {
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ // Add the callee-saved register as live-in. Do not add if the register is
+ // RA and return address is taken, because it has already been added in
+ // method MipsTargetLowering::LowerRETURNADDR.
+ // It's killed at the spill, unless the register is RA and return address
+ // is taken.
+ unsigned Reg = CSI[e-i-1].getReg();
+ switch (Reg) {
+ case Mips::RA:
+ case Mips::S0:
+ case Mips::S1:
+ MIB.addReg(Reg, Flags);
+ break;
+ case Mips::S2:
+ break;
+ default:
+ llvm_unreachable("unexpected mips16 callee saved register");
+
+ }
+ }
+}
// Adjust SP by FrameSize bytes. Save RA, S0, S1
void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
- if (!NeverUseSaveRestore) {
- if (isUInt<11>(FrameSize))
- BuildMI(MBB, I, DL, get(Mips::SaveRaF16)).addImm(FrameSize);
- else {
- int Base = 2040; // should create template function like isUInt that
- // returns largest possible n bit unsigned integer
- int64_t Remainder = FrameSize - Base;
- BuildMI(MBB, I, DL, get(Mips::SaveRaF16)). addImm(Base);
- if (isInt<16>(-Remainder))
- BuildAddiuSpImm(MBB, I, -Remainder);
- else
- adjustStackPtrBig(SP, -Remainder, MBB, I, Mips::V0, Mips::V1);
- }
-
- }
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const BitVector Reserved = RI.getReservedRegs(MF);
+ bool SaveS2 = Reserved[Mips::S2];
+ MachineInstrBuilder MIB;
+ unsigned Opc = ((FrameSize <= 128) && !SaveS2)? Mips::Save16:Mips::SaveX16;
+ MIB = BuildMI(MBB, I, DL, get(Opc));
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ addSaveRestoreRegs(MIB, CSI);
+ if (SaveS2)
+ MIB.addReg(Mips::S2);
+ if (isUInt<11>(FrameSize))
+ MIB.addImm(FrameSize);
else {
- //
- // sw ra, -4[sp]
- // sw s1, -8[sp]
- // sw s0, -12[sp]
-
- MachineInstrBuilder MIB1 = BuildMI(MBB, I, DL, get(Mips::SwRxSpImmX16),
- Mips::RA);
- MIB1.addReg(Mips::SP);
- MIB1.addImm(-4);
- MachineInstrBuilder MIB2 = BuildMI(MBB, I, DL, get(Mips::SwRxSpImmX16),
- Mips::S1);
- MIB2.addReg(Mips::SP);
- MIB2.addImm(-8);
- MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::SwRxSpImmX16),
- Mips::S0);
- MIB3.addReg(Mips::SP);
- MIB3.addImm(-12);
- adjustStackPtrBig(SP, -FrameSize, MBB, I, Mips::V0, Mips::V1);
+    int Base = 2040; // should create a template function like isUInt that
+                     // returns the largest possible n-bit unsigned integer
+ int64_t Remainder = FrameSize - Base;
+ MIB.addImm(Base);
+ if (isInt<16>(-Remainder))
+ BuildAddiuSpImm(MBB, I, -Remainder);
+ else
+ adjustStackPtrBig(SP, -Remainder, MBB, I, Mips::V0, Mips::V1);
}
}
@@ -222,42 +227,31 @@ void Mips16InstrInfo::restoreFrame(unsigned SP, int64_t FrameSize,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
- if (!NeverUseSaveRestore) {
- if (isUInt<11>(FrameSize))
- BuildMI(MBB, I, DL, get(Mips::RestoreRaF16)).addImm(FrameSize);
- else {
- int Base = 2040; // should create template function like isUInt that
- // returns largest possible n bit unsigned integer
- int64_t Remainder = FrameSize - Base;
- if (isInt<16>(Remainder))
- BuildAddiuSpImm(MBB, I, Remainder);
- else
- adjustStackPtrBig(SP, Remainder, MBB, I, Mips::A0, Mips::A1);
- BuildMI(MBB, I, DL, get(Mips::RestoreRaF16)). addImm(Base);
- }
- }
- else {
- adjustStackPtrBig(SP, FrameSize, MBB, I, Mips::A0, Mips::A1);
- // lw ra, -4[sp]
- // lw s1, -8[sp]
- // lw s0, -12[sp]
- MachineInstrBuilder MIB1 = BuildMI(MBB, I, DL, get(Mips::LwRxSpImmX16),
- Mips::A0);
- MIB1.addReg(Mips::SP);
- MIB1.addImm(-4);
- MachineInstrBuilder MIB0 = BuildMI(MBB, I, DL, get(Mips::Move32R16),
- Mips::RA);
- MIB0.addReg(Mips::A0);
- MachineInstrBuilder MIB2 = BuildMI(MBB, I, DL, get(Mips::LwRxSpImmX16),
- Mips::S1);
- MIB2.addReg(Mips::SP);
- MIB2.addImm(-8);
- MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::LwRxSpImmX16),
- Mips::S0);
- MIB3.addReg(Mips::SP);
- MIB3.addImm(-12);
+ MachineFunction *MF = MBB.getParent();
+ MachineFrameInfo *MFI = MF->getFrameInfo();
+ const BitVector Reserved = RI.getReservedRegs(*MF);
+ bool SaveS2 = Reserved[Mips::S2];
+ MachineInstrBuilder MIB;
+ unsigned Opc = ((FrameSize <= 128) && !SaveS2)?
+ Mips::Restore16:Mips::RestoreX16;
+
+ if (!isUInt<11>(FrameSize)) {
+ unsigned Base = 2040;
+ int64_t Remainder = FrameSize - Base;
+    FrameSize = Base; // should create a template function like isUInt that
+                      // returns the largest possible n-bit unsigned integer
+
+ if (isInt<16>(Remainder))
+ BuildAddiuSpImm(MBB, I, Remainder);
+ else
+ adjustStackPtrBig(SP, Remainder, MBB, I, Mips::A0, Mips::A1);
}
-
+ MIB = BuildMI(MBB, I, DL, get(Opc));
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ addSaveRestoreRegs(MIB, CSI, RegState::Define);
+ if (SaveS2)
+ MIB.addReg(Mips::S2, RegState::Define);
+ MIB.addImm(FrameSize);
}
// Adjust SP by Amount bytes where bytes can be up to 32bit number.
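A self-contained sketch of the frame-size handling shared by makeFrame and restoreFrame above: the short Save16/Restore16 encodings are chosen only when the frame is at most 128 bytes and S2 need not be saved, and any frame size that does not fit in 11 unsigned bits is split into the fixed base 2040 plus a separate SP adjustment for the remainder (negated on entry, positive on exit). The helper below is hypothetical and illustrative only.

#include <cstdint>
#include <utility>

// Returns {immediate encoded on the save/restore, leftover SP adjustment}.
static std::pair<int64_t, int64_t> splitMips16FrameSize(int64_t FrameSize) {
  const int64_t Base = 2040; // constant used by the backend above
  if (FrameSize >= 0 && FrameSize < (1 << 11)) // i.e. isUInt<11>(FrameSize)
    return {FrameSize, 0};
  return {Base, FrameSize - Base};
}

// e.g. splitMips16FrameSize(96)   -> {96, 0}      (fits the instruction)
//      splitMips16FrameSize(4096) -> {2040, 2056} (remainder via SP adjust)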
@@ -281,7 +275,7 @@ void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount,
//
//
MachineInstrBuilder MIB1 = BuildMI(MBB, I, DL, get(Mips::LwConstant32), Reg1);
- MIB1.addImm(Amount);
+ MIB1.addImm(Amount).addImm(-1);
MachineInstrBuilder MIB2 = BuildMI(MBB, I, DL, get(Mips::MoveR3216), Reg2);
MIB2.addReg(Mips::SP, RegState::Kill);
MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::AdduRxRyRz16), Reg1);
@@ -393,7 +387,7 @@ Mips16InstrInfo::loadImmediate(unsigned FrameReg,
}
else
Available.reset(Reg);
- BuildMI(MBB, II, DL, get(Mips::LwConstant32), Reg).addImm(Imm);
+ BuildMI(MBB, II, DL, get(Mips::LwConstant32), Reg).addImm(Imm).addImm(-1);
NewImm = 0;
if (FrameReg == Mips::SP) {
SpReg = Available.find_first();
@@ -417,7 +411,7 @@ Mips16InstrInfo::loadImmediate(unsigned FrameReg,
BuildMI(MBB, II, DL, get(Mips:: AdduRxRyRz16), Reg).addReg(FrameReg)
.addReg(Reg, RegState::Kill);
if (FirstRegSaved || SecondRegSaved) {
- II = llvm::next(II);
+ II = std::next(II);
if (FirstRegSaved)
copyPhysReg(MBB, II, DL, FirstRegSaved, FirstRegSavedTo, true);
if (SecondRegSaved)
@@ -426,22 +420,6 @@ Mips16InstrInfo::loadImmediate(unsigned FrameReg,
return Reg;
}
-/// This function generates the sequence of instructions needed to get the
-/// result of adding register REG and immediate IMM.
-unsigned
-Mips16InstrInfo::basicLoadImmediate(
- unsigned FrameReg,
- int64_t Imm, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator II, DebugLoc DL,
- unsigned &NewImm) const {
- const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass;
- MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
- unsigned Reg = RegInfo.createVirtualRegister(RC);
- BuildMI(MBB, II, DL, get(Mips::LwConstant32), Reg).addImm(Imm);
- NewImm = 0;
- return Reg;
-}
-
unsigned Mips16InstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
return (Opc == Mips::BeqzRxImmX16 || Opc == Mips::BimmX16 ||
Opc == Mips::Bimm16 ||
diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
index d9a594b..e93925c 100644
--- a/lib/Target/Mips/Mips16InstrInfo.h
+++ b/lib/Target/Mips/Mips16InstrInfo.h
@@ -88,11 +88,6 @@ public:
MachineBasicBlock::iterator II, DebugLoc DL,
unsigned &NewImm) const;
- unsigned basicLoadImmediate(unsigned FrameReg,
- int64_t Imm, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator II, DebugLoc DL,
- unsigned &NewImm) const;
-
static bool validImmediate(unsigned Opcode, unsigned Reg, int64_t Amount);
static bool validSpImm8(int offset) {
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index 7441c78..11166c4 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -119,7 +119,18 @@ class FJAL16_ins<bits<1> _X, string asmstr,
!strconcat(asmstr, "\t$imm\n\tnop"),[],
itin> {
let isCodeGenOnly=1;
+ let Size=6;
}
+
+class FJALB16_ins<bits<1> _X, string asmstr,
+ InstrItinClass itin>:
+ FJAL16<_X, (outs), (ins simm20:$imm),
+ !strconcat(asmstr, "\t$imm\t# branch\n\tnop"),[],
+ itin> {
+ let isCodeGenOnly=1;
+ let Size=6;
+}
+
//
// EXT-I instruction format
//
@@ -289,7 +300,7 @@ class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>:
//
// These are pseudo formats for multiply
-// This first one can be changed to non pseudo now.
+// This first one can be changed to non-pseudo now.
//
// MULT
//
@@ -734,6 +745,13 @@ def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> {
def Jal16 : FJAL16_ins<0b0, "jal", IIAlu> {
let hasDelaySlot = 0; // not true, but we add the nop for now
let isCall=1;
+ let Defs = [RA];
+}
+
+def JalB16 : FJALB16_ins<0b0, "jal", IIAlu>, branch16 {
+ let hasDelaySlot = 0; // not true, but we add the nop for now
+ let isBranch=1;
+ let Defs = [RA];
}
//
@@ -769,7 +787,7 @@ def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIAlu> {
// Purpose: Load Byte (Extended)
// To load a byte from memory as a signed value.
//
-def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lb", mem16, IILoad>, MayLoad{
+def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lb", mem16, II_LB>, MayLoad{
let isCodeGenOnly = 1;
}
@@ -779,7 +797,7 @@ def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lb", mem16, IILoad>, MayLoad{
// To load a byte from memory as an unsigned value.
//
def LbuRxRyOffMemX16:
- FEXT_RRI16_mem_ins<0b10100, "lbu", mem16, IILoad>, MayLoad {
+ FEXT_RRI16_mem_ins<0b10100, "lbu", mem16, II_LBU>, MayLoad {
let isCodeGenOnly = 1;
}
@@ -788,7 +806,7 @@ def LbuRxRyOffMemX16:
// Purpose: Load Halfword signed (Extended)
// To load a halfword from memory as a signed value.
//
-def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lh", mem16, IILoad>, MayLoad{
+def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lh", mem16, II_LH>, MayLoad{
let isCodeGenOnly = 1;
}
@@ -798,7 +816,7 @@ def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lh", mem16, IILoad>, MayLoad{
// To load a halfword from memory as an unsigned value.
//
def LhuRxRyOffMemX16:
- FEXT_RRI16_mem_ins<0b10100, "lhu", mem16, IILoad>, MayLoad {
+ FEXT_RRI16_mem_ins<0b10100, "lhu", mem16, II_LHU>, MayLoad {
let isCodeGenOnly = 1;
}
@@ -825,7 +843,7 @@ def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIAlu> {
// Purpose: Load Word (Extended)
// To load a word from memory as a signed value.
//
-def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IILoad>, MayLoad{
+def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, II_LW>, MayLoad{
let isCodeGenOnly = 1;
}
@@ -833,13 +851,13 @@ def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IILoad>, MayLoad{
// Purpose: Load Word (SP-Relative, Extended)
// To load an SP-relative word from memory as a signed value.
//
-def LwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b10010, "lw", IILoad>, MayLoad{
+def LwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b10010, "lw", II_LW>, MayLoad{
let Uses = [SP];
}
-def LwRxPcTcp16: FRI16_TCP_ins<0b10110, "lw", IILoad>, MayLoad;
+def LwRxPcTcp16: FRI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad;
-def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", IILoad>, MayLoad;
+def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad;
//
// Format: MOVE r32, rz MIPS16e
// Purpose: Move
@@ -941,26 +959,18 @@ def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIAlu>, ArithLogic16Defs<1>;
// stack
//
-// fixed form for restoring RA and the frame
-// for direct object emitter, encoding needs to be adjusted for the
-// frame size
-//
-let ra=1, s=0,s0=1,s1=1 in
-def RestoreRaF16:
- FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
- "restore\t$$ra, $$s0, $$s1, $$s2, $frame_size", [], IILoad >, MayLoad {
+def Restore16:
+ FI8_SVRS16<0b1, (outs), (ins variable_ops),
+ "", [], II_RESTORE >, MayLoad {
let isCodeGenOnly = 1;
- let Defs = [S0, S1, S2, RA, SP];
+ let Defs = [SP];
let Uses = [SP];
}
-// Use Restore to increment SP since SP is not a Mip 16 register, this
-// is an easy way to do that which does not require a register.
-//
-let ra=0, s=0,s0=0,s1=0 in
-def RestoreIncSpF16:
- FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
- "restore\t$frame_size", [], IILoad >, MayLoad {
+
+def RestoreX16:
+ FI8_SVRS16<0b1, (outs), (ins variable_ops),
+ "", [], II_RESTORE >, MayLoad {
let isCodeGenOnly = 1;
let Defs = [SP];
let Uses = [SP];
@@ -973,23 +983,17 @@ def RestoreIncSpF16:
// To set up a stack frame on entry to a subroutine,
// saving return address and static registers, and adjusting stack
//
-let ra=1, s=1,s0=1,s1=1 in
-def SaveRaF16:
- FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
- "save\t$$ra, $$s0, $$s1, $$s2, $frame_size", [], IIStore >, MayStore {
+def Save16:
+ FI8_SVRS16<0b1, (outs), (ins variable_ops),
+ "", [], II_SAVE >, MayStore {
let isCodeGenOnly = 1;
- let Uses = [RA, SP, S0, S1, S2];
+ let Uses = [SP];
let Defs = [SP];
}
-//
-// Use Save to decrement the SP by a constant since SP is not
-// a Mips16 register.
-//
-let ra=0, s=0,s0=0,s1=0 in
-def SaveDecSpF16:
- FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
- "save\t$frame_size", [], IIStore >, MayStore {
+def SaveX16:
+ FI8_SVRS16<0b1, (outs), (ins variable_ops),
+ "", [], II_SAVE >, MayStore {
let isCodeGenOnly = 1;
let Uses = [SP];
let Defs = [SP];
@@ -1000,7 +1004,7 @@ def SaveDecSpF16:
// To store a byte to memory.
//
def SbRxRyOffMemX16:
- FEXT_RRI16_mem2_ins<0b11000, "sb", mem16, IIStore>, MayStore;
+ FEXT_RRI16_mem2_ins<0b11000, "sb", mem16, II_SB>, MayStore;
//
// Format: SEB rx MIPS16e
@@ -1138,7 +1142,7 @@ def SelTBtneZSltiu: SeliT<"btnez", "sltiu">;
// To store a halfword to memory.
//
def ShRxRyOffMemX16:
- FEXT_RRI16_mem2_ins<0b11001, "sh", mem16, IIStore>, MayStore;
+ FEXT_RRI16_mem2_ins<0b11001, "sh", mem16, II_SH>, MayStore;
//
// Format: SLL rx, ry, sa MIPS16e
@@ -1274,7 +1278,7 @@ def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIAlu>, ArithLogic16Defs<0>;
// To store a word to memory.
//
def SwRxRyOffMemX16:
- FEXT_RRI16_mem2_ins<0b11011, "sw", mem16, IIStore>, MayStore;
+ FEXT_RRI16_mem2_ins<0b11011, "sw", mem16, II_SW>, MayStore;
//
// Format: SW rx, offset(sp) MIPS16e
@@ -1282,7 +1286,7 @@ def SwRxRyOffMemX16:
// To store an SP-relative word to memory.
//
def SwRxSpImmX16: FEXT_RI16_SP_Store_explicit_ins
- <0b11010, "sw", IIStore>, MayStore;
+ <0b11010, "sw", II_SW>, MayStore;
//
//
@@ -1374,7 +1378,9 @@ def: Mips16Pat<
let isCall=1, hasDelaySlot=0 in
def JumpLinkReg16:
FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs),
- "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch>;
+ "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch> {
+ let Defs = [RA];
+}
// Mips16 pseudos
let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1,
@@ -1890,7 +1896,7 @@ def GotPrologue16:
MipsPseudo16<
(outs CPU16Regs:$rh, CPU16Regs:$rl),
(ins simm16:$immHi, simm16:$immLo),
- ".align 2\n\tli\t$rh, $immHi\n\taddiu\t$rl, $$pc, $immLo\n ",[]> ;
+ "li\t$rh, $immHi\n\taddiu\t$rl, $$pc, $immLo\n ",[]> ;
// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
def cpinst_operand : Operand<i32> {
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index 9d0f2c9..3a50ed9 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "Mips16RegisterInfo.h"
-#include "Mips16InstrInfo.h"
#include "Mips.h"
#include "Mips16InstrInfo.h"
#include "MipsAnalyzeImmediate.h"
@@ -25,9 +24,8 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/DebugInfo.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 15ef654..7115d11 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -54,7 +54,7 @@ let isPseudo = 1, isCodeGenOnly = 1 in {
let DecoderNamespace = "Mips64" in {
/// Arithmetic Instructions (ALU Immediate)
def DADDi : ArithLogicI<"daddi", simm16_64, GPR64Opnd>, ADDI_FM<0x18>;
-def DADDiu : ArithLogicI<"daddiu", simm16_64, GPR64Opnd, IIArith,
+def DADDiu : ArithLogicI<"daddiu", simm16_64, GPR64Opnd, II_DADDIU,
immSExt16, add>,
ADDI_FM<0x19>, IsAsCheapAsAMove;
@@ -63,86 +63,91 @@ def SLTi64 : SetCC_I<"slti", setlt, simm16_64, immSExt16, GPR64Opnd>,
SLTI_FM<0xa>;
def SLTiu64 : SetCC_I<"sltiu", setult, simm16_64, immSExt16, GPR64Opnd>,
SLTI_FM<0xb>;
-def ANDi64 : ArithLogicI<"andi", uimm16_64, GPR64Opnd, IILogic, immZExt16,
- and>,
+def ANDi64 : ArithLogicI<"andi", uimm16_64, GPR64Opnd, II_AND, immZExt16, and>,
ADDI_FM<0xc>;
-def ORi64 : ArithLogicI<"ori", uimm16_64, GPR64Opnd, IILogic, immZExt16,
- or>,
+def ORi64 : ArithLogicI<"ori", uimm16_64, GPR64Opnd, II_OR, immZExt16, or>,
ADDI_FM<0xd>;
-def XORi64 : ArithLogicI<"xori", uimm16_64, GPR64Opnd, IILogic, immZExt16,
- xor>,
+def XORi64 : ArithLogicI<"xori", uimm16_64, GPR64Opnd, II_XOR, immZExt16, xor>,
ADDI_FM<0xe>;
def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64>, LUI_FM;
}
/// Arithmetic Instructions (3-Operand, R-Type)
-def DADD : ArithLogicR<"dadd", GPR64Opnd>, ADD_FM<0, 0x2c>;
-def DADDu : ArithLogicR<"daddu", GPR64Opnd, 1, IIArith, add>,
+def DADD : ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>, ADD_FM<0, 0x2c>;
+def DADDu : ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>,
ADD_FM<0, 0x2d>;
-def DSUBu : ArithLogicR<"dsubu", GPR64Opnd, 0, IIArith, sub>,
+def DSUBu : ArithLogicR<"dsubu", GPR64Opnd, 0, II_DSUBU, sub>,
ADD_FM<0, 0x2f>;
+def DSUB : ArithLogicR<"dsub", GPR64Opnd, 0, II_DSUB, sub>, ADD_FM<0, 0x2e>;
let isCodeGenOnly = 1 in {
def SLT64 : SetCC_R<"slt", setlt, GPR64Opnd>, ADD_FM<0, 0x2a>;
def SLTu64 : SetCC_R<"sltu", setult, GPR64Opnd>, ADD_FM<0, 0x2b>;
-def AND64 : ArithLogicR<"and", GPR64Opnd, 1, IIArith, and>, ADD_FM<0, 0x24>;
-def OR64 : ArithLogicR<"or", GPR64Opnd, 1, IIArith, or>, ADD_FM<0, 0x25>;
-def XOR64 : ArithLogicR<"xor", GPR64Opnd, 1, IIArith, xor>, ADD_FM<0, 0x26>;
+def AND64 : ArithLogicR<"and", GPR64Opnd, 1, II_AND, and>, ADD_FM<0, 0x24>;
+def OR64 : ArithLogicR<"or", GPR64Opnd, 1, II_OR, or>, ADD_FM<0, 0x25>;
+def XOR64 : ArithLogicR<"xor", GPR64Opnd, 1, II_XOR, xor>, ADD_FM<0, 0x26>;
def NOR64 : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>;
}
/// Shift Instructions
-def DSLL : shift_rotate_imm<"dsll", uimm6, GPR64Opnd, shl, immZExt6>,
+def DSLL : shift_rotate_imm<"dsll", uimm6, GPR64Opnd, II_DSLL, shl, immZExt6>,
SRA_FM<0x38, 0>;
-def DSRL : shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, srl, immZExt6>,
+def DSRL : shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, II_DSRL, srl, immZExt6>,
SRA_FM<0x3a, 0>;
-def DSRA : shift_rotate_imm<"dsra", uimm6, GPR64Opnd, sra, immZExt6>,
+def DSRA : shift_rotate_imm<"dsra", uimm6, GPR64Opnd, II_DSRA, sra, immZExt6>,
SRA_FM<0x3b, 0>;
-def DSLLV : shift_rotate_reg<"dsllv", GPR64Opnd, shl>, SRLV_FM<0x14, 0>;
-def DSRLV : shift_rotate_reg<"dsrlv", GPR64Opnd, srl>, SRLV_FM<0x16, 0>;
-def DSRAV : shift_rotate_reg<"dsrav", GPR64Opnd, sra>, SRLV_FM<0x17, 0>;
-def DSLL32 : shift_rotate_imm<"dsll32", uimm5, GPR64Opnd>, SRA_FM<0x3c, 0>;
-def DSRL32 : shift_rotate_imm<"dsrl32", uimm5, GPR64Opnd>, SRA_FM<0x3e, 0>;
-def DSRA32 : shift_rotate_imm<"dsra32", uimm5, GPR64Opnd>, SRA_FM<0x3f, 0>;
+def DSLLV : shift_rotate_reg<"dsllv", GPR64Opnd, II_DSLLV, shl>,
+ SRLV_FM<0x14, 0>;
+def DSRLV : shift_rotate_reg<"dsrlv", GPR64Opnd, II_DSRLV, srl>,
+ SRLV_FM<0x16, 0>;
+def DSRAV : shift_rotate_reg<"dsrav", GPR64Opnd, II_DSRAV, sra>,
+ SRLV_FM<0x17, 0>;
+def DSLL32 : shift_rotate_imm<"dsll32", uimm5, GPR64Opnd, II_DSLL32>,
+ SRA_FM<0x3c, 0>;
+def DSRL32 : shift_rotate_imm<"dsrl32", uimm5, GPR64Opnd, II_DSRL32>,
+ SRA_FM<0x3e, 0>;
+def DSRA32 : shift_rotate_imm<"dsra32", uimm5, GPR64Opnd, II_DSRA32>,
+ SRA_FM<0x3f, 0>;
// Rotate Instructions
let Predicates = [HasMips64r2, HasStdEnc] in {
- def DROTR : shift_rotate_imm<"drotr", uimm6, GPR64Opnd, rotr, immZExt6>,
- SRA_FM<0x3a, 1>;
- def DROTRV : shift_rotate_reg<"drotrv", GPR64Opnd, rotr>,
+ def DROTR : shift_rotate_imm<"drotr", uimm6, GPR64Opnd, II_DROTR, rotr,
+ immZExt6>, SRA_FM<0x3a, 1>;
+ def DROTRV : shift_rotate_reg<"drotrv", GPR64Opnd, II_DROTRV, rotr>,
SRLV_FM<0x16, 1>;
- def DROTR32 : shift_rotate_imm<"drotr32", uimm5, GPR64Opnd>, SRA_FM<0x3e, 1>;
+ def DROTR32 : shift_rotate_imm<"drotr32", uimm5, GPR64Opnd, II_DROTR32>,
+ SRA_FM<0x3e, 1>;
}
/// Load and Store Instructions
/// aligned
let isCodeGenOnly = 1 in {
-def LB64 : Load<"lb", GPR64Opnd, sextloadi8, IILoad>, LW_FM<0x20>;
-def LBu64 : Load<"lbu", GPR64Opnd, zextloadi8, IILoad>, LW_FM<0x24>;
-def LH64 : Load<"lh", GPR64Opnd, sextloadi16, IILoad>, LW_FM<0x21>;
-def LHu64 : Load<"lhu", GPR64Opnd, zextloadi16, IILoad>, LW_FM<0x25>;
-def LW64 : Load<"lw", GPR64Opnd, sextloadi32, IILoad>, LW_FM<0x23>;
-def SB64 : Store<"sb", GPR64Opnd, truncstorei8, IIStore>, LW_FM<0x28>;
-def SH64 : Store<"sh", GPR64Opnd, truncstorei16, IIStore>, LW_FM<0x29>;
-def SW64 : Store<"sw", GPR64Opnd, truncstorei32, IIStore>, LW_FM<0x2b>;
+def LB64 : Load<"lb", GPR64Opnd, sextloadi8, II_LB>, LW_FM<0x20>;
+def LBu64 : Load<"lbu", GPR64Opnd, zextloadi8, II_LBU>, LW_FM<0x24>;
+def LH64 : Load<"lh", GPR64Opnd, sextloadi16, II_LH>, LW_FM<0x21>;
+def LHu64 : Load<"lhu", GPR64Opnd, zextloadi16, II_LHU>, LW_FM<0x25>;
+def LW64 : Load<"lw", GPR64Opnd, sextloadi32, II_LW>, LW_FM<0x23>;
+def SB64 : Store<"sb", GPR64Opnd, truncstorei8, II_SB>, LW_FM<0x28>;
+def SH64 : Store<"sh", GPR64Opnd, truncstorei16, II_SH>, LW_FM<0x29>;
+def SW64 : Store<"sw", GPR64Opnd, truncstorei32, II_SW>, LW_FM<0x2b>;
}
-def LWu : Load<"lwu", GPR64Opnd, zextloadi32, IILoad>, LW_FM<0x27>;
-def LD : Load<"ld", GPR64Opnd, load, IILoad>, LW_FM<0x37>;
-def SD : Store<"sd", GPR64Opnd, store, IIStore>, LW_FM<0x3f>;
+def LWu : Load<"lwu", GPR64Opnd, zextloadi32, II_LWU>, LW_FM<0x27>;
+def LD : Load<"ld", GPR64Opnd, load, II_LD>, LW_FM<0x37>;
+def SD : Store<"sd", GPR64Opnd, store, II_SD>, LW_FM<0x3f>;
/// load/store left/right
let isCodeGenOnly = 1 in {
-def LWL64 : LoadLeftRight<"lwl", MipsLWL, GPR64Opnd, IILoad>, LW_FM<0x22>;
-def LWR64 : LoadLeftRight<"lwr", MipsLWR, GPR64Opnd, IILoad>, LW_FM<0x26>;
-def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, IIStore>, LW_FM<0x2a>;
-def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, IIStore>, LW_FM<0x2e>;
+def LWL64 : LoadLeftRight<"lwl", MipsLWL, GPR64Opnd, II_LWL>, LW_FM<0x22>;
+def LWR64 : LoadLeftRight<"lwr", MipsLWR, GPR64Opnd, II_LWR>, LW_FM<0x26>;
+def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, II_SWL>, LW_FM<0x2a>;
+def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, II_SWR>, LW_FM<0x2e>;
}
-def LDL : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, IILoad>, LW_FM<0x1a>;
-def LDR : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, IILoad>, LW_FM<0x1b>;
-def SDL : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, IIStore>, LW_FM<0x2c>;
-def SDR : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, IIStore>, LW_FM<0x2d>;
+def LDL : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, II_LDL>, LW_FM<0x1a>;
+def LDR : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, II_LDR>, LW_FM<0x1b>;
+def SDL : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, II_SDL>, LW_FM<0x2c>;
+def SDR : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>;
/// Load-linked, Store-conditional
def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>;
@@ -159,25 +164,26 @@ def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
-def TAILCALL64_R : JumpFR<"tcallr", GPR64Opnd, MipsTailCall>,
- MTLO_FM<8>, IsTailCall;
+def TAILCALL64_R : TailCallReg<GPR64Opnd, JR, GPR32Opnd>;
}
/// Multiply and Divide Instructions.
-def DMULT : Mult<"dmult", IIImult, GPR64Opnd, [HI0_64, LO0_64]>,
+def DMULT : Mult<"dmult", II_DMULT, GPR64Opnd, [HI0_64, LO0_64]>,
MULT_FM<0, 0x1c>;
-def DMULTu : Mult<"dmultu", IIImult, GPR64Opnd, [HI0_64, LO0_64]>,
+def DMULTu : Mult<"dmultu", II_DMULTU, GPR64Opnd, [HI0_64, LO0_64]>,
MULT_FM<0, 0x1d>;
def PseudoDMULT : MultDivPseudo<DMULT, ACC128, GPR64Opnd, MipsMult,
- IIImult>;
+ II_DMULT>;
def PseudoDMULTu : MultDivPseudo<DMULTu, ACC128, GPR64Opnd, MipsMultu,
- IIImult>;
-def DSDIV : Div<"ddiv", IIIdiv, GPR64Opnd, [HI0_64, LO0_64]>, MULT_FM<0, 0x1e>;
-def DUDIV : Div<"ddivu", IIIdiv, GPR64Opnd, [HI0_64, LO0_64]>, MULT_FM<0, 0x1f>;
+ II_DMULTU>;
+def DSDIV : Div<"ddiv", II_DDIV, GPR64Opnd, [HI0_64, LO0_64]>,
+ MULT_FM<0, 0x1e>;
+def DUDIV : Div<"ddivu", II_DDIVU, GPR64Opnd, [HI0_64, LO0_64]>,
+ MULT_FM<0, 0x1f>;
def PseudoDSDIV : MultDivPseudo<DSDIV, ACC128, GPR64Opnd, MipsDivRem,
- IIIdiv, 0, 1, 1>;
+ II_DDIV, 0, 1, 1>;
def PseudoDUDIV : MultDivPseudo<DUDIV, ACC128, GPR64Opnd, MipsDivRemU,
- IIIdiv, 0, 1, 1>;
+ II_DDIVU, 0, 1, 1>;
let isCodeGenOnly = 1 in {
def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>;
@@ -189,8 +195,8 @@ def PseudoMFLO64 : PseudoMFLOHI<GPR64, ACC128, MipsMFLO>;
def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>;
/// Sign Ext In Register Instructions.
-def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd>, SEB_FM<0x10, 0x20>;
-def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd>, SEB_FM<0x18, 0x20>;
+def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>;
+def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>;
}
/// Count Leading
@@ -216,13 +222,76 @@ def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5>, EXT_FM<5>;
let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
- "dsll\t$rd, $rt, 32", [], IIArith>;
+ "dsll\t$rd, $rt, 32", [], II_DSLL>;
def SLL64_32 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR32:$rt),
- "sll\t$rd, $rt, 0", [], IIArith>;
+ "sll\t$rd, $rt, 0", [], II_SLL>;
def SLL64_64 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR64:$rt),
- "sll\t$rd, $rt, 0", [], IIArith>;
+ "sll\t$rd, $rt, 0", [], II_SLL>;
}
+
+// Cavium Octeon cnMIPS instructions
+let Predicates = [HasCnMips] in {
+
+class Count1s<string opstr, RegisterOperand RO>:
+ InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+ [(set RO:$rd, (ctpop RO:$rs))], II_POP, FrmR, opstr> {
+ let TwoOperandAliasConstraint = "$rd = $rs";
+}
+
+class ExtsCins<string opstr, SDPatternOperator Op = null_frag>:
+ InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, uimm5:$pos, uimm5:$lenm1),
+ !strconcat(opstr, " $rt, $rs, $pos, $lenm1"),
+ [(set GPR64Opnd:$rt, (Op GPR64Opnd:$rs, imm:$pos, imm:$lenm1))],
+ NoItinerary, FrmR, opstr> {
+ let TwoOperandAliasConstraint = "$rt = $rs";
+}
+
+class SetCC64_R<string opstr, PatFrag cond_op> :
+ InstSE<(outs GPR64Opnd:$rd), (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+ !strconcat(opstr, "\t$rd, $rs, $rt"),
+ [(set GPR64Opnd:$rd, (cond_op GPR64Opnd:$rs, GPR64Opnd:$rt))],
+ II_SEQ_SNE, FrmR, opstr> {
+ let TwoOperandAliasConstraint = "$rd = $rs";
}
+
+// Unsigned Byte Add
+let Pattern = [(set GPR64Opnd:$rd,
+ (and (add GPR64Opnd:$rs, GPR64Opnd:$rt), 255))] in
+def BADDu : ArithLogicR<"baddu", GPR64Opnd, 1, II_BADDU>,
+ ADD_FM<0x1c, 0x28>;
+
+// Multiply Doubleword to GPR
+let Defs = [HI0, LO0, P0, P1, P2] in
+def DMUL : ArithLogicR<"dmul", GPR64Opnd, 1, II_DMUL, mul>,
+ ADD_FM<0x1c, 0x03>;
+
+// Extract a signed bit field /+32
+def EXTS : ExtsCins<"exts">, EXTS_FM<0x3a>;
+def EXTS32: ExtsCins<"exts32">, EXTS_FM<0x3b>;
+
+// Clear and insert a bit field /+32
+def CINS : ExtsCins<"cins">, EXTS_FM<0x32>;
+def CINS32: ExtsCins<"cins32">, EXTS_FM<0x33>;
+
+// Move to multiplier/product register
+def MTM0 : MoveToLOHI<"mtm0", GPR64Opnd, [MPL0, P0, P1, P2]>, MTMR_FM<0x08>;
+def MTM1 : MoveToLOHI<"mtm1", GPR64Opnd, [MPL1, P0, P1, P2]>, MTMR_FM<0x0c>;
+def MTM2 : MoveToLOHI<"mtm2", GPR64Opnd, [MPL2, P0, P1, P2]>, MTMR_FM<0x0d>;
+def MTP0 : MoveToLOHI<"mtp0", GPR64Opnd, [P0]>, MTMR_FM<0x09>;
+def MTP1 : MoveToLOHI<"mtp1", GPR64Opnd, [P1]>, MTMR_FM<0x0a>;
+def MTP2 : MoveToLOHI<"mtp2", GPR64Opnd, [P2]>, MTMR_FM<0x0b>;
+
+// Count Ones in a Word/Doubleword
+def POP : Count1s<"pop", GPR32Opnd>, POP_FM<0x2c>;
+def DPOP : Count1s<"dpop", GPR64Opnd>, POP_FM<0x2d>;
+
+// Set on equal/not equal
+def SEQ : SetCC64_R<"seq", seteq>, SEQ_FM<0x2a>;
+def SNE : SetCC64_R<"sne", setne>, SEQ_FM<0x2b>;
+}
+
+}
+
//===----------------------------------------------------------------------===//
// Arbitrary patterns that map to one or more instructions
//===----------------------------------------------------------------------===//
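For reference, the semantics implied by the selection patterns in the cnMIPS block above, written out as plain C++ (EXTS/CINS are omitted because their patterns are left as null_frag here). This is an illustrative sketch, not part of the patch.

#include <bitset>
#include <cstdint>

uint64_t baddu(uint64_t rs, uint64_t rt) { return (rs + rt) & 0xff; } // BADDu: unsigned byte add
uint64_t seq(uint64_t rs, uint64_t rt)   { return rs == rt ? 1 : 0; } // SEQ: set on equal
uint64_t sne(uint64_t rs, uint64_t rt)   { return rs != rt ? 1 : 0; } // SNE: set on not equal
uint64_t dpop(uint64_t rs) { return std::bitset<64>(rs).count(); }    // DPOP: ctpop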
@@ -313,6 +382,43 @@ def : InstAlias<"daddu $rs, $rt, $imm",
def : InstAlias<"dadd $rs, $rt, $imm",
(DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
0>;
+def : InstAlias<"daddu $rs, $imm",
+ (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+ 0>;
+def : InstAlias<"dadd $rs, $imm",
+ (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+ 0>;
+def : InstAlias<"add $rs, $imm",
+ (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
+ 0>;
+def : InstAlias<"addu $rs, $imm",
+ (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
+ 0>;
+let isPseudo=1, usesCustomInserter=1, isCodeGenOnly=1 in {
+def SUBi : MipsInst<(outs GPR32Opnd: $rt), (ins GPR32Opnd: $rs, simm16: $imm),
+ "sub\t$rt, $rs, $imm", [], II_DSUB, Pseudo>;
+def SUBiu : MipsInst<(outs GPR32Opnd: $rt), (ins GPR32Opnd: $rs, simm16: $imm),
+ "subu\t$rt, $rs, $imm", [], II_DSUB, Pseudo>;
+def DSUBi : MipsInst<(outs GPR64Opnd: $rt), (ins GPR64Opnd: $rs, simm16_64: $imm),
+ "ssub\t$rt, $rs, $imm", [], II_DSUB, Pseudo>;
+def DSUBiu : MipsInst<(outs GPR64Opnd: $rt), (ins GPR64Opnd: $rs, simm16_64: $imm),
+ "ssubu\t$rt, $rs, $imm", [], II_DSUB, Pseudo>;
+}
+def : InstAlias<"dsubu $rt, $rs, $imm",
+ (DSUBiu GPR64Opnd:$rt, GPR64Opnd:$rs, simm16_64: $imm),
+ 0>;
+def : InstAlias<"sub $rs, $imm",
+ (SUBi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
+ 0>;
+def : InstAlias<"subu $rs, $imm",
+ (SUBiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
+ 0>;
+def : InstAlias<"dsub $rs, $imm",
+ (DSUBi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+ 0>;
+def : InstAlias<"dsubu $rs, $imm",
+ (DSUBiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+ 0>;
/// Move between CPU and coprocessor registers
let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 45c4398..d5df855 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -15,6 +15,7 @@
#define DEBUG_TYPE "mips-asm-printer"
#include "InstPrinter/MipsInstPrinter.h"
#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsMCNaCl.h"
#include "Mips.h"
#include "MipsAsmPrinter.h"
#include "MipsInstrInfo.h"
@@ -27,26 +28,32 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
+#include <string>
using namespace llvm;
MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() {
- return static_cast<MipsTargetStreamer &>(OutStreamer.getTargetStreamer());
+ return static_cast<MipsTargetStreamer &>(*OutStreamer.getTargetStreamer());
}
bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -55,7 +62,23 @@ bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
.Initialize(OutContext, TM);
MipsFI = MF.getInfo<MipsFunctionInfo>();
+ if (Subtarget->inMips16Mode())
+ for (std::map<
+ const char *,
+ const llvm::Mips16HardFloatInfo::FuncSignature *>::const_iterator
+ it = MipsFI->StubsNeeded.begin();
+ it != MipsFI->StubsNeeded.end(); ++it) {
+ const char *Symbol = it->first;
+ const llvm::Mips16HardFloatInfo::FuncSignature *Signature = it->second;
+ if (StubsNeeded.find(Symbol) == StubsNeeded.end())
+ StubsNeeded[Symbol] = Signature;
+ }
MCP = MF.getConstantPool();
+
+ // In NaCl, all indirect jump targets must be aligned to bundle size.
+ if (Subtarget->isTargetNaCl())
+ NaClAlignIndirectJumpTargets(MF);
+
AsmPrinter::runOnMachineFunction(MF);
return true;
}
@@ -129,7 +152,7 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst TmpInst0;
MCInstLowering.Lower(I, TmpInst0);
- OutStreamer.EmitInstruction(TmpInst0);
+ EmitToStreamer(OutStreamer, TmpInst0);
} while ((++I != E) && I->isInsideBundle()); // Delay slot check
}
@@ -170,7 +193,7 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Create a bitmask with all callee saved registers for CPU or Floating Point
// registers. For CPU registers consider RA, GP and FP for saving if necessary.
-void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) {
+void MipsAsmPrinter::printSavedRegsBitmask() {
// CPU and FPU Saved Registers Bitmasks
unsigned CPUBitmask = 0, FPUBitmask = 0;
int CPUTopSavedRegOff, FPUTopSavedRegOff;
@@ -218,20 +241,12 @@ void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) {
// CPU Regs are saved below FP Regs.
CPUTopSavedRegOff = CPUBitmask ? -CSFPRegsSize - CPURegSize : 0;
+ MipsTargetStreamer &TS = getTargetStreamer();
// Print CPUBitmask
- O << "\t.mask \t"; printHex32(CPUBitmask, O);
- O << ',' << CPUTopSavedRegOff << '\n';
+ TS.emitMask(CPUBitmask, CPUTopSavedRegOff);
// Print FPUBitmask
- O << "\t.fmask\t"; printHex32(FPUBitmask, O);
- O << "," << FPUTopSavedRegOff << '\n';
-}
-
-// Print a 32 bit hex number with all numbers.
-void MipsAsmPrinter::printHex32(unsigned Value, raw_ostream &O) {
- O << "0x";
- for (int i = 7; i >= 0; i--)
- O.write_hex((Value & (0xF << (i*4))) >> (i*4));
+ TS.emitFMask(FPUBitmask, FPUTopSavedRegOff);
}
//===----------------------------------------------------------------------===//
@@ -246,11 +261,7 @@ void MipsAsmPrinter::emitFrameDirective() {
unsigned returnReg = RI.getRARegister();
unsigned stackSize = MF->getFrameInfo()->getStackSize();
- if (OutStreamer.hasRawTextSupport())
- OutStreamer.EmitRawText("\t.frame\t$" +
- StringRef(MipsInstPrinter::getRegisterName(stackReg)).lower() +
- "," + Twine(stackSize) + ",$" +
- StringRef(MipsInstPrinter::getRegisterName(returnReg)).lower());
+ getTargetStreamer().emitFrame(stackReg, stackSize, returnReg);
}
/// Emit Set directives.
@@ -265,25 +276,33 @@ const char *MipsAsmPrinter::getCurrentABIString() const {
}
void MipsAsmPrinter::EmitFunctionEntryLabel() {
- if (OutStreamer.hasRawTextSupport()) {
- if (Subtarget->inMips16Mode())
- OutStreamer.EmitRawText(StringRef("\t.set\tmips16"));
- else
- OutStreamer.EmitRawText(StringRef("\t.set\tnomips16"));
- // leave out until FSF available gas has micromips changes
- // OutStreamer.EmitRawText(StringRef("\t.set\tnomicromips"));
- OutStreamer.EmitRawText("\t.ent\t" + Twine(CurrentFnSym->getName()));
- }
+ MipsTargetStreamer &TS = getTargetStreamer();
+
+ // NaCl sandboxing requires that indirect call instructions are masked.
+ // This means that function entry points should be bundle-aligned.
+ if (Subtarget->isTargetNaCl())
+ EmitAlignment(std::max(MF->getAlignment(), MIPS_NACL_BUNDLE_ALIGN));
if (Subtarget->inMicroMipsMode())
- getTargetStreamer().emitMipsHackSTOCG(CurrentFnSym,
- (unsigned)ELF::STO_MIPS_MICROMIPS);
+ TS.emitDirectiveSetMicroMips();
+ // leave out until FSF available gas has micromips changes
+ // else
+ // TS.emitDirectiveSetNoMicroMips();
+
+ if (Subtarget->inMips16Mode())
+ TS.emitDirectiveSetMips16();
+ else
+ TS.emitDirectiveSetNoMips16();
+
+ TS.emitDirectiveEnt(*CurrentFnSym);
OutStreamer.EmitLabel(CurrentFnSym);
}
/// EmitFunctionBodyStart - Targets can override this to emit stuff before
/// the first basic block in the function.
void MipsAsmPrinter::EmitFunctionBodyStart() {
+ MipsTargetStreamer &TS = getTargetStreamer();
+
MCInstLowering.Initialize(&MF->getContext());
bool IsNakedFunction =
@@ -293,34 +312,30 @@ void MipsAsmPrinter::EmitFunctionBodyStart() {
if (!IsNakedFunction)
emitFrameDirective();
- if (OutStreamer.hasRawTextSupport()) {
- SmallString<128> Str;
- raw_svector_ostream OS(Str);
- if (!IsNakedFunction)
- printSavedRegsBitmask(OS);
- OutStreamer.EmitRawText(OS.str());
- if (!Subtarget->inMips16Mode()) {
- OutStreamer.EmitRawText(StringRef("\t.set\tnoreorder"));
- OutStreamer.EmitRawText(StringRef("\t.set\tnomacro"));
- OutStreamer.EmitRawText(StringRef("\t.set\tnoat"));
- }
+ if (!IsNakedFunction)
+ printSavedRegsBitmask();
+
+ if (!Subtarget->inMips16Mode()) {
+ TS.emitDirectiveSetNoReorder();
+ TS.emitDirectiveSetNoMacro();
+ TS.emitDirectiveSetNoAt();
}
}
/// EmitFunctionBodyEnd - Targets can override this to emit stuff after
/// the last basic block in the function.
void MipsAsmPrinter::EmitFunctionBodyEnd() {
+ MipsTargetStreamer &TS = getTargetStreamer();
+
// There are instructions for these macros, but they must
// always be at the function end, and we can't emit and
// break with BB logic.
- if (OutStreamer.hasRawTextSupport()) {
- if (!Subtarget->inMips16Mode()) {
- OutStreamer.EmitRawText(StringRef("\t.set\tat"));
- OutStreamer.EmitRawText(StringRef("\t.set\tmacro"));
- OutStreamer.EmitRawText(StringRef("\t.set\treorder"));
- }
- OutStreamer.EmitRawText("\t.end\t" + Twine(CurrentFnSym->getName()));
+ if (!Subtarget->inMips16Mode()) {
+ TS.emitDirectiveSetAt();
+ TS.emitDirectiveSetMacro();
+ TS.emitDirectiveSetReorder();
}
+ TS.emitDirectiveEnd(CurrentFnSym->getName());
// Make sure to terminate any constant pools that were at the end
// of the function.
if (!InConstantPool)
@@ -495,6 +510,7 @@ bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
raw_ostream &O) {
+ const DataLayout *DL = TM.getDataLayout();
const MachineOperand &MO = MI->getOperand(opNum);
bool closeP = false;
@@ -542,17 +558,8 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
break;
}
- case MachineOperand::MO_ExternalSymbol:
- O << *GetExternalSymbolSymbol(MO.getSymbolName());
- break;
-
- case MachineOperand::MO_JumpTableIndex:
- O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
- << '_' << MO.getIndex();
- break;
-
case MachineOperand::MO_ConstantPoolIndex:
- O << MAI->getPrivateGlobalPrefix() << "CPI"
+ O << DL->getPrivateGlobalPrefix() << "CPI"
<< getFunctionNumber() << "_" << MO.getIndex();
if (MO.getOffset())
O << "+" << MO.getOffset();
@@ -612,86 +619,299 @@ printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
}
void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
- // FIXME: Use SwitchSection.
-
// TODO: Need to add -mabicalls and -mno-abicalls flags.
// Currently we assume that -mabicalls is the default.
- if (OutStreamer.hasRawTextSupport()) {
- OutStreamer.EmitRawText(StringRef("\t.abicalls"));
- Reloc::Model RM = Subtarget->getRelocationModel();
- if (RM == Reloc::Static && !Subtarget->hasMips64())
- OutStreamer.EmitRawText(StringRef("\t.option\tpic0"));
- }
+ getTargetStreamer().emitDirectiveAbiCalls();
+ Reloc::Model RM = Subtarget->getRelocationModel();
+ if (RM == Reloc::Static && !Subtarget->hasMips64())
+ getTargetStreamer().emitDirectiveOptionPic0();
// Tell the assembler which ABI we are using
- if (OutStreamer.hasRawTextSupport())
- OutStreamer.EmitRawText("\t.section .mdebug." +
- Twine(getCurrentABIString()));
+ std::string SectionName = std::string(".mdebug.") + getCurrentABIString();
+ OutStreamer.SwitchSection(OutContext.getELFSection(
+ SectionName, ELF::SHT_PROGBITS, 0, SectionKind::getDataRel()));
// TODO: handle O64 ABI
- if (OutStreamer.hasRawTextSupport()) {
- if (Subtarget->isABI_EABI()) {
- if (Subtarget->isGP32bit())
- OutStreamer.EmitRawText(StringRef("\t.section .gcc_compiled_long32"));
- else
- OutStreamer.EmitRawText(StringRef("\t.section .gcc_compiled_long64"));
- }
- }
- // return to previous section
- if (OutStreamer.hasRawTextSupport())
- OutStreamer.EmitRawText(StringRef("\t.previous"));
+ if (Subtarget->isABI_EABI()) {
+ if (Subtarget->isGP32bit())
+ OutStreamer.SwitchSection(
+ OutContext.getELFSection(".gcc_compiled_long32", ELF::SHT_PROGBITS, 0,
+ SectionKind::getDataRel()));
+ else
+ OutStreamer.SwitchSection(
+ OutContext.getELFSection(".gcc_compiled_long64", ELF::SHT_PROGBITS, 0,
+ SectionKind::getDataRel()));
+ }
+}
+void MipsAsmPrinter::EmitJal(MCSymbol *Symbol) {
+ MCInst I;
+ I.setOpcode(Mips::JAL);
+ I.addOperand(
+ MCOperand::CreateExpr(MCSymbolRefExpr::Create(Symbol, OutContext)));
+ OutStreamer.EmitInstruction(I, getSubtargetInfo());
}
-static void emitELFHeaderFlagsCG(MipsTargetStreamer &TargetStreamer,
- const MipsSubtarget &Subtarget) {
- // Update e_header flags
- unsigned EFlags = 0;
+void MipsAsmPrinter::EmitInstrReg(unsigned Opcode, unsigned Reg) {
+ MCInst I;
+ I.setOpcode(Opcode);
+ I.addOperand(MCOperand::CreateReg(Reg));
+ OutStreamer.EmitInstruction(I, getSubtargetInfo());
+}
- // TODO: Need to add -mabicalls and -mno-abicalls flags.
- // Currently we assume that -mabicalls is the default.
- EFlags |= ELF::EF_MIPS_CPIC;
+void MipsAsmPrinter::EmitInstrRegReg(unsigned Opcode, unsigned Reg1,
+ unsigned Reg2) {
+ MCInst I;
+ //
+ // Because of the current td files for Mips32, the operands for MTC1
+ // appear backwards from their normal assembly order. It's not a trivial
+ // change to fix this in the td file so we adjust for it here.
+ //
+ if (Opcode == Mips::MTC1) {
+ unsigned Temp = Reg1;
+ Reg1 = Reg2;
+ Reg2 = Temp;
+ }
+ I.setOpcode(Opcode);
+ I.addOperand(MCOperand::CreateReg(Reg1));
+ I.addOperand(MCOperand::CreateReg(Reg2));
+ OutStreamer.EmitInstruction(I, getSubtargetInfo());
+}
- if (Subtarget.inMips16Mode())
- EFlags |= ELF::EF_MIPS_ARCH_ASE_M16;
- else
- EFlags |= ELF::EF_MIPS_NOREORDER;
-
- // Architecture
- if (Subtarget.hasMips64r2())
- EFlags |= ELF::EF_MIPS_ARCH_64R2;
- else if (Subtarget.hasMips64())
- EFlags |= ELF::EF_MIPS_ARCH_64;
- else if (Subtarget.hasMips32r2())
- EFlags |= ELF::EF_MIPS_ARCH_32R2;
- else
- EFlags |= ELF::EF_MIPS_ARCH_32;
+void MipsAsmPrinter::EmitInstrRegRegReg(unsigned Opcode, unsigned Reg1,
+ unsigned Reg2, unsigned Reg3) {
+ MCInst I;
+ I.setOpcode(Opcode);
+ I.addOperand(MCOperand::CreateReg(Reg1));
+ I.addOperand(MCOperand::CreateReg(Reg2));
+ I.addOperand(MCOperand::CreateReg(Reg3));
+ OutStreamer.EmitInstruction(I, getSubtargetInfo());
+}
- if (Subtarget.inMicroMipsMode())
- EFlags |= ELF::EF_MIPS_MICROMIPS;
+void MipsAsmPrinter::EmitMovFPIntPair(unsigned MovOpc, unsigned Reg1,
+ unsigned Reg2, unsigned FPReg1,
+ unsigned FPReg2, bool LE) {
+ if (!LE) {
+ unsigned temp = Reg1;
+ Reg1 = Reg2;
+ Reg2 = temp;
+ }
+ EmitInstrRegReg(MovOpc, Reg1, FPReg1);
+ EmitInstrRegReg(MovOpc, Reg2, FPReg2);
+}
- // ABI
- if (Subtarget.isABI_O32())
- EFlags |= ELF::EF_MIPS_ABI_O32;
+void MipsAsmPrinter::EmitSwapFPIntParams(Mips16HardFloatInfo::FPParamVariant PV,
+ bool LE, bool ToFP) {
+ using namespace Mips16HardFloatInfo;
+ unsigned MovOpc = ToFP ? Mips::MTC1 : Mips::MFC1;
+ switch (PV) {
+ case FSig:
+ EmitInstrRegReg(MovOpc, Mips::A0, Mips::F12);
+ break;
+ case FFSig:
+ EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F14, LE);
+ break;
+ case FDSig:
+ EmitInstrRegReg(MovOpc, Mips::A0, Mips::F12);
+ EmitMovFPIntPair(MovOpc, Mips::A2, Mips::A3, Mips::F14, Mips::F15, LE);
+ break;
+ case DSig:
+ EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+ break;
+ case DDSig:
+ EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+ EmitMovFPIntPair(MovOpc, Mips::A2, Mips::A3, Mips::F14, Mips::F15, LE);
+ break;
+ case DFSig:
+ EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+ EmitInstrRegReg(MovOpc, Mips::A2, Mips::F14);
+ break;
+ case NoSig:
+ return;
+ }
+}
- // Relocation Model
- Reloc::Model RM = Subtarget.getRelocationModel();
- if (RM == Reloc::PIC_ || RM == Reloc::Default)
- EFlags |= ELF::EF_MIPS_PIC;
- else if (RM == Reloc::Static)
- ; // Do nothing for Reloc::Static
- else
- llvm_unreachable("Unsupported relocation model for e_flags");
+void
+MipsAsmPrinter::EmitSwapFPIntRetval(Mips16HardFloatInfo::FPReturnVariant RV,
+ bool LE) {
+ using namespace Mips16HardFloatInfo;
+ unsigned MovOpc = Mips::MFC1;
+ switch (RV) {
+ case FRet:
+ EmitInstrRegReg(MovOpc, Mips::V0, Mips::F0);
+ break;
+ case DRet:
+ EmitMovFPIntPair(MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+ break;
+ case CFRet:
+ EmitMovFPIntPair(MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+ break;
+ case CDRet:
+ EmitMovFPIntPair(MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+ EmitMovFPIntPair(MovOpc, Mips::A0, Mips::A1, Mips::F2, Mips::F3, LE);
+ break;
+ case NoFPRet:
+ break;
+ }
+}
- TargetStreamer.emitMipsHackELFFlags(EFlags);
+void MipsAsmPrinter::EmitFPCallStub(
+ const char *Symbol, const Mips16HardFloatInfo::FuncSignature *Signature) {
+ MCSymbol *MSymbol = OutContext.GetOrCreateSymbol(StringRef(Symbol));
+ using namespace Mips16HardFloatInfo;
+ bool LE = Subtarget->isLittle();
+ //
+ // .global xxxx
+ //
+ OutStreamer.EmitSymbolAttribute(MSymbol, MCSA_Global);
+ const char *RetType;
+ //
+  // Make the comment field identify the return and parameter
+  // types of the floating point stub:
+ // # Stub function to call rettype xxxx (params)
+ //
+ switch (Signature->RetSig) {
+ case FRet:
+ RetType = "float";
+ break;
+ case DRet:
+ RetType = "double";
+ break;
+ case CFRet:
+ RetType = "complex";
+ break;
+ case CDRet:
+ RetType = "double complex";
+ break;
+ case NoFPRet:
+ RetType = "";
+ break;
+ }
+ const char *Parms;
+ switch (Signature->ParamSig) {
+ case FSig:
+ Parms = "float";
+ break;
+ case FFSig:
+ Parms = "float, float";
+ break;
+ case FDSig:
+ Parms = "float, double";
+ break;
+ case DSig:
+ Parms = "double";
+ break;
+ case DDSig:
+ Parms = "double, double";
+ break;
+ case DFSig:
+ Parms = "double, float";
+ break;
+ case NoSig:
+ Parms = "";
+ break;
+ }
+ OutStreamer.AddComment("\t# Stub function to call " + Twine(RetType) + " " +
+ Twine(Symbol) + " (" + Twine(Parms) + ")");
+ //
+ // probably not necessary but we save and restore the current section state
+ //
+ OutStreamer.PushSection();
+ //
+ // .section mips16.call.fpxxxx,"ax",@progbits
+ //
+ const MCSectionELF *M = OutContext.getELFSection(
+ ".mips16.call.fp." + std::string(Symbol), ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_EXECINSTR, SectionKind::getText());
+ OutStreamer.SwitchSection(M, 0);
+ //
+ // .align 2
+ //
+ OutStreamer.EmitValueToAlignment(4);
+ MipsTargetStreamer &TS = getTargetStreamer();
+ //
+ // .set nomips16
+ // .set nomicromips
+ //
+ TS.emitDirectiveSetNoMips16();
+ TS.emitDirectiveSetNoMicroMips();
+ //
+ // .ent __call_stub_fp_xxxx
+ // .type __call_stub_fp_xxxx,@function
+ // __call_stub_fp_xxxx:
+ //
+ std::string x = "__call_stub_fp_" + std::string(Symbol);
+ MCSymbol *Stub = OutContext.GetOrCreateSymbol(StringRef(x));
+ TS.emitDirectiveEnt(*Stub);
+ MCSymbol *MType =
+ OutContext.GetOrCreateSymbol("__call_stub_fp_" + Twine(Symbol));
+ OutStreamer.EmitSymbolAttribute(MType, MCSA_ELF_TypeFunction);
+ OutStreamer.EmitLabel(Stub);
+ //
+  // We just handle non-PIC for now; these functions will not be
+  // called otherwise. When the full stub generation is moved here
+  // we will need to deal with PIC.
+ //
+ if (Subtarget->getRelocationModel() == Reloc::PIC_)
+ llvm_unreachable("should not be here if we are compiling pic");
+ TS.emitDirectiveSetReorder();
+ //
+ // We need to add a MipsMCExpr class to MCTargetDesc to fully implement
+ // stubs without raw text but this current patch is for compiler generated
+ // functions and they all return some value.
+ // The calling sequence for non pic is different in that case and we need
+ // to implement %lo and %hi in order to handle the case of no return value
+ // See the corresponding method in Mips16HardFloat for details.
+ //
+ // mov the return address to S2.
+ // we have no stack space to store it and we are about to make another call.
+ // We need to make sure that the enclosing function knows to save S2
+ // This should have already been handled.
+ //
+ // Mov $18, $31
+
+ EmitInstrRegRegReg(Mips::ADDu, Mips::S2, Mips::RA, Mips::ZERO);
+
+ EmitSwapFPIntParams(Signature->ParamSig, LE, true);
+
+ // Jal xxxx
+ //
+ EmitJal(MSymbol);
+
+ // fix return values
+ EmitSwapFPIntRetval(Signature->RetSig, LE);
+ //
+ // do the return
+ // if (Signature->RetSig == NoFPRet)
+ // llvm_unreachable("should not be any stubs here with no return value");
+ // else
+ EmitInstrReg(Mips::JR, Mips::S2);
+
+ MCSymbol *Tmp = OutContext.CreateTempSymbol();
+ OutStreamer.EmitLabel(Tmp);
+ const MCSymbolRefExpr *E = MCSymbolRefExpr::Create(Stub, OutContext);
+ const MCSymbolRefExpr *T = MCSymbolRefExpr::Create(Tmp, OutContext);
+ const MCExpr *T_min_E = MCBinaryExpr::CreateSub(T, E, OutContext);
+ OutStreamer.EmitELFSize(Stub, T_min_E);
+ TS.emitDirectiveEnd(x);
+ OutStreamer.PopSection();
}
void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
- // Emit Mips ELF register info
- Subtarget->getMReginfo().emitMipsReginfoSectionCG(
- OutStreamer, getObjFileLowering(), *Subtarget);
- emitELFHeaderFlagsCG(getTargetStreamer(), *Subtarget);
+ // Emit needed stubs
+ //
+ for (std::map<
+ const char *,
+ const llvm::Mips16HardFloatInfo::FuncSignature *>::const_iterator
+ it = StubsNeeded.begin();
+ it != StubsNeeded.end(); ++it) {
+ const char *Symbol = it->first;
+ const llvm::Mips16HardFloatInfo::FuncSignature *Signature = it->second;
+ EmitFPCallStub(Symbol, Signature);
+ }
+ // return to the text section
+ OutStreamer.SwitchSection(OutContext.getObjectFileInfo()->getTextSection());
}
void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
@@ -699,6 +919,28 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
// TODO: implement
}
+// Align all targets of indirect branches on bundle size. Used only if target
+// is NaCl.
+void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) {
+ // Align all blocks that are jumped to through jump table.
+ if (MachineJumpTableInfo *JtInfo = MF.getJumpTableInfo()) {
+ const std::vector<MachineJumpTableEntry> &JT = JtInfo->getJumpTables();
+ for (unsigned I = 0; I < JT.size(); ++I) {
+ const std::vector<MachineBasicBlock*> &MBBs = JT[I].MBBs;
+
+ for (unsigned J = 0; J < MBBs.size(); ++J)
+ MBBs[J]->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+ }
+ }
+
+ // If basic block address is taken, block can be target of indirect branch.
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ MBB != E; ++MBB) {
+ if (MBB->hasAddressTaken())
+ MBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+ }
+}
+
// Force static initialization.
extern "C" void LLVMInitializeMipsAsmPrinter() {
RegisterAsmPrinter<MipsAsmPrinter> X(TheMipsTarget);
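Note: the tail of EmitFPCallStub sizes the stub by dropping a temporary label after the final jr and emitting the difference end - start as the ELF size of the stub symbol. A minimal sketch of that MC idiom, assuming the LLVM 3.5-era MC API (the helper name below is illustrative and not part of the patch):

#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
using namespace llvm;

// Emit ".size Start, <here> - Start" for a symbol whose body has just been
// streamed, mirroring the end of MipsAsmPrinter::EmitFPCallStub.
static void emitSymbolSize(MCStreamer &OS, MCContext &Ctx, MCSymbol *Start) {
  MCSymbol *End = Ctx.CreateTempSymbol();   // label the current position
  OS.EmitLabel(End);
  const MCExpr *StartRef = MCSymbolRefExpr::Create(Start, Ctx);
  const MCExpr *EndRef = MCSymbolRefExpr::Create(End, Ctx);
  OS.EmitELFSize(Start, MCBinaryExpr::CreateSub(EndRef, StartRef, Ctx));
}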
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index 11c6acd..3e9093e 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -14,6 +14,7 @@
#ifndef MIPSASMPRINTER_H
#define MIPSASMPRINTER_H
+#include "Mips16HardFloatInfo.h"
#include "MipsMCInstLower.h"
#include "MipsMachineFunction.h"
#include "MipsSubtarget.h"
@@ -50,7 +51,29 @@ private:
/// pool entries so we can properly mark them as data regions.
bool InConstantPool;
- bool UsingConstantPools;
+ std::map<const char *, const llvm::Mips16HardFloatInfo::FuncSignature *>
+ StubsNeeded;
+
+ void EmitJal(MCSymbol *Symbol);
+
+ void EmitInstrReg(unsigned Opcode, unsigned Reg);
+
+ void EmitInstrRegReg(unsigned Opcode, unsigned Reg1, unsigned Reg2);
+
+ void EmitInstrRegRegReg(unsigned Opcode, unsigned Reg1, unsigned Reg2,
+ unsigned Reg3);
+
+ void EmitMovFPIntPair(unsigned MovOpc, unsigned Reg1, unsigned Reg2,
+ unsigned FPReg1, unsigned FPReg2, bool LE);
+
+ void EmitSwapFPIntParams(Mips16HardFloatInfo::FPParamVariant, bool LE,
+ bool ToFP);
+
+ void EmitSwapFPIntRetval(Mips16HardFloatInfo::FPReturnVariant, bool LE);
+
+ void EmitFPCallStub(const char *, const Mips16HardFloatInfo::FuncSignature *);
+
+ void NaClAlignIndirectJumpTargets(MachineFunction &MF);
public:
@@ -62,8 +85,6 @@ public:
: AsmPrinter(TM, Streamer), MCP(0), InConstantPool(false),
MCInstLowering(*this) {
Subtarget = &TM.getSubtarget<MipsSubtarget>();
- UsingConstantPools =
- (Subtarget->inMips16Mode() && Subtarget->useConstantIslands());
}
virtual const char *getPassName() const {
@@ -72,15 +93,16 @@ public:
virtual bool runOnMachineFunction(MachineFunction &MF);
- virtual void EmitConstantPool() LLVM_OVERRIDE {
+ virtual void EmitConstantPool() override {
+ bool UsingConstantPools =
+ (Subtarget->inMips16Mode() && Subtarget->useConstantIslands());
if (!UsingConstantPools)
AsmPrinter::EmitConstantPool();
    // we emit constant pools in a custom way!
}
void EmitInstruction(const MachineInstr *MI);
- void printSavedRegsBitmask(raw_ostream &O);
- void printHex32(unsigned int Value, raw_ostream &O);
+ void printSavedRegsBitmask();
void emitFrameDirective();
const char *getCurrentABIString() const;
virtual void EmitFunctionEntryLabel();
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 66391cb..615310f 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -192,8 +192,15 @@ def CC_Mips_FastCC : CallingConv<[
// Integer arguments are passed in integer registers. All scratch registers,
// except for AT, V0 and T9, are available to be used as argument registers.
- CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6,
- T7, T8, V1]>>,
+ CCIfType<[i32], CCIfSubtarget<"isNotTargetNaCl()",
+ CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, V1]>>>,
+
+ // In NaCl, T6, T7 and T8 are reserved and not available as argument
+ // registers for fastcc. T6 contains the mask for sandboxing control flow
+ // (indirect jumps and calls). T7 contains the mask for sandboxing memory
+ // accesses (loads and stores). T8 contains the thread pointer.
+ CCIfType<[i32], CCIfSubtarget<"isTargetNaCl()",
+ CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, V1]>>>,
// f32 arguments are passed in single-precision floating pointer registers.
CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10,
@@ -246,4 +253,6 @@ def CSR_N64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 24), RA_64, FP_64,
GP_64, (sequence "S%u_64", 7, 0))>;
def CSR_Mips16RetHelper :
- CalleeSavedRegs<(add V0, V1, (sequence "A%u", 3, 0), S0, S1)>;
+ CalleeSavedRegs<(add V0, V1, FP,
+ (sequence "A%u", 3, 0), (sequence "S%u", 7, 0),
+ (sequence "D%u", 15, 10))>;
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
index ca4163d..ea49086 100644
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -24,8 +24,8 @@
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/Passes.h"
@@ -112,13 +112,11 @@ private:
unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
unsigned getMemEncoding(const MachineInstr &MI, unsigned OpNo) const;
unsigned getMemEncodingMMImm12(const MachineInstr &MI, unsigned OpNo) const;
+ unsigned getMSAMemEncoding(const MachineInstr &MI, unsigned OpNo) const;
unsigned getSizeExtEncoding(const MachineInstr &MI, unsigned OpNo) const;
unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const;
unsigned getLSAImmEncoding(const MachineInstr &MI, unsigned OpNo) const;
- void emitGlobalAddressUnaligned(const GlobalValue *GV, unsigned Reloc,
- int Offset) const;
-
/// Expand pseudo instructions with accumulator register operands.
void expandACCInstr(MachineBasicBlock::instr_iterator MI,
MachineBasicBlock &MBB, unsigned Opc) const;
@@ -224,6 +222,12 @@ unsigned MipsCodeEmitter::getMemEncodingMMImm12(const MachineInstr &MI,
return 0;
}
+unsigned MipsCodeEmitter::getMSAMemEncoding(const MachineInstr &MI,
+ unsigned OpNo) const {
+ llvm_unreachable("Unimplemented function.");
+ return 0;
+}
+
unsigned MipsCodeEmitter::getSizeExtEncoding(const MachineInstr &MI,
unsigned OpNo) const {
// size is encoded as size-1.
@@ -273,14 +277,6 @@ void MipsCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
MayNeedFarStub));
}
-void MipsCodeEmitter::emitGlobalAddressUnaligned(const GlobalValue *GV,
- unsigned Reloc, int Offset) const {
- MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
- const_cast<GlobalValue *>(GV), 0, false));
- MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset() + Offset,
- Reloc, const_cast<GlobalValue *>(GV), 0, false));
-}
-
void MipsCodeEmitter::
emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td
index 2de1430..567eef9 100644
--- a/lib/Target/Mips/MipsCondMov.td
+++ b/lib/Target/Mips/MipsCondMov.td
@@ -27,7 +27,7 @@ class CMov_I_I_FT<string opstr, RegisterOperand CRC, RegisterOperand DRC,
class CMov_I_F_FT<string opstr, RegisterOperand CRC, RegisterOperand DRC,
InstrItinClass Itin> :
InstSE<(outs DRC:$fd), (ins DRC:$fs, CRC:$rt, DRC:$F),
- !strconcat(opstr, "\t$fd, $fs, $rt"), [], Itin, FrmFR> {
+ !strconcat(opstr, "\t$fd, $fs, $rt"), [], Itin, FrmFR, opstr> {
let Constraints = "$F = $fd";
}
@@ -47,7 +47,7 @@ class CMov_F_F_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
InstSE<(outs RC:$fd), (ins RC:$fs, FCCRegsOpnd:$fcc, RC:$F),
!strconcat(opstr, "\t$fd, $fs, $fcc"),
[(set RC:$fd, (OpNode RC:$fs, FCCRegsOpnd:$fcc, RC:$F))],
- Itin, FrmFR> {
+ Itin, FrmFR, opstr> {
let Constraints = "$F = $fd";
}
@@ -103,94 +103,94 @@ multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
}
// Instantiation of instructions.
-def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, IIArith>,
+def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, II_MOVZ>,
ADD_FM<0, 0xa>;
let Predicates = [HasStdEnc], isCodeGenOnly = 1 in {
- def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, IIArith>,
+ def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>,
ADD_FM<0, 0xa>;
- def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, IIArith>,
+ def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>,
ADD_FM<0, 0xa>;
- def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, IIArith>,
+ def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>,
ADD_FM<0, 0xa>;
}
-def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, IIArith>,
+def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>,
ADD_FM<0, 0xb>;
let Predicates = [HasStdEnc], isCodeGenOnly = 1 in {
- def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, IIArith>,
+ def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>,
ADD_FM<0, 0xb>;
- def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, IIArith>,
+ def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>,
ADD_FM<0, 0xb>;
- def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, IIArith>,
+ def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>,
ADD_FM<0, 0xb>;
}
-def MOVZ_I_S : CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, IIFmove>,
+def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>,
CMov_I_F_FM<18, 16>;
let isCodeGenOnly = 1 in
-def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, IIFmove>,
+def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>,
CMov_I_F_FM<18, 16>, Requires<[HasMips64, HasStdEnc]>;
-def MOVN_I_S : CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, IIFmove>,
+def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>,
CMov_I_F_FM<19, 16>;
let isCodeGenOnly = 1 in
-def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, IIFmove>,
+def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>,
CMov_I_F_FM<19, 16>, Requires<[HasMips64, HasStdEnc]>;
let Predicates = [NotFP64bit, HasStdEnc] in {
- def MOVZ_I_D32 : CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd, IIFmove>,
- CMov_I_F_FM<18, 17>;
- def MOVN_I_D32 : CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd, IIFmove>,
- CMov_I_F_FM<19, 17>;
+ def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVZ_D>, CMov_I_F_FM<18, 17>;
+ def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+ II_MOVN_D>, CMov_I_F_FM<19, 17>;
}
let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
- def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, IIFmove>,
+ def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>,
CMov_I_F_FM<18, 17>;
- def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, IIFmove>,
+ def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>,
CMov_I_F_FM<19, 17>;
let isCodeGenOnly = 1 in {
def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd,
- IIFmove>, CMov_I_F_FM<18, 17>;
+ II_MOVZ_D>, CMov_I_F_FM<18, 17>;
def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd,
- IIFmove>, CMov_I_F_FM<19, 17>;
+ II_MOVN_D>, CMov_I_F_FM<19, 17>;
}
}
-def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, IIArith, MipsCMovFP_T>,
+def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
CMov_F_I_FM<1>;
let isCodeGenOnly = 1 in
-def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, IIArith, MipsCMovFP_T>,
+def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>,
CMov_F_I_FM<1>, Requires<[HasMips64, HasStdEnc]>;
-def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, IIArith, MipsCMovFP_F>,
+def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
CMov_F_I_FM<0>;
let isCodeGenOnly = 1 in
-def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, IIArith, MipsCMovFP_F>,
+def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>,
CMov_F_I_FM<0>, Requires<[HasMips64, HasStdEnc]>;
-def MOVT_S : CMov_F_F_FT<"movt.s", FGR32Opnd, IIFmove, MipsCMovFP_T>,
+def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>,
CMov_F_F_FM<16, 1>;
-def MOVF_S : CMov_F_F_FT<"movf.s", FGR32Opnd, IIFmove, MipsCMovFP_F>,
+def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>,
CMov_F_F_FM<16, 0>;
let Predicates = [NotFP64bit, HasStdEnc] in {
- def MOVT_D32 : CMov_F_F_FT<"movt.d", AFGR64Opnd, IIFmove, MipsCMovFP_T>,
- CMov_F_F_FM<17, 1>;
- def MOVF_D32 : CMov_F_F_FT<"movf.d", AFGR64Opnd, IIFmove, MipsCMovFP_F>,
- CMov_F_F_FM<17, 0>;
+ def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+ MipsCMovFP_T>, CMov_F_F_FM<17, 1>;
+ def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+ MipsCMovFP_F>, CMov_F_F_FM<17, 0>;
}
let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
- def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, IIFmove, MipsCMovFP_T>,
+ def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>,
CMov_F_F_FM<17, 1>;
- def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, IIFmove, MipsCMovFP_F>,
+ def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>,
CMov_F_F_FM<17, 0>;
}
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index c46bbac..e5642ba 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -17,7 +17,7 @@
//
// The constants can be not just numbers but addresses of functions and labels.
// This can be particularly helpful in static relocation mode for embedded
-// non linux targets.
+// non-linux targets.
//
//
@@ -34,15 +34,15 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Support/Format.h"
#include <algorithm>
using namespace llvm;
@@ -77,6 +77,113 @@ static cl::opt<bool> NoLoadRelaxation(
cl::desc("Don't relax loads to long loads - for testing purposes"),
cl::Hidden);
+static unsigned int branchTargetOperand(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case Mips::Bimm16:
+ case Mips::BimmX16:
+ case Mips::Bteqz16:
+ case Mips::BteqzX16:
+ case Mips::Btnez16:
+ case Mips::BtnezX16:
+ case Mips::JalB16:
+ return 0;
+ case Mips::BeqzRxImm16:
+ case Mips::BeqzRxImmX16:
+ case Mips::BnezRxImm16:
+ case Mips::BnezRxImmX16:
+ return 1;
+ }
+ llvm_unreachable("Unknown branch type");
+}
+
+static bool isUnconditionalBranch(unsigned int Opcode) {
+ switch (Opcode) {
+ default: return false;
+ case Mips::Bimm16:
+ case Mips::BimmX16:
+ case Mips::JalB16:
+ return true;
+ }
+}
+
+static unsigned int longformBranchOpcode(unsigned int Opcode) {
+ switch (Opcode) {
+ case Mips::Bimm16:
+ case Mips::BimmX16:
+ return Mips::BimmX16;
+ case Mips::Bteqz16:
+ case Mips::BteqzX16:
+ return Mips::BteqzX16;
+ case Mips::Btnez16:
+ case Mips::BtnezX16:
+ return Mips::BtnezX16;
+ case Mips::JalB16:
+ return Mips::JalB16;
+ case Mips::BeqzRxImm16:
+ case Mips::BeqzRxImmX16:
+ return Mips::BeqzRxImmX16;
+ case Mips::BnezRxImm16:
+ case Mips::BnezRxImmX16:
+ return Mips::BnezRxImmX16;
+ }
+ llvm_unreachable("Unknown branch type");
+}
+
+//
+// FIXME: need to go through this whole constant islands port, check the math
+// for branch ranges, clean this up, and add helper functions for calculations
+// that are currently repeated in many places.
+// Some of the code needs to be refactored to call this routine.
+//
+static unsigned int branchMaxOffsets(unsigned int Opcode) {
+ unsigned Bits, Scale;
+ switch (Opcode) {
+ case Mips::Bimm16:
+ Bits = 11;
+ Scale = 2;
+ break;
+ case Mips::BimmX16:
+ Bits = 16;
+ Scale = 2;
+ break;
+ case Mips::BeqzRxImm16:
+ Bits = 8;
+ Scale = 2;
+ break;
+ case Mips::BeqzRxImmX16:
+ Bits = 16;
+ Scale = 2;
+ break;
+ case Mips::BnezRxImm16:
+ Bits = 8;
+ Scale = 2;
+ break;
+ case Mips::BnezRxImmX16:
+ Bits = 16;
+ Scale = 2;
+ break;
+ case Mips::Bteqz16:
+ Bits = 8;
+ Scale = 2;
+ break;
+ case Mips::BteqzX16:
+ Bits = 16;
+ Scale = 2;
+ break;
+ case Mips::Btnez16:
+ Bits = 8;
+ Scale = 2;
+ break;
+ case Mips::BtnezX16:
+ Bits = 16;
+ Scale = 2;
+ break;
+ default:
+ llvm_unreachable("Unknown branch type");
+ }
+ unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+ return MaxOffs;
+}
namespace {
@@ -277,16 +384,12 @@ namespace {
unsigned getOffsetOf(MachineInstr *MI) const;
unsigned getUserOffset(CPUser&) const;
void dumpBBs();
- void verify();
bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
unsigned Disp, bool NegativeOK);
bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
const CPUser &U);
- bool isLongFormOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
- const CPUser &U);
-
void computeBlockSize(MachineBasicBlock *MBB);
MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
@@ -320,14 +423,6 @@ namespace {
char MipsConstantIslands::ID = 0;
} // end of anonymous namespace
-
-bool MipsConstantIslands::isLongFormOffsetInRange
- (unsigned UserOffset, unsigned TrialOffset,
- const CPUser &U) {
- return isOffsetInRange(UserOffset, TrialOffset,
- U.getLongFormMaxDisp(), U.NegOk);
-}
-
bool MipsConstantIslands::isOffsetInRange
(unsigned UserOffset, unsigned TrialOffset,
const CPUser &U) {
@@ -509,10 +604,10 @@ static bool BBHasFallthrough(MachineBasicBlock *MBB) {
// Get the next machine basic block in the function.
MachineFunction::iterator MBBI = MBB;
// Can't fall off end of function.
- if (llvm::next(MBBI) == MBB->getParent()->end())
+ if (std::next(MBBI) == MBB->getParent()->end())
return false;
- MachineBasicBlock *NextBB = llvm::next(MBBI);
+ MachineBasicBlock *NextBB = std::next(MBBI);
for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
E = MBB->succ_end(); I != E; ++I)
if (*I == NextBB)
@@ -603,6 +698,55 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
Bits = 16;
Scale = 2;
isCond = false;
+ break;
+ case Mips::BeqzRxImm16:
+ UOpc=Mips::Bimm16;
+ Bits = 8;
+ Scale = 2;
+ isCond = true;
+ break;
+ case Mips::BeqzRxImmX16:
+ UOpc=Mips::Bimm16;
+ Bits = 16;
+ Scale = 2;
+ isCond = true;
+ break;
+ case Mips::BnezRxImm16:
+ UOpc=Mips::Bimm16;
+ Bits = 8;
+ Scale = 2;
+ isCond = true;
+ break;
+ case Mips::BnezRxImmX16:
+ UOpc=Mips::Bimm16;
+ Bits = 16;
+ Scale = 2;
+ isCond = true;
+ break;
+ case Mips::Bteqz16:
+ UOpc=Mips::Bimm16;
+ Bits = 8;
+ Scale = 2;
+ isCond = true;
+ break;
+ case Mips::BteqzX16:
+ UOpc=Mips::Bimm16;
+ Bits = 16;
+ Scale = 2;
+ isCond = true;
+ break;
+ case Mips::Btnez16:
+ UOpc=Mips::Bimm16;
+ Bits = 8;
+ Scale = 2;
+ isCond = true;
+ break;
+ case Mips::BtnezX16:
+ UOpc=Mips::Bimm16;
+ Bits = 16;
+ Scale = 2;
+ isCond = true;
+ break;
}
// Record this immediate branch.
unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
@@ -634,11 +778,11 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
Bits = 8;
Scale = 4;
LongFormOpcode = Mips::LwRxPcTcpX16;
- LongFormBits = 16;
+ LongFormBits = 14;
LongFormScale = 1;
break;
case Mips::LwRxPcTcpX16:
- Bits = 16;
+ Bits = 14;
Scale = 1;
NegOk = true;
break;
@@ -776,7 +920,7 @@ MachineBasicBlock *MipsConstantIslands::splitBlockBeforeInstr
CompareMBBNumbers);
MachineBasicBlock* WaterBB = *IP;
if (WaterBB == OrigBB)
- WaterList.insert(llvm::next(IP), NewBB);
+ WaterList.insert(std::next(IP), NewBB);
else
WaterList.insert(IP, OrigBB);
NewWaterList.insert(OrigBB);
@@ -1062,7 +1206,7 @@ bool MipsConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
return false;
unsigned BestGrowth = ~0u;
- for (water_iterator IP = prior(WaterList.end()), B = WaterList.begin();;
+ for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
--IP) {
MachineBasicBlock* WaterBB = *IP;
// Check if water is in range and is either at a lower address than the
@@ -1121,7 +1265,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
if (isOffsetInRange(UserOffset, CPEOffset, U)) {
DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
<< format(", expected CPE offset %#x\n", CPEOffset));
- NewMBB = llvm::next(MachineFunction::iterator(UserMBB));
+ NewMBB = std::next(MachineFunction::iterator(UserMBB));
// Add an unconditional branch from UserMBB to fallthrough block. Record
// it for branch lengthening; this new branch will not get out of range,
// but if the preceding conditional branch is out of range, the targets
@@ -1174,8 +1318,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
//MachineInstr *LastIT = 0;
for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
Offset < BaseInsertOffset;
- Offset += TII->GetInstSizeInBytes(MI),
- MI = llvm::next(MI)) {
+ Offset += TII->GetInstSizeInBytes(MI), MI = std::next(MI)) {
assert(MI != UserMBB->end() && "Fell off end of block");
if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
CPUser &U = CPUsers[CPUIndex];
@@ -1232,7 +1375,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
NewWaterList.insert(NewIsland);
// The new CPE goes before the following block (NewMBB).
- NewMBB = llvm::next(MachineFunction::iterator(WaterBB));
+ NewMBB = std::next(MachineFunction::iterator(WaterBB));
} else {
// No water found.
@@ -1250,7 +1393,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
// next iteration for constant pools, but in this context, we don't want
// it. Check for this so it will be removed from the WaterList.
// Also remove any entry from NewWaterList.
- MachineBasicBlock *WaterBB = prior(MachineFunction::iterator(NewMBB));
+ MachineBasicBlock *WaterBB = std::prev(MachineFunction::iterator(NewMBB));
IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
if (IP != WaterList.end())
NewWaterList.erase(WaterBB);
@@ -1275,6 +1418,10 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
// Decrement the old entry, and remove it if refcount becomes 0.
decrementCPEReferenceCount(CPI, CPEMI);
+ // No existing clone of this CPE is within range.
+ // We will be generating a new clone. Get a UID for it.
+ unsigned ID = createPICLabelUId();
+
// Now that we have an island to add the CPE to, clone the original CPE and
// add it to the island.
U.HighWaterMark = NewIsland;
@@ -1288,11 +1435,9 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
// Increase the size of the island block to account for the new entry.
BBInfo[NewIsland->getNumber()].Size += Size;
- adjustBBOffsetsAfter(llvm::prior(MachineFunction::iterator(NewIsland)));
+ adjustBBOffsetsAfter(std::prev(MachineFunction::iterator(NewIsland)));
+
- // No existing clone of this CPE is within range.
- // We will be generating a new clone. Get a UID for it.
- unsigned ID = createPICLabelUId();
// Finally, change the CPI in the instruction operand to be ID.
for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
@@ -1380,7 +1525,8 @@ unsigned PCAdj = 4;
/// away to fit in its displacement field.
bool MipsConstantIslands::fixupImmediateBr(ImmBranch &Br) {
MachineInstr *MI = Br.MI;
- MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+ unsigned TargetOperand = branchTargetOperand(MI);
+ MachineBasicBlock *DestBB = MI->getOperand(TargetOperand).getMBB();
// Check to see if the DestBB is already in-range.
if (isBBInRange(MI, DestBB, Br.MaxDisp))
@@ -1399,9 +1545,29 @@ bool
MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
MachineInstr *MI = Br.MI;
MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
// Use BL to implement far jump.
- Br.MaxDisp = ((1 << 16)-1) * 2;
- MI->setDesc(TII->get(Mips::BimmX16));
+ unsigned BimmX16MaxDisp = ((1 << 16)-1) * 2;
+ if (isBBInRange(MI, DestBB, BimmX16MaxDisp)) {
+ Br.MaxDisp = BimmX16MaxDisp;
+ MI->setDesc(TII->get(Mips::BimmX16));
+ }
+ else {
+    // Need to give the math a more careful look here:
+    // this is really a segment address and not
+    // a PC-relative address. FIXME. But I think that
+    // just reducing the bits by 1 as done here is correct.
+    // The basic block we are branching to must be longword aligned.
+    // We know that RA is saved because we always save it right now.
+    // This requirement will be relaxed later, but we also have an alternate
+    // way to implement this (which I will implement) that does not need jal.
+    // We should have a way to back out this alignment restriction later if we
+    // "can", but it is not harmful.
+ //
+ DestBB->setAlignment(2);
+ Br.MaxDisp = ((1<<24)-1) * 2;
+ MI->setDesc(TII->get(Mips::JalB16));
+ }
BBInfo[MBB->getNumber()].Size += 2;
adjustBBOffsetsAfter(MBB);
HasFarJump = true;
@@ -1412,23 +1578,33 @@ MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
return true;
}
+
/// fixupConditionalBr - Fix up a conditional branch whose destination is too
/// far away to fit in its displacement field. It is converted to an inverse
/// conditional branch + an unconditional branch to the destination.
bool
MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
MachineInstr *MI = Br.MI;
- MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+ unsigned TargetOperand = branchTargetOperand(MI);
+ MachineBasicBlock *DestBB = MI->getOperand(TargetOperand).getMBB();
+ unsigned Opcode = MI->getOpcode();
+ unsigned LongFormOpcode = longformBranchOpcode(Opcode);
+ unsigned LongFormMaxOff = branchMaxOffsets(LongFormOpcode);
+
+ // Check to see if the DestBB is already in-range.
+ if (isBBInRange(MI, DestBB, LongFormMaxOff)) {
+ Br.MaxDisp = LongFormMaxOff;
+ MI->setDesc(TII->get(LongFormOpcode));
+ return true;
+ }
// Add an unconditional branch to the destination and invert the branch
// condition to jump over it:
- // blt L1
+ // bteqz L1
// =>
- // bge L2
+ // bnez L2
// b L1
// L2:
- unsigned CCReg = 0; // FIXME
- unsigned CC=0; //FIXME
// If the branch is at the end of its MBB and that has a fall-through block,
// direct the updated conditional branch to the fall-through block. Otherwise,
@@ -1436,29 +1612,34 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
MachineBasicBlock *MBB = MI->getParent();
MachineInstr *BMI = &MBB->back();
bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
-
+ unsigned OppositeBranchOpcode = TII->getOppositeBranchOpc(Opcode);
+
++NumCBrFixed;
if (BMI != MI) {
- if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) &&
- BMI->getOpcode() == Br.UncondBr) {
+ if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
+ isUnconditionalBranch(BMI->getOpcode())) {
// Last MI in the BB is an unconditional branch. Can we simply invert the
// condition and swap destinations:
- // beq L1
+ // beqz L1
// b L2
// =>
- // bne L2
+ // bnez L2
// b L1
- MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+ unsigned BMITargetOperand = branchTargetOperand(BMI);
+ MachineBasicBlock *NewDest =
+ BMI->getOperand(BMITargetOperand).getMBB();
if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
DEBUG(dbgs() << " Invert Bcc condition and swap its destination with "
<< *BMI);
- BMI->getOperand(0).setMBB(DestBB);
- MI->getOperand(0).setMBB(NewDest);
+ MI->setDesc(TII->get(OppositeBranchOpcode));
+ BMI->getOperand(BMITargetOperand).setMBB(DestBB);
+ MI->getOperand(TargetOperand).setMBB(NewDest);
return true;
}
}
}
+
if (NeedSplit) {
splitBlockBeforeInstr(MI);
// No need for the branch to the next block. We're adding an unconditional
@@ -1468,7 +1649,7 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
MBB->back().eraseFromParent();
// BBInfo[SplitBB].Offset is wrong temporarily, fixed below
}
- MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
+ MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber()
<< " also invert condition and change dest. to BB#"
@@ -1476,8 +1657,14 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
// Insert a new conditional branch and a new unconditional branch.
// Also update the ImmBranch as well as adding a new entry for the new branch.
- BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode()))
- .addMBB(NextBB).addImm(CC).addReg(CCReg);
+ if (MI->getNumExplicitOperands() == 2) {
+ BuildMI(MBB, DebugLoc(), TII->get(OppositeBranchOpcode))
+ .addReg(MI->getOperand(0).getReg())
+ .addMBB(NextBB);
+ } else {
+ BuildMI(MBB, DebugLoc(), TII->get(OppositeBranchOpcode))
+ .addMBB(NextBB);
+ }
Br.MI = &MBB->back();
BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
@@ -1496,13 +1683,13 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
void MipsConstantIslands::prescanForConstants() {
unsigned J = 0;
(void)J;
- PrescannedForConstants = true;
for (MachineFunction::iterator B =
MF->begin(), E = MF->end(); B != E; ++B) {
for (MachineBasicBlock::instr_iterator I =
B->instr_begin(), EB = B->instr_end(); I != EB; ++I) {
switch(I->getDesc().getOpcode()) {
case Mips::LwConstant32: {
+ PrescannedForConstants = true;
DEBUG(dbgs() << "constant island constant " << *I << "\n");
J = I->getNumOperands();
DEBUG(dbgs() << "num operands " << J << "\n");
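Note: the branchMaxOffsets helper above encodes the reachable displacement of each MIPS16 branch form as ((1 << (Bits - 1)) - 1) * Scale, i.e. the largest positive value of a Bits-bit signed immediate scaled to bytes. A quick self-contained check of the numbers used in the switch (a sketch, not part of the patch):

#include <cassert>

// Maximum forward displacement, in bytes, of a branch whose immediate has
// Bits signed bits and is scaled by Scale bytes per unit.
static unsigned maxBranchOffset(unsigned Bits, unsigned Scale) {
  return ((1u << (Bits - 1)) - 1) * Scale;
}

int main() {
  assert(maxBranchOffset(8, 2) == 254);     // short conditional forms (Bteqz16, ...)
  assert(maxBranchOffset(11, 2) == 2046);   // Bimm16
  assert(maxBranchOffset(16, 2) == 65534);  // extended (X16) forms
  return 0;
}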
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index ffbd83b..eef9f38 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -13,6 +13,7 @@
#define DEBUG_TYPE "delay-slot-filler"
+#include "MCTargetDesc/MipsMCNaCl.h"
#include "Mips.h"
#include "MipsInstrInfo.h"
#include "MipsTargetMachine.h"
@@ -65,20 +66,6 @@ namespace {
typedef MachineBasicBlock::reverse_iterator ReverseIter;
typedef SmallDenseMap<MachineBasicBlock*, MachineInstr*, 2> BB2BrMap;
- /// \brief A functor comparing edge weight of two blocks.
- struct CmpWeight {
- CmpWeight(const MachineBasicBlock &S,
- const MachineBranchProbabilityInfo &P) : Src(S), Prob(P) {}
-
- bool operator()(const MachineBasicBlock *Dst0,
- const MachineBasicBlock *Dst1) const {
- return Prob.getEdgeWeight(&Src, Dst0) < Prob.getEdgeWeight(&Src, Dst1);
- }
-
- const MachineBasicBlock &Src;
- const MachineBranchProbabilityInfo &Prob;
- };
-
class RegDefsUses {
public:
RegDefsUses(TargetMachine &TM);
@@ -514,8 +501,8 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
// Bundle the NOP to the instruction with the delay slot.
const MipsInstrInfo *TII =
static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
- BuildMI(MBB, llvm::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
- MIBundleBuilder(MBB, I, llvm::next(llvm::next(I)));
+ BuildMI(MBB, std::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
+ MIBundleBuilder(MBB, I, std::next(I, 2));
}
return Changed;
@@ -545,6 +532,18 @@ bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
if (delayHasHazard(*I, RegDU, IM))
continue;
+ if (TM.getSubtarget<MipsSubtarget>().isTargetNaCl()) {
+ // In NaCl, instructions that must be masked are forbidden in delay slots.
+ // We only check for loads, stores and SP changes. Calls, returns and
+ // branches are not checked because non-NaCl targets never put them in
+ // delay slots.
+ unsigned AddrIdx;
+ if ((isBasePlusOffsetMemoryAccess(I->getOpcode(), &AddrIdx)
+ && baseRegNeedsLoadStoreMask(I->getOperand(AddrIdx).getReg()))
+ || I->modifiesRegister(Mips::SP, TM.getRegisterInfo()))
+ continue;
+ }
+
Filler = I;
return true;
}
@@ -565,8 +564,8 @@ bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const {
if (!searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler))
return false;
- MBB.splice(llvm::next(Slot), &MBB, llvm::next(Filler).base());
- MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
+ MBB.splice(std::next(Slot), &MBB, std::next(Filler).base());
+ MIBundleBuilder(MBB, Slot, std::next(Slot, 2));
++UsefulSlots;
return true;
}
@@ -582,11 +581,11 @@ bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
RegDU.setCallerSaved(*Slot);
- if (!searchRange(MBB, llvm::next(Slot), MBB.end(), RegDU, NM, Filler))
+ if (!searchRange(MBB, std::next(Slot), MBB.end(), RegDU, NM, Filler))
return false;
- MBB.splice(llvm::next(Slot), &MBB, Filler);
- MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
+ MBB.splice(std::next(Slot), &MBB, Filler);
+ MIBundleBuilder(MBB, Slot, std::next(Slot, 2));
++UsefulSlots;
return true;
}
@@ -640,8 +639,12 @@ MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
return NULL;
  // Select the successor with the largest edge weight.
- CmpWeight Cmp(B, getAnalysis<MachineBranchProbabilityInfo>());
- MachineBasicBlock *S = *std::max_element(B.succ_begin(), B.succ_end(), Cmp);
+ auto &Prob = getAnalysis<MachineBranchProbabilityInfo>();
+ MachineBasicBlock *S = *std::max_element(B.succ_begin(), B.succ_end(),
+ [&](const MachineBasicBlock *Dst0,
+ const MachineBasicBlock *Dst1) {
+ return Prob.getEdgeWeight(&B, Dst0) < Prob.getEdgeWeight(&B, Dst1);
+ });
return S->isLandingPad() ? NULL : S;
}
@@ -714,6 +717,6 @@ bool Filler::delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
bool Filler::terminateSearch(const MachineInstr &Candidate) const {
return (Candidate.isTerminator() || Candidate.isCall() ||
- Candidate.isLabel() || Candidate.isInlineAsm() ||
+ Candidate.isPosition() || Candidate.isInlineAsm() ||
Candidate.hasUnmodeledSideEffects());
}
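Note: the selectSuccBB change replaces the CmpWeight functor with a lambda handed directly to std::max_element, picking the successor with the largest edge weight. The same pattern, reduced to a self-contained sketch with a toy weight table standing in for MachineBranchProbabilityInfo (illustrative only, not part of the patch):

#include <algorithm>
#include <cassert>
#include <map>
#include <vector>

int main() {
  std::vector<int> Succs = {0, 1, 2};                           // stand-ins for successor blocks
  std::map<int, unsigned> Weight = {{0, 10}, {1, 80}, {2, 5}};  // stand-in edge weights

  // Pick the successor with the largest edge weight, as the new lambda does.
  int Best = *std::max_element(Succs.begin(), Succs.end(),
                               [&](int A, int B) { return Weight[A] < Weight[B]; });
  assert(Best == 1);
  return 0;
}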
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index c417bd5..941aeac 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -13,23 +13,23 @@
#define DEBUG_TYPE "mips-isel"
#include "MipsISelDAGToDAG.h"
-#include "Mips16ISelDAGToDAG.h"
-#include "MipsSEISelDAGToDAG.h"
-#include "Mips.h"
#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "Mips16ISelDAGToDAG.h"
#include "MipsMachineFunction.h"
#include "MipsRegisterInfo.h"
+#include "MipsSEISelDAGToDAG.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
-#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -93,6 +93,12 @@ bool MipsDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
return false;
}
+bool MipsDAGToDAGISel::selectIntAddrMSA(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
bool MipsDAGToDAGISel::selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Offset, SDValue &Alias) {
llvm_unreachable("Unimplemented function.");
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
index a4d9da5..4546182 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -73,6 +73,10 @@ private:
virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
+ /// Match addr+simm10 and addr
+ virtual bool selectIntAddrMSA(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
virtual bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Offset, SDValue &Alias);
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 1e8250c..abf36da 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -197,16 +197,14 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::ILVR: return "MipsISD::ILVR";
case MipsISD::PCKEV: return "MipsISD::PCKEV";
case MipsISD::PCKOD: return "MipsISD::PCKOD";
+ case MipsISD::INSVE: return "MipsISD::INSVE";
default: return NULL;
}
}
-MipsTargetLowering::
-MipsTargetLowering(MipsTargetMachine &TM)
- : TargetLowering(TM, new MipsTargetObjectFile()),
- Subtarget(&TM.getSubtarget<MipsSubtarget>()),
- HasMips64(Subtarget->hasMips64()), IsN64(Subtarget->isABI_N64()),
- IsO32(Subtarget->isABI_O32()) {
+MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
+ : TargetLowering(TM, new MipsTargetObjectFile()),
+ Subtarget(&TM.getSubtarget<MipsSubtarget>()) {
// Mips does not have i1 type, so use i32 for
// setcc operations results (slt, sgt, ...).
setBooleanContents(ZeroOrOneBooleanContent);
@@ -252,7 +250,7 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::FABS, MVT::f64, Custom);
}
- if (HasMips64) {
+ if (hasMips64()) {
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
@@ -264,14 +262,14 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
}
- if (!HasMips64) {
+ if (!hasMips64()) {
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
}
setOperationAction(ISD::ADD, MVT::i32, Custom);
- if (HasMips64)
+ if (hasMips64())
setOperationAction(ISD::ADD, MVT::i64, Custom);
setOperationAction(ISD::SDIV, MVT::i32, Expand);
@@ -294,8 +292,13 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- setOperationAction(ISD::CTPOP, MVT::i32, Expand);
- setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+ if (Subtarget->hasCnMips()) {
+ setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+ setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+ } else {
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+ }
setOperationAction(ISD::CTTZ, MVT::i32, Expand);
setOperationAction(ISD::CTTZ, MVT::i64, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
@@ -368,7 +371,7 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::BSWAP, MVT::i64, Expand);
}
- if (HasMips64) {
+ if (hasMips64()) {
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom);
@@ -384,14 +387,16 @@ MipsTargetLowering(MipsTargetMachine &TM)
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::ADD);
- setMinFunctionAlignment(HasMips64 ? 3 : 2);
+ setMinFunctionAlignment(hasMips64() ? 3 : 2);
- setStackPointerRegisterToSaveRestore(IsN64 ? Mips::SP_64 : Mips::SP);
+ setStackPointerRegisterToSaveRestore(isN64() ? Mips::SP_64 : Mips::SP);
- setExceptionPointerRegister(IsN64 ? Mips::A0_64 : Mips::A0);
- setExceptionSelectorRegister(IsN64 ? Mips::A1_64 : Mips::A1);
+ setExceptionPointerRegister(isN64() ? Mips::A0_64 : Mips::A0);
+ setExceptionSelectorRegister(isN64() ? Mips::A1_64 : Mips::A1);
MaxStoresPerMemcpy = 16;
+
+ isMicroMips = Subtarget->inMicroMipsMode();
}
const MipsTargetLowering *MipsTargetLowering::create(MipsTargetMachine &TM) {
@@ -535,19 +540,65 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (!FalseTy.isInteger())
return SDValue();
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(False);
+ ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(False);
- if (!CN || CN->getZExtValue())
+ // If the RHS (False) is 0, we swap the order of the operands
+ // of ISD::SELECT (obviously also inverting the condition) so that we can
+ // take advantage of conditional moves using the $0 register.
+ // Example:
+ // return (a != 0) ? x : 0;
+ // load $reg, x
+ // movz $reg, $0, a
+ if (!FalseC)
return SDValue();
const SDLoc DL(N);
- ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+
+ if (!FalseC->getZExtValue()) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+ SDValue True = N->getOperand(1);
+
+ SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
+ SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));
+
+ return DAG.getNode(ISD::SELECT, DL, FalseTy, SetCC, False, True);
+ }
+
+ // If both operands are integer constants there's a possibility that we
+ // can do some interesting optimizations.
SDValue True = N->getOperand(1);
+ ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(True);
- SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
- SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));
+ if (!TrueC || !True.getValueType().isInteger())
+ return SDValue();
+
+  // We'll also ignore MVT::i64 operands as this optimization proves
+  // to be ineffective because of the required sign extensions, as the result
+  // of a SETCC operator is always MVT::i32 for non-vector types.
+ if (True.getValueType() == MVT::i64)
+ return SDValue();
+
+ int64_t Diff = TrueC->getSExtValue() - FalseC->getSExtValue();
+
+ // 1) (a < x) ? y : y-1
+ // slti $reg1, a, x
+ // addiu $reg2, $reg1, y-1
+ if (Diff == 1)
+ return DAG.getNode(ISD::ADD, DL, SetCC.getValueType(), SetCC, False);
+
+ // 2) (a < x) ? y-1 : y
+ // slti $reg1, a, x
+ // xor $reg1, $reg1, 1
+ // addiu $reg2, $reg1, y-1
+ if (Diff == -1) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+ SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
+ SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));
+ return DAG.getNode(ISD::ADD, DL, SetCC.getValueType(), SetCC, True);
+ }
- return DAG.getNode(ISD::SELECT, DL, FalseTy, SetCC, False, True);
+ // Couldn't optimize.
+ return SDValue();
}
static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
@@ -770,7 +821,7 @@ static MachineBasicBlock *expandPseudoDIV(MachineInstr *MI,
MachineBasicBlock::iterator I(MI);
MachineInstrBuilder MIB;
MachineOperand &Divisor = MI->getOperand(2);
- MIB = BuildMI(MBB, llvm::next(I), MI->getDebugLoc(), TII.get(Mips::TEQ))
+ MIB = BuildMI(MBB, std::next(I), MI->getDebugLoc(), TII.get(Mips::TEQ))
.addReg(Divisor.getReg(), getKillRegState(Divisor.isKill()))
.addReg(Mips::ZERO).addImm(7);
@@ -885,8 +936,8 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
unsigned LL, SC, AND, NOR, ZERO, BEQ;
if (Size == 4) {
- LL = Mips::LL;
- SC = Mips::SC;
+ LL = isMicroMips ? Mips::LL_MM : Mips::LL;
+ SC = isMicroMips ? Mips::SC_MM : Mips::SC;
AND = Mips::AND;
NOR = Mips::NOR;
ZERO = Mips::ZERO;
@@ -920,7 +971,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
// thisMBB:
@@ -1006,7 +1057,7 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(loopMBB);
@@ -1128,8 +1179,8 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
unsigned LL, SC, ZERO, BNE, BEQ;
if (Size == 4) {
- LL = Mips::LL;
- SC = Mips::SC;
+ LL = isMicroMips ? Mips::LL_MM : Mips::LL;
+ SC = isMicroMips ? Mips::SC_MM : Mips::SC;
ZERO = Mips::ZERO;
BNE = Mips::BNE;
BEQ = Mips::BEQ;
@@ -1161,7 +1212,7 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
// thisMBB:
@@ -1247,7 +1298,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(loop1MBB);
@@ -1365,7 +1416,7 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
0);
Chain = Addr.getValue(1);
- if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || IsN64) {
+ if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || isN64()) {
// For PIC, the sequence is:
// BRIND(load(Jumptable + index) + RelocBase)
// RelocBase can be JumpTable, GOT or some sort of global base.
@@ -1446,7 +1497,7 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = N->getGlobal();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64()) {
const MipsTargetObjectFile &TLOF =
(const MipsTargetObjectFile&)getObjFileLowering();
@@ -1465,15 +1516,15 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
}
if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV)))
- return getAddrLocal(N, Ty, DAG, HasMips64);
+ return getAddrLocal(N, Ty, DAG, isN32() || isN64());
if (LargeGOT)
return getAddrGlobalLargeGOT(N, Ty, DAG, MipsII::MO_GOT_HI16,
MipsII::MO_GOT_LO16, DAG.getEntryNode(),
MachinePointerInfo::getGOT());
- return getAddrGlobal(N, Ty, DAG,
- HasMips64 ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16,
+ return getAddrGlobal(N, Ty, DAG, (isN32() || isN64()) ? MipsII::MO_GOT_DISP
+ : MipsII::MO_GOT16,
DAG.getEntryNode(), MachinePointerInfo::getGOT());
}
@@ -1482,10 +1533,10 @@ SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
EVT Ty = Op.getValueType();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64())
return getAddrNonPIC(N, Ty, DAG);
- return getAddrLocal(N, Ty, DAG, HasMips64);
+ return getAddrLocal(N, Ty, DAG, isN32() || isN64());
}
SDValue MipsTargetLowering::
@@ -1575,10 +1626,10 @@ lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
EVT Ty = Op.getValueType();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64())
return getAddrNonPIC(N, Ty, DAG);
- return getAddrLocal(N, Ty, DAG, HasMips64);
+ return getAddrLocal(N, Ty, DAG, isN32() || isN64());
}
SDValue MipsTargetLowering::
@@ -1596,10 +1647,10 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
EVT Ty = Op.getValueType();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64())
return getAddrNonPIC(N, Ty, DAG);
- return getAddrLocal(N, Ty, DAG, HasMips64);
+ return getAddrLocal(N, Ty, DAG, isN32() || isN64());
}
SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -1790,12 +1841,15 @@ lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL,
- IsN64 ? Mips::FP_64 : Mips::FP, VT);
+ isN64() ? Mips::FP_64 : Mips::FP, VT);
return FrameAddr;
}
SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
// check the depth
assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
"Return address can be determined only for current frame.");
@@ -1803,7 +1857,7 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MVT VT = Op.getSimpleValueType();
- unsigned RA = IsN64 ? Mips::RA_64 : Mips::RA;
+ unsigned RA = isN64() ? Mips::RA_64 : Mips::RA;
MFI->setReturnAddressIsTaken(true);
// Return RA, which contains the return address. Mark it an implicit live-in.
@@ -1825,12 +1879,12 @@ SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
SDLoc DL(Op);
- EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
+ EVT Ty = isN64() ? MVT::i64 : MVT::i32;
// Store stack offset in V1, store jump target in V0. Glue CopyToReg and
// EH_RETURN nodes, so that instructions are emitted back-to-back.
- unsigned OffsetReg = IsN64 ? Mips::V1_64 : Mips::V1;
- unsigned AddrReg = IsN64 ? Mips::V0_64 : Mips::V0;
+ unsigned OffsetReg = isN64() ? Mips::V1_64 : Mips::V1;
+ unsigned AddrReg = isN64() ? Mips::V0_64 : Mips::V0;
Chain = DAG.getCopyToReg(Chain, DL, OffsetReg, Offset, SDValue());
Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1));
return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain,
@@ -2254,8 +2308,8 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
// in PIC mode) allow symbols to be resolved via lazy binding.
// The lazy binding stub requires GP to point to the GOT.
if (IsPICCall && !InternalLinkage) {
- unsigned GPReg = IsN64 ? Mips::GP_64 : Mips::GP;
- EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
+ unsigned GPReg = isN64() ? Mips::GP_64 : Mips::GP;
+ EVT Ty = isN64() ? MVT::i64 : MVT::i32;
RegsToPass.push_back(std::make_pair(GPReg, getGlobalReg(CLI.DAG, Ty)));
}
@@ -2285,7 +2339,7 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
llvm::StringRef Sym = G->getGlobal()->getName();
Function *F = G->getGlobal()->getParent()->getFunction(Sym);
- if (F->hasFnAttribute("__Mips16RetHelper")) {
+ if (F && F->hasFnAttribute("__Mips16RetHelper")) {
Mask = MipsRegisterInfo::getMips16RetHelperMask();
}
}
@@ -2324,7 +2378,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
getTargetMachine(), ArgLocs, *DAG.getContext());
MipsCC::SpecialCallingConvType SpecialCallingConv =
getSpecialCallingConv(Callee);
- MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo,
+ MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo,
SpecialCallingConv);
MipsCCInfo.analyzeCallOperands(Outs, IsVarArg,
@@ -2353,9 +2407,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!IsTailCall)
Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
- SDValue StackPtr = DAG.getCopyFromReg(Chain, DL,
- IsN64 ? Mips::SP_64 : Mips::SP,
- getPointerTy());
+ SDValue StackPtr = DAG.getCopyFromReg(
+ Chain, DL, isN64() ? Mips::SP_64 : Mips::SP, getPointerTy());
// With EABI is it possible to have 16 args on registers.
std::deque< std::pair<unsigned, SDValue> > RegsToPass;
@@ -2442,7 +2495,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
- bool IsPICCall = (IsN64 || IsPIC); // true if calls are translated to jalr $25
+ bool IsPICCall = (isN64() || IsPIC); // true if calls are translated to
+ // jalr $25
bool GlobalOrExternal = false, InternalLinkage = false;
SDValue CalleeLo;
EVT Ty = Callee.getValueType();
@@ -2453,7 +2507,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InternalLinkage = Val->hasInternalLinkage();
if (InternalLinkage)
- Callee = getAddrLocal(G, Ty, DAG, HasMips64);
+ Callee = getAddrLocal(G, Ty, DAG, isN32() || isN64());
else if (LargeGOT)
Callee = getAddrGlobalLargeGOT(G, Ty, DAG, MipsII::MO_CALL_HI16,
MipsII::MO_CALL_LO16, Chain,
@@ -2469,7 +2523,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
- if (!IsN64 && !IsPIC) // !N64 && static
+ if (!isN64() && !IsPIC) // !N64 && static
Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(),
MipsII::MO_NO_FLAG);
else if (LargeGOT)
@@ -2492,7 +2546,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (IsTailCall)
return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, &Ops[0], Ops.size());
- Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, &Ops[0], Ops.size());
+ Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, &Ops[0], Ops.size());
SDValue InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
@@ -2520,7 +2574,7 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
getTargetMachine(), RVLocs, *DAG.getContext());
- MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo);
+ MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo);
MipsCCInfo.analyzeCallResult(Ins, Subtarget->mipsSEUsesSoftFloat(),
CallNode, RetTy);
@@ -2567,7 +2621,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
- MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo);
+ MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo);
Function::const_arg_iterator FuncArg =
DAG.getMachineFunction().getFunction()->arg_begin();
bool UseSoftFloat = Subtarget->mipsSEUsesSoftFloat();
@@ -2629,7 +2683,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
(RegVT == MVT::i64 && ValVT == MVT::f64) ||
(RegVT == MVT::f64 && ValVT == MVT::i64))
ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue);
- else if (IsO32 && RegVT == MVT::i32 && ValVT == MVT::f64) {
+ else if (isO32() && RegVT == MVT::i32 && ValVT == MVT::f64) {
unsigned Reg2 = addLiveIn(DAG.getMachineFunction(),
getNextIntArgReg(ArgReg), RC);
SDValue ArgValue2 = DAG.getCopyFromReg(Chain, DL, Reg2, RegVT);
@@ -2665,8 +2719,8 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
unsigned Reg = MipsFI->getSRetReturnReg();
if (!Reg) {
- Reg = MF.getRegInfo().
- createVirtualRegister(getRegClassFor(IsN64 ? MVT::i64 : MVT::i32));
+ Reg = MF.getRegInfo().createVirtualRegister(
+ getRegClassFor(isN64() ? MVT::i64 : MVT::i32));
MipsFI->setSRetReturnReg(Reg);
}
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[0]);
@@ -2716,7 +2770,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
// CCState - Info about the registers and stack slot.
CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(), RVLocs,
*DAG.getContext());
- MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo);
+ MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo);
// Analyze return values.
MipsCCInfo.analyzeReturn(Outs, Subtarget->mipsSEUsesSoftFloat(),
@@ -2752,7 +2806,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
if (!Reg)
llvm_unreachable("sret virtual register not created in the entry block");
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
- unsigned V0 = IsN64 ? Mips::V0_64 : Mips::V0;
+ unsigned V0 = isN64() ? Mips::V0_64 : Mips::V0;
Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag);
Flag = Chain.getValue(1);
@@ -2973,9 +3027,9 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
return std::make_pair(0U, &Mips::CPU16RegsRegClass);
return std::make_pair(0U, &Mips::GPR32RegClass);
}
- if (VT == MVT::i64 && !HasMips64)
+ if (VT == MVT::i64 && !isGP64bit())
return std::make_pair(0U, &Mips::GPR32RegClass);
- if (VT == MVT::i64 && HasMips64)
+ if (VT == MVT::i64 && isGP64bit())
return std::make_pair(0U, &Mips::GPR64RegClass);
// This will generate an error message
return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
@@ -3162,7 +3216,7 @@ bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
}
unsigned MipsTargetLowering::getJumpTableEncoding() const {
- if (IsN64)
+ if (isN64())
return MachineJumpTableInfo::EK_GPRel64BlockAddress;
return TargetLowering::getJumpTableEncoding();
@@ -3216,7 +3270,7 @@ MipsTargetLowering::MipsCC::SpecialCallingConvType
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
llvm::StringRef Sym = G->getGlobal()->getName();
Function *F = G->getGlobal()->getParent()->getFunction(Sym);
- if (F->hasFnAttribute("__Mips16RetHelper")) {
+ if (F && F->hasFnAttribute("__Mips16RetHelper")) {
SpecialCallingConv = MipsCC::Mips16RetHelperConv;
}
}
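
Most of the MipsISelLowering.cpp hunks above only swap the cached ABI flags (IsN64, IsO32, HasMips64) for the new subtarget-querying accessors; the one behavioral change is the last hunk, which guards the Mips16 helper lookup because a module-level getFunction() lookup returns null when no function with that name exists. A minimal standalone C++ sketch of that guard pattern, using hypothetical stand-in types and names rather than the LLVM classes:

#include <iostream>
#include <string>
#include <unordered_map>

// Stand-in for a symbol table whose lookup can fail, mirroring
// Module::getFunction() returning null for an undefined symbol.
struct Fn { bool HasMips16RetHelperAttr; };

const Fn *lookup(const std::unordered_map<std::string, Fn> &Funcs,
                 const std::string &Sym) {
  auto It = Funcs.find(Sym);
  return It == Funcs.end() ? nullptr : &It->second; // may be null
}

int main() {
  std::unordered_map<std::string, Fn> Funcs = {{"__mips16_ret_sf", {true}}};
  for (const std::string Sym : {"__mips16_ret_sf", "external_sym"}) {
    const Fn *F = lookup(Funcs, Sym);
    // Test the pointer before dereferencing it, as the patched condition
    // does with "F && F->hasFnAttribute(...)".
    bool Special = F && F->HasMips16RetHelperAttr;
    std::cout << Sym << ": " << (Special ? "Mips16RetHelperConv" : "default")
              << '\n';
  }
  return 0;
}
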
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 65f68f0..35dd396 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -15,9 +15,9 @@
#ifndef MipsISELLOWERING_H
#define MipsISELLOWERING_H
+#include "MCTargetDesc/MipsBaseInfo.h"
#include "Mips.h"
#include "MipsSubtarget.h"
-#include "MCTargetDesc/MipsBaseInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"
@@ -184,6 +184,9 @@ namespace llvm {
PCKEV, // Pack even elements
PCKOD, // Pack odd elements
+ // Vector Lane Copy
+ INSVE, // Copy element from one vector to another
+
// Combined (XOR (OR $a, $b), -1)
VNOR,
@@ -209,6 +212,7 @@ namespace llvm {
class MipsFunctionInfo;
class MipsTargetLowering : public TargetLowering {
+ bool isMicroMips;
public:
explicit MipsTargetLowering(MipsTargetMachine &TM);
@@ -254,17 +258,17 @@ namespace llvm {
// computing a local symbol's address:
//
// (add (load (wrapper $gp, %got(sym)), %lo(sym))
- template<class NodeTy>
+ template <class NodeTy>
SDValue getAddrLocal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
- bool HasMips64) const {
+ bool IsN32OrN64) const {
SDLoc DL(N);
- unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
+ unsigned GOTFlag = IsN32OrN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
getTargetNode(N, Ty, DAG, GOTFlag));
SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
MachinePointerInfo::getGOT(), false, false,
false, 0);
- unsigned LoFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
+ unsigned LoFlag = IsN32OrN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty,
getTargetNode(N, Ty, DAG, LoFlag));
return DAG.getNode(ISD::ADD, DL, Ty, Load, Lo);
@@ -428,7 +432,11 @@ namespace llvm {
// Subtarget Info
const MipsSubtarget *Subtarget;
- bool HasMips64, IsN64, IsO32;
+ bool hasMips64() const { return Subtarget->hasMips64(); }
+ bool isGP64bit() const { return Subtarget->isGP64bit(); }
+ bool isO32() const { return Subtarget->isABI_O32(); }
+ bool isN32() const { return Subtarget->isABI_N32(); }
+ bool isN64() const { return Subtarget->isABI_N64(); }
private:
// Create a TargetGlobalAddress node.
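
The header hunks above replace the HasMips64/IsN64/IsO32 data members with inline accessors that consult the MipsSubtarget on every query, and rename getAddrLocal's flag to IsN32OrN64 so that N32 can take the same GOT page/offset path as N64. A reduced C++ sketch of the delegation shape, with simplified stand-in names rather than the real MipsSubtarget interface:

#include <iostream>

// Simplified stand-ins: the real classes are MipsSubtarget and
// MipsTargetLowering; only the delegation shape is illustrated here.
struct Subtarget {
  bool N64, O32, GP64;
  bool isABI_N64() const { return N64; }
  bool isABI_O32() const { return O32; }
  bool isGP64bit() const { return GP64; }
};

class Lowering {
  const Subtarget *ST;
public:
  explicit Lowering(const Subtarget *S) : ST(S) {}
  // Accessors forward to the subtarget instead of caching booleans,
  // so the lowering class cannot drift out of sync with it.
  bool isN64() const { return ST->isABI_N64(); }
  bool isO32() const { return ST->isABI_O32(); }
  bool isGP64bit() const { return ST->isGP64bit(); }
};

int main() {
  Subtarget N64ABI{true, false, true};
  Subtarget O32ABI{false, true, false};
  std::cout << Lowering(&N64ABI).isN64() << ' '
            << Lowering(&O32ABI).isO32() << '\n'; // 1 1
  return 0;
}
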
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 9f7ce9a..4b5a73e 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -93,15 +93,16 @@ class ADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin, bit IsComm,
SDPatternOperator OpNode= null_frag> :
InstSE<(outs RC:$fd), (ins RC:$fs, RC:$ft),
!strconcat(opstr, "\t$fd, $fs, $ft"),
- [(set RC:$fd, (OpNode RC:$fs, RC:$ft))], Itin, FrmFR> {
+ [(set RC:$fd, (OpNode RC:$fs, RC:$ft))], Itin, FrmFR, opstr> {
let isCommutable = IsComm;
}
multiclass ADDS_M<string opstr, InstrItinClass Itin, bit IsComm,
SDPatternOperator OpNode = null_frag> {
- def _D32 : ADDS_FT<opstr, AFGR64Opnd, Itin, IsComm, OpNode>,
+ def _D32 : MMRel, ADDS_FT<opstr, AFGR64Opnd, Itin, IsComm, OpNode>,
Requires<[NotFP64bit, HasStdEnc]>;
- def _D64 : ADDS_FT<opstr, FGR64Opnd, Itin, IsComm, OpNode>,
+ def _D64 : ADDS_FT<opstr, FGR64Opnd, Itin,
+ IsComm, OpNode>,
Requires<[IsFP64bit, HasStdEnc]> {
string DecoderNamespace = "Mips64";
}
@@ -110,12 +111,12 @@ multiclass ADDS_M<string opstr, InstrItinClass Itin, bit IsComm,
class ABSS_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
InstSE<(outs DstRC:$fd), (ins SrcRC:$fs), !strconcat(opstr, "\t$fd, $fs"),
- [(set DstRC:$fd, (OpNode SrcRC:$fs))], Itin, FrmFR>,
+ [(set DstRC:$fd, (OpNode SrcRC:$fs))], Itin, FrmFR, opstr>,
NeverHasSideEffects;
multiclass ABSS_M<string opstr, InstrItinClass Itin,
SDPatternOperator OpNode= null_frag> {
- def _D32 : ABSS_FT<opstr, AFGR64Opnd, AFGR64Opnd, Itin, OpNode>,
+ def _D32 : MMRel, ABSS_FT<opstr, AFGR64Opnd, AFGR64Opnd, Itin, OpNode>,
Requires<[NotFP64bit, HasStdEnc]>;
def _D64 : ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>,
Requires<[IsFP64bit, HasStdEnc]> {
@@ -124,7 +125,7 @@ multiclass ABSS_M<string opstr, InstrItinClass Itin,
}
multiclass ROUND_M<string opstr, InstrItinClass Itin> {
- def _D32 : ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>,
+ def _D32 : MMRel, ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>,
Requires<[NotFP64bit, HasStdEnc]>;
def _D64 : ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>,
Requires<[IsFP64bit, HasStdEnc]> {
@@ -135,17 +136,17 @@ multiclass ROUND_M<string opstr, InstrItinClass Itin> {
class MFC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
InstSE<(outs DstRC:$rt), (ins SrcRC:$fs), !strconcat(opstr, "\t$rt, $fs"),
- [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR>;
+ [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR, opstr>;
class MTC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"),
- [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR>;
+ [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR, opstr>;
class LW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
SDPatternOperator OpNode= null_frag> :
InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
- [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI> {
+ [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI, opstr> {
let DecoderMethod = "DecodeFMem";
let mayLoad = 1;
}
@@ -153,7 +154,7 @@ class LW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
class SW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
SDPatternOperator OpNode= null_frag> :
InstSE<(outs), (ins RC:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
- [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI> {
+ [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI, opstr> {
let DecoderMethod = "DecodeFMem";
let mayStore = 1;
}
@@ -162,20 +163,22 @@ class MADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
SDPatternOperator OpNode = null_frag> :
InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft),
!strconcat(opstr, "\t$fd, $fr, $fs, $ft"),
- [(set RC:$fd, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr))], Itin, FrmFR>;
+ [(set RC:$fd, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr))], Itin,
+ FrmFR, opstr>;
class NMADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
SDPatternOperator OpNode = null_frag> :
InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft),
!strconcat(opstr, "\t$fd, $fr, $fs, $ft"),
[(set RC:$fd, (fsub fpimm0, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr)))],
- Itin, FrmFR>;
+ Itin, FrmFR, opstr>;
class LWXC1_FT<string opstr, RegisterOperand DRC,
InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
InstSE<(outs DRC:$fd), (ins PtrRC:$base, PtrRC:$index),
!strconcat(opstr, "\t$fd, ${index}(${base})"),
- [(set DRC:$fd, (OpNode (add iPTR:$base, iPTR:$index)))], Itin, FrmFI> {
+ [(set DRC:$fd, (OpNode (add iPTR:$base, iPTR:$index)))], Itin,
+ FrmFI, opstr> {
let AddedComplexity = 20;
}
@@ -183,15 +186,17 @@ class SWXC1_FT<string opstr, RegisterOperand DRC,
InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
InstSE<(outs), (ins DRC:$fs, PtrRC:$base, PtrRC:$index),
!strconcat(opstr, "\t$fs, ${index}(${base})"),
- [(OpNode DRC:$fs, (add iPTR:$base, iPTR:$index))], Itin, FrmFI> {
+ [(OpNode DRC:$fs, (add iPTR:$base, iPTR:$index))], Itin,
+ FrmFI, opstr> {
let AddedComplexity = 20;
}
-class BC1F_FT<string opstr, InstrItinClass Itin,
+class BC1F_FT<string opstr, DAGOperand opnd, InstrItinClass Itin,
SDPatternOperator Op = null_frag> :
- InstSE<(outs), (ins FCCRegsOpnd:$fcc, brtarget:$offset),
+ InstSE<(outs), (ins FCCRegsOpnd:$fcc, opnd:$offset),
!strconcat(opstr, "\t$fcc, $offset"),
- [(MipsFPBrcond Op, FCCRegsOpnd:$fcc, bb:$offset)], Itin, FrmFI> {
+ [(MipsFPBrcond Op, FCCRegsOpnd:$fcc, bb:$offset)], Itin,
+ FrmFI, opstr> {
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 1;
@@ -202,129 +207,133 @@ class CEQS_FT<string typestr, RegisterClass RC, InstrItinClass Itin,
SDPatternOperator OpNode = null_frag> :
InstSE<(outs), (ins RC:$fs, RC:$ft, condcode:$cond),
!strconcat("c.$cond.", typestr, "\t$fs, $ft"),
- [(OpNode RC:$fs, RC:$ft, imm:$cond)], Itin, FrmFR> {
+ [(OpNode RC:$fs, RC:$ft, imm:$cond)], Itin, FrmFR,
+ !strconcat("c.$cond.", typestr)> {
let Defs = [FCC0];
let isCodeGenOnly = 1;
}
-class C_COND_FT<string CondStr, string Typestr, RegisterOperand RC> :
+class C_COND_FT<string CondStr, string Typestr, RegisterOperand RC,
+ InstrItinClass itin> :
InstSE<(outs), (ins RC:$fs, RC:$ft),
- !strconcat("c.", CondStr, ".", Typestr, "\t$fs, $ft"), [], IIFcmp,
+ !strconcat("c.", CondStr, ".", Typestr, "\t$fs, $ft"), [], itin,
FrmFR>;
-multiclass C_COND_M<string TypeStr, RegisterOperand RC, bits<5> fmt> {
- def C_F_#NAME : C_COND_FT<"f", TypeStr, RC>, C_COND_FM<fmt, 0>;
- def C_UN_#NAME : C_COND_FT<"un", TypeStr, RC>, C_COND_FM<fmt, 1>;
- def C_EQ_#NAME : C_COND_FT<"eq", TypeStr, RC>, C_COND_FM<fmt, 2>;
- def C_UEQ_#NAME : C_COND_FT<"ueq", TypeStr, RC>, C_COND_FM<fmt, 3>;
- def C_OLT_#NAME : C_COND_FT<"olt", TypeStr, RC>, C_COND_FM<fmt, 4>;
- def C_ULT_#NAME : C_COND_FT<"ult", TypeStr, RC>, C_COND_FM<fmt, 5>;
- def C_OLE_#NAME : C_COND_FT<"ole", TypeStr, RC>, C_COND_FM<fmt, 6>;
- def C_ULE_#NAME : C_COND_FT<"ule", TypeStr, RC>, C_COND_FM<fmt, 7>;
- def C_SF_#NAME : C_COND_FT<"sf", TypeStr, RC>, C_COND_FM<fmt, 8>;
- def C_NGLE_#NAME : C_COND_FT<"ngle", TypeStr, RC>, C_COND_FM<fmt, 9>;
- def C_SEQ_#NAME : C_COND_FT<"seq", TypeStr, RC>, C_COND_FM<fmt, 10>;
- def C_NGL_#NAME : C_COND_FT<"ngl", TypeStr, RC>, C_COND_FM<fmt, 11>;
- def C_LT_#NAME : C_COND_FT<"lt", TypeStr, RC>, C_COND_FM<fmt, 12>;
- def C_NGE_#NAME : C_COND_FT<"nge", TypeStr, RC>, C_COND_FM<fmt, 13>;
- def C_LE_#NAME : C_COND_FT<"le", TypeStr, RC>, C_COND_FM<fmt, 14>;
- def C_NGT_#NAME : C_COND_FT<"ngt", TypeStr, RC>, C_COND_FM<fmt, 15>;
-}
-
-defm S : C_COND_M<"s", FGR32Opnd, 16>;
-defm D32 : C_COND_M<"d", AFGR64Opnd, 17>,
- Requires<[NotFP64bit, HasStdEnc]>;
+multiclass C_COND_M<string TypeStr, RegisterOperand RC, bits<5> fmt,
+ InstrItinClass itin> {
+ def C_F_#NAME : C_COND_FT<"f", TypeStr, RC, itin>, C_COND_FM<fmt, 0>;
+ def C_UN_#NAME : C_COND_FT<"un", TypeStr, RC, itin>, C_COND_FM<fmt, 1>;
+ def C_EQ_#NAME : C_COND_FT<"eq", TypeStr, RC, itin>, C_COND_FM<fmt, 2>;
+ def C_UEQ_#NAME : C_COND_FT<"ueq", TypeStr, RC, itin>, C_COND_FM<fmt, 3>;
+ def C_OLT_#NAME : C_COND_FT<"olt", TypeStr, RC, itin>, C_COND_FM<fmt, 4>;
+ def C_ULT_#NAME : C_COND_FT<"ult", TypeStr, RC, itin>, C_COND_FM<fmt, 5>;
+ def C_OLE_#NAME : C_COND_FT<"ole", TypeStr, RC, itin>, C_COND_FM<fmt, 6>;
+ def C_ULE_#NAME : C_COND_FT<"ule", TypeStr, RC, itin>, C_COND_FM<fmt, 7>;
+ def C_SF_#NAME : C_COND_FT<"sf", TypeStr, RC, itin>, C_COND_FM<fmt, 8>;
+ def C_NGLE_#NAME : C_COND_FT<"ngle", TypeStr, RC, itin>, C_COND_FM<fmt, 9>;
+ def C_SEQ_#NAME : C_COND_FT<"seq", TypeStr, RC, itin>, C_COND_FM<fmt, 10>;
+ def C_NGL_#NAME : C_COND_FT<"ngl", TypeStr, RC, itin>, C_COND_FM<fmt, 11>;
+ def C_LT_#NAME : C_COND_FT<"lt", TypeStr, RC, itin>, C_COND_FM<fmt, 12>;
+ def C_NGE_#NAME : C_COND_FT<"nge", TypeStr, RC, itin>, C_COND_FM<fmt, 13>;
+ def C_LE_#NAME : C_COND_FT<"le", TypeStr, RC, itin>, C_COND_FM<fmt, 14>;
+ def C_NGT_#NAME : C_COND_FT<"ngt", TypeStr, RC, itin>, C_COND_FM<fmt, 15>;
+}
+
+defm S : C_COND_M<"s", FGR32Opnd, 16, II_C_CC_S>;
+defm D32 : C_COND_M<"d", AFGR64Opnd, 17, II_C_CC_D>,
+ Requires<[NotFP64bit, HasStdEnc]>;
let DecoderNamespace = "Mips64" in
-defm D64 : C_COND_M<"d", FGR64Opnd, 17>, Requires<[IsFP64bit, HasStdEnc]>;
+defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>,
+ Requires<[IsFP64bit, HasStdEnc]>;
//===----------------------------------------------------------------------===//
// Floating Point Instructions
//===----------------------------------------------------------------------===//
-def ROUND_W_S : ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
+def ROUND_W_S : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
ABSS_FM<0xc, 16>;
-def TRUNC_W_S : ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
+def TRUNC_W_S : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
ABSS_FM<0xd, 16>;
-def CEIL_W_S : ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
+def CEIL_W_S : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
ABSS_FM<0xe, 16>;
-def FLOOR_W_S : ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
+def FLOOR_W_S : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
ABSS_FM<0xf, 16>;
-def CVT_W_S : ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
+def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
ABSS_FM<0x24, 16>;
-defm ROUND_W : ROUND_M<"round.w.d", IIFcvt>, ABSS_FM<0xc, 17>;
-defm TRUNC_W : ROUND_M<"trunc.w.d", IIFcvt>, ABSS_FM<0xd, 17>;
-defm CEIL_W : ROUND_M<"ceil.w.d", IIFcvt>, ABSS_FM<0xe, 17>;
-defm FLOOR_W : ROUND_M<"floor.w.d", IIFcvt>, ABSS_FM<0xf, 17>;
-defm CVT_W : ROUND_M<"cvt.w.d", IIFcvt>, ABSS_FM<0x24, 17>;
+defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>;
+defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>;
+defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>;
+defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>;
+defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>;
let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
- def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
+ def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>,
ABSS_FM<0x8, 16>;
- def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
+ def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>,
ABSS_FM<0x8, 17>;
- def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
+ def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, II_TRUNC>,
ABSS_FM<0x9, 16>;
- def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
+ def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, II_TRUNC>,
ABSS_FM<0x9, 17>;
- def CEIL_L_S : ABSS_FT<"ceil.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
+ def CEIL_L_S : ABSS_FT<"ceil.l.s", FGR64Opnd, FGR32Opnd, II_CEIL>,
ABSS_FM<0xa, 16>;
- def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
+ def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64Opnd, FGR64Opnd, II_CEIL>,
ABSS_FM<0xa, 17>;
- def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
+ def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64Opnd, FGR32Opnd, II_FLOOR>,
ABSS_FM<0xb, 16>;
- def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
+ def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, II_FLOOR>,
ABSS_FM<0xb, 17>;
}
-def CVT_S_W : ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, IIFcvt>,
+def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
ABSS_FM<0x20, 20>;
-def CVT_L_S : ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
+def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
ABSS_FM<0x25, 16>;
-def CVT_L_D64: ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
+def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
ABSS_FM<0x25, 17>;
let Predicates = [NotFP64bit, HasStdEnc] in {
- def CVT_S_D32 : ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, IIFcvt>,
+ def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
ABSS_FM<0x20, 17>;
- def CVT_D32_W : ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, IIFcvt>,
+ def CVT_D32_W : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
ABSS_FM<0x21, 20>;
- def CVT_D32_S : ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, IIFcvt>,
+ def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
ABSS_FM<0x21, 16>;
}
let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
- def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, IIFcvt>,
+ def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>,
ABSS_FM<0x20, 17>;
- def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, IIFcvt>,
+ def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>,
ABSS_FM<0x20, 21>;
- def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, IIFcvt>,
+ def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>,
ABSS_FM<0x21, 20>;
- def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
+ def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>,
ABSS_FM<0x21, 16>;
- def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, IIFcvt>,
+ def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, II_CVT>,
ABSS_FM<0x21, 21>;
}
let isPseudo = 1, isCodeGenOnly = 1 in {
- def PseudoCVT_S_W : ABSS_FT<"", FGR32Opnd, GPR32Opnd, IIFcvt>;
- def PseudoCVT_D32_W : ABSS_FT<"", AFGR64Opnd, GPR32Opnd, IIFcvt>;
- def PseudoCVT_S_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, IIFcvt>;
- def PseudoCVT_D64_W : ABSS_FT<"", FGR64Opnd, GPR32Opnd, IIFcvt>;
- def PseudoCVT_D64_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, IIFcvt>;
+ def PseudoCVT_S_W : ABSS_FT<"", FGR32Opnd, GPR32Opnd, II_CVT>;
+ def PseudoCVT_D32_W : ABSS_FT<"", AFGR64Opnd, GPR32Opnd, II_CVT>;
+ def PseudoCVT_S_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, II_CVT>;
+ def PseudoCVT_D64_W : ABSS_FT<"", FGR64Opnd, GPR32Opnd, II_CVT>;
+ def PseudoCVT_D64_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, II_CVT>;
}
let Predicates = [NoNaNsFPMath, HasStdEnc] in {
- def FABS_S : ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, IIFcvt, fabs>,
+ def FABS_S : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
ABSS_FM<0x5, 16>;
- def FNEG_S : ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, IIFcvt, fneg>,
+ def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
ABSS_FM<0x7, 16>;
- defm FABS : ABSS_M<"abs.d", IIFcvt, fabs>, ABSS_FM<0x5, 17>;
- defm FNEG : ABSS_M<"neg.d", IIFcvt, fneg>, ABSS_FM<0x7, 17>;
+ defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>;
+ defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>;
}
-def FSQRT_S : ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, IIFsqrtSingle,
- fsqrt>, ABSS_FM<0x4, 16>;
-defm FSQRT : ABSS_M<"sqrt.d", IIFsqrtDouble, fsqrt>, ABSS_FM<0x4, 17>;
+def FSQRT_S : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, fsqrt>,
+ ABSS_FM<0x4, 16>;
+defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>;
// The odd-numbered registers are only referenced when doing loads,
// stores, and moves between floating-point and integer registers.
@@ -332,44 +341,44 @@ defm FSQRT : ABSS_M<"sqrt.d", IIFsqrtDouble, fsqrt>, ABSS_FM<0x4, 17>;
// regardless of register aliasing.
/// Move Control Registers From/To CPU Registers
-def CFC1 : MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, IIFmove>, MFC1_FM<2>;
-def CTC1 : MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, IIFmove>, MFC1_FM<6>;
-def MFC1 : MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, IIFmoveC1, bitconvert>,
- MFC1_FM<0>;
-def MTC1 : MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, IIFmoveC1, bitconvert>,
- MFC1_FM<4>;
-def MFHC1 : MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, IIFmoveC1>,
+def CFC1 : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>, MFC1_FM<2>;
+def CTC1 : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>, MFC1_FM<6>;
+def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1,
+ bitconvert>, MFC1_FM<0>;
+def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
+ bitconvert>, MFC1_FM<4>;
+def MFHC1 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>,
MFC1_FM<3>;
-def MTHC1 : MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, IIFmoveC1>,
+def MTHC1 : MMRel, MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, II_MTHC1>,
MFC1_FM<7>;
-def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, IIFmoveC1,
+def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1,
bitconvert>, MFC1_FM<1>;
-def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, IIFmoveC1,
+def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1,
bitconvert>, MFC1_FM<5>;
-def FMOV_S : ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, IIFmove>,
+def FMOV_S : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
ABSS_FM<0x6, 16>;
-def FMOV_D32 : ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, IIFmove>,
+def FMOV_D32 : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
ABSS_FM<0x6, 17>, Requires<[NotFP64bit, HasStdEnc]>;
-def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, IIFmove>,
+def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>,
ABSS_FM<0x6, 17>, Requires<[IsFP64bit, HasStdEnc]> {
let DecoderNamespace = "Mips64";
}
/// Floating Point Memory Instructions
let Predicates = [HasStdEnc] in {
- def LWC1 : LW_FT<"lwc1", FGR32Opnd, IIFLoad, load>, LW_FM<0x31>;
- def SWC1 : SW_FT<"swc1", FGR32Opnd, IIFStore, store>, LW_FM<0x39>;
+ def LWC1 : MMRel, LW_FT<"lwc1", FGR32Opnd, II_LWC1, load>, LW_FM<0x31>;
+ def SWC1 : MMRel, SW_FT<"swc1", FGR32Opnd, II_SWC1, store>, LW_FM<0x39>;
}
let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
- def LDC164 : LW_FT<"ldc1", FGR64Opnd, IIFLoad, load>, LW_FM<0x35>;
- def SDC164 : SW_FT<"sdc1", FGR64Opnd, IIFStore, store>, LW_FM<0x3d>;
+ def LDC164 : LW_FT<"ldc1", FGR64Opnd, II_LDC1, load>, LW_FM<0x35>;
+ def SDC164 : SW_FT<"sdc1", FGR64Opnd, II_SDC1, store>, LW_FM<0x3d>;
}
let Predicates = [NotFP64bit, HasStdEnc] in {
- def LDC1 : LW_FT<"ldc1", AFGR64Opnd, IIFLoad, load>, LW_FM<0x35>;
- def SDC1 : SW_FT<"sdc1", AFGR64Opnd, IIFStore, store>, LW_FM<0x3d>;
+ def LDC1 : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM<0x35>;
+ def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM<0x3d>;
}
/// Cop2 Memory Instructions
@@ -381,87 +390,90 @@ let Predicates = [HasStdEnc] in {
}
// Indexed loads and stores.
-let Predicates = [HasFPIdx, HasStdEnc] in {
- def LWXC1 : LWXC1_FT<"lwxc1", FGR32Opnd, IIFLoad, load>, LWXC1_FM<0>;
- def SWXC1 : SWXC1_FT<"swxc1", FGR32Opnd, IIFStore, store>, SWXC1_FM<8>;
+// Base register + offset register addressing mode (indicated by "x" in the
+// instruction mnemonic) is disallowed under NaCl.
+let Predicates = [HasFPIdx, HasStdEnc, IsNotNaCl] in {
+ def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>;
+ def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>;
}
-let Predicates = [HasFPIdx, NotFP64bit, HasStdEnc] in {
- def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, IIFLoad, load>, LWXC1_FM<1>;
- def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, IIFStore, store>, SWXC1_FM<9>;
+let Predicates = [HasFPIdx, NotFP64bit, HasStdEnc, NotInMicroMips,
+ IsNotNaCl] in {
+ def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>;
+ def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>;
}
let Predicates = [HasFPIdx, IsFP64bit, HasStdEnc],
DecoderNamespace="Mips64" in {
- def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, IIFLoad, load>, LWXC1_FM<1>;
- def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, IIFStore, store>, SWXC1_FM<9>;
+ def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>;
+ def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>;
}
// Load/store doubleword indexed unaligned.
-let Predicates = [NotFP64bit, HasStdEnc] in {
- def LUXC1 : LWXC1_FT<"luxc1", AFGR64Opnd, IIFLoad>, LWXC1_FM<0x5>;
- def SUXC1 : SWXC1_FT<"suxc1", AFGR64Opnd, IIFStore>, SWXC1_FM<0xd>;
+let Predicates = [NotFP64bit, HasStdEnc, IsNotNaCl] in {
+ def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>;
+ def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>;
}
let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace="Mips64" in {
- def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, IIFLoad>, LWXC1_FM<0x5>;
- def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, IIFStore>, SWXC1_FM<0xd>;
+ def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>;
+ def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>;
}
/// Floating-point Arithmetic
-def FADD_S : ADDS_FT<"add.s", FGR32Opnd, IIFadd, 1, fadd>,
+def FADD_S : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
ADDS_FM<0x00, 16>;
-defm FADD : ADDS_M<"add.d", IIFadd, 1, fadd>, ADDS_FM<0x00, 17>;
-def FDIV_S : ADDS_FT<"div.s", FGR32Opnd, IIFdivSingle, 0, fdiv>,
+defm FADD : ADDS_M<"add.d", II_ADD_D, 1, fadd>, ADDS_FM<0x00, 17>;
+def FDIV_S : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
ADDS_FM<0x03, 16>;
-defm FDIV : ADDS_M<"div.d", IIFdivDouble, 0, fdiv>, ADDS_FM<0x03, 17>;
-def FMUL_S : ADDS_FT<"mul.s", FGR32Opnd, IIFmulSingle, 1, fmul>,
+defm FDIV : ADDS_M<"div.d", II_DIV_D, 0, fdiv>, ADDS_FM<0x03, 17>;
+def FMUL_S : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
ADDS_FM<0x02, 16>;
-defm FMUL : ADDS_M<"mul.d", IIFmulDouble, 1, fmul>, ADDS_FM<0x02, 17>;
-def FSUB_S : ADDS_FT<"sub.s", FGR32Opnd, IIFadd, 0, fsub>,
+defm FMUL : ADDS_M<"mul.d", II_MUL_D, 1, fmul>, ADDS_FM<0x02, 17>;
+def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
ADDS_FM<0x01, 16>;
-defm FSUB : ADDS_M<"sub.d", IIFadd, 0, fsub>, ADDS_FM<0x01, 17>;
+defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>;
let Predicates = [HasMips32r2, HasStdEnc] in {
- def MADD_S : MADDS_FT<"madd.s", FGR32Opnd, IIFmulSingle, fadd>,
+ def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
MADDS_FM<4, 0>;
- def MSUB_S : MADDS_FT<"msub.s", FGR32Opnd, IIFmulSingle, fsub>,
+ def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
MADDS_FM<5, 0>;
}
let Predicates = [HasMips32r2, NoNaNsFPMath, HasStdEnc] in {
- def NMADD_S : NMADDS_FT<"nmadd.s", FGR32Opnd, IIFmulSingle, fadd>,
+ def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
MADDS_FM<6, 0>;
- def NMSUB_S : NMADDS_FT<"nmsub.s", FGR32Opnd, IIFmulSingle, fsub>,
+ def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
MADDS_FM<7, 0>;
}
let Predicates = [HasMips32r2, NotFP64bit, HasStdEnc] in {
- def MADD_D32 : MADDS_FT<"madd.d", AFGR64Opnd, IIFmulDouble, fadd>,
+ def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
MADDS_FM<4, 1>;
- def MSUB_D32 : MADDS_FT<"msub.d", AFGR64Opnd, IIFmulDouble, fsub>,
+ def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
MADDS_FM<5, 1>;
}
let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath, HasStdEnc] in {
- def NMADD_D32 : NMADDS_FT<"nmadd.d", AFGR64Opnd, IIFmulDouble, fadd>,
+ def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
MADDS_FM<6, 1>;
- def NMSUB_D32 : NMADDS_FT<"nmsub.d", AFGR64Opnd, IIFmulDouble, fsub>,
+ def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
MADDS_FM<7, 1>;
}
let Predicates = [HasMips32r2, IsFP64bit, HasStdEnc], isCodeGenOnly=1 in {
- def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, IIFmulDouble, fadd>,
+ def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
MADDS_FM<4, 1>;
- def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, IIFmulDouble, fsub>,
+ def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
MADDS_FM<5, 1>;
}
let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStdEnc],
isCodeGenOnly=1 in {
- def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, IIFmulDouble, fadd>,
+ def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
MADDS_FM<6, 1>;
- def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, IIFmulDouble, fsub>,
+ def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
MADDS_FM<7, 1>;
}
@@ -473,8 +485,10 @@ let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStdEnc],
def MIPS_BRANCH_F : PatLeaf<(i32 0)>;
def MIPS_BRANCH_T : PatLeaf<(i32 1)>;
-def BC1F : BC1F_FT<"bc1f", IIBranch, MIPS_BRANCH_F>, BC1F_FM<0, 0>;
-def BC1T : BC1F_FT<"bc1t", IIBranch, MIPS_BRANCH_T>, BC1F_FM<0, 1>;
+def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, IIBranch, MIPS_BRANCH_F>,
+ BC1F_FM<0, 0>;
+def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, IIBranch, MIPS_BRANCH_T>,
+ BC1F_FM<0, 1>;
//===----------------------------------------------------------------------===//
// Floating Point Flag Conditions
@@ -499,11 +513,11 @@ def MIPS_FCOND_LE : PatLeaf<(i32 14)>;
def MIPS_FCOND_NGT : PatLeaf<(i32 15)>;
/// Floating Point Compare
-def FCMP_S32 : CEQS_FT<"s", FGR32, IIFcmp, MipsFPCmp>, CEQS_FM<16>;
-def FCMP_D32 : CEQS_FT<"d", AFGR64, IIFcmp, MipsFPCmp>, CEQS_FM<17>,
+def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>;
+def FCMP_D32 : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
Requires<[NotFP64bit, HasStdEnc]>;
let DecoderNamespace = "Mips64" in
-def FCMP_D64 : CEQS_FT<"d", FGR64, IIFcmp, MipsFPCmp>, CEQS_FM<17>,
+def FCMP_D64 : CEQS_FT<"d", FGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
Requires<[IsFP64bit, HasStdEnc]>;
//===----------------------------------------------------------------------===//
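
Most of the MipsInstrFPU.td changes above thread explicit per-instruction itineraries (II_*) and MMRel/opstr mappings through the FPU classes so microMIPS relatives can be looked up, and C_COND_M now takes its itinerary as a parameter as well. For reference, the sixteen condition mnemonics it instantiates map to cond field values 0 through 15 in the order listed; a small standalone printout of that table (the program is illustrative only, not LLVM code):

#include <iostream>

// cond field values for c.<cond>.<fmt>, in the order C_COND_M defines them
// above (fmt 16 selects the .s variants, fmt 17 the .d variants).
static const char *const CondNames[16] = {
    "f",  "un",   "eq",  "ueq", "olt", "ult", "ole", "ule",
    "sf", "ngle", "seq", "ngl", "lt",  "nge", "le",  "ngt"};

int main() {
  for (unsigned Cond = 0; Cond < 16; ++Cond)
    std::cout << "c." << CondNames[Cond] << ".s\tcond = " << Cond << '\n';
  return 0;
}
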
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 737a018..e4405ab 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -375,7 +375,7 @@ class LUI_FM : StdArch {
let Inst{15-0} = imm16;
}
-class JALR_FM : StdArch {
+class JALR_FM {
bits<5> rd;
bits<5> rs;
@@ -401,7 +401,7 @@ class BGEZAL_FM<bits<5> funct> : StdArch {
let Inst{15-0} = offset;
}
-class SYNC_FM {
+class SYNC_FM : StdArch {
bits<5> stype;
bits<32> Inst;
@@ -479,11 +479,77 @@ class TEQI_FM<bits<5> funct> : StdArch {
let Inst{20-16} = funct;
let Inst{15-0} = imm16;
}
+
+class WAIT_FM : StdArch {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10;
+ let Inst{25} = 1;
+ let Inst{24-6} = 0;
+ let Inst{5-0} = 0x20;
+}
+
+class EXTS_FM<bits<6> funct> : StdArch {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> pos;
+ bits<5> lenm1;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1c;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = lenm1;
+ let Inst{10-6} = pos;
+ let Inst{5-0} = funct;
+}
+
+class MTMR_FM<bits<6> funct> : StdArch {
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1c;
+ let Inst{25-21} = rs;
+ let Inst{20-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+class POP_FM<bits<6> funct> : StdArch {
+ bits<5> rd;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1c;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+class SEQ_FM<bits<6> funct> : StdArch {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x1c;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = funct;
+}
+
//===----------------------------------------------------------------------===//
// System calls format <op|code_|funct>
//===----------------------------------------------------------------------===//
-class SYS_FM<bits<6> funct>
+class SYS_FM<bits<6> funct> : StdArch
{
bits<20> code_;
bits<32> Inst;
@@ -496,7 +562,7 @@ class SYS_FM<bits<6> funct>
// Break instruction format <op|code_1|funct>
//===----------------------------------------------------------------------===//
-class BRK_FM<bits<6> funct>
+class BRK_FM<bits<6> funct> : StdArch
{
bits<10> code_1;
bits<10> code_2;
@@ -511,7 +577,7 @@ class BRK_FM<bits<6> funct>
// Exception return format <Cop0|1|0|funct>
//===----------------------------------------------------------------------===//
-class ER_FM<bits<6> funct>
+class ER_FM<bits<6> funct> : StdArch
{
bits<32> Inst;
let Inst{31-26} = 0x10;
@@ -525,7 +591,7 @@ class ER_FM<bits<6> funct>
// Enable/disable interrupt instruction format <Cop0|MFMC0|rt|12|0|sc|0|0>
//===----------------------------------------------------------------------===//
-class EI_FM<bits<1> sc>
+class EI_FM<bits<1> sc> : StdArch
{
bits<32> Inst;
bits<5> rt;
@@ -569,7 +635,7 @@ class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>:
let Inst{15-0} = imm16;
}
-class ADDS_FM<bits<6> funct, bits<5> fmt> {
+class ADDS_FM<bits<6> funct, bits<5> fmt> : StdArch {
bits<5> fd;
bits<5> fs;
bits<5> ft;
@@ -584,7 +650,7 @@ class ADDS_FM<bits<6> funct, bits<5> fmt> {
let Inst{5-0} = funct;
}
-class ABSS_FM<bits<6> funct, bits<5> fmt> {
+class ABSS_FM<bits<6> funct, bits<5> fmt> : StdArch {
bits<5> fd;
bits<5> fs;
@@ -598,7 +664,7 @@ class ABSS_FM<bits<6> funct, bits<5> fmt> {
let Inst{5-0} = funct;
}
-class MFC1_FM<bits<5> funct> {
+class MFC1_FM<bits<5> funct> : StdArch {
bits<5> rt;
bits<5> fs;
@@ -623,7 +689,7 @@ class LW_FM<bits<6> op> : StdArch {
let Inst{15-0} = addr{15-0};
}
-class MADDS_FM<bits<3> funct, bits<3> fmt> {
+class MADDS_FM<bits<3> funct, bits<3> fmt> : StdArch {
bits<5> fd;
bits<5> fr;
bits<5> fs;
@@ -640,7 +706,7 @@ class MADDS_FM<bits<3> funct, bits<3> fmt> {
let Inst{2-0} = fmt;
}
-class LWXC1_FM<bits<6> funct> {
+class LWXC1_FM<bits<6> funct> : StdArch {
bits<5> fd;
bits<5> base;
bits<5> index;
@@ -655,7 +721,7 @@ class LWXC1_FM<bits<6> funct> {
let Inst{5-0} = funct;
}
-class SWXC1_FM<bits<6> funct> {
+class SWXC1_FM<bits<6> funct> : StdArch {
bits<5> fs;
bits<5> base;
bits<5> index;
@@ -670,7 +736,7 @@ class SWXC1_FM<bits<6> funct> {
let Inst{5-0} = funct;
}
-class BC1F_FM<bit nd, bit tf> {
+class BC1F_FM<bit nd, bit tf> : StdArch {
bits<3> fcc;
bits<16> offset;
@@ -684,7 +750,7 @@ class BC1F_FM<bit nd, bit tf> {
let Inst{15-0} = offset;
}
-class CEQS_FM<bits<5> fmt> {
+class CEQS_FM<bits<5> fmt> : StdArch {
bits<5> fs;
bits<5> ft;
bits<4> cond;
@@ -704,7 +770,7 @@ class C_COND_FM<bits<5> fmt, bits<4> c> : CEQS_FM<fmt> {
let cond = c;
}
-class CMov_I_F_FM<bits<6> funct, bits<5> fmt> {
+class CMov_I_F_FM<bits<6> funct, bits<5> fmt> : StdArch {
bits<5> fd;
bits<5> fs;
bits<5> rt;
@@ -736,7 +802,7 @@ class CMov_F_I_FM<bit tf> : StdArch {
let Inst{5-0} = 1;
}
-class CMov_F_F_FM<bits<5> fmt, bit tf> {
+class CMov_F_F_FM<bits<5> fmt, bit tf> : StdArch {
bits<5> fd;
bits<5> fs;
bits<3> fcc;
@@ -752,3 +818,14 @@ class CMov_F_F_FM<bits<5> fmt, bit tf> {
let Inst{10-6} = fd;
let Inst{5-0} = 0x11;
}
+
+class BARRIER_FM<bits<5> op> : StdArch {
+ bits<32> Inst;
+
+ let Inst{31-26} = 0; // SPECIAL
+ let Inst{25-21} = 0;
+ let Inst{20-16} = 0; // rt = 0
+ let Inst{15-11} = 0; // rd = 0
+ let Inst{10-6} = op; // Operation
+ let Inst{5-0} = 0; // SLL
+}
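
The new WAIT_FM class above pins every field of the instruction word: opcode 0x10 in bits 31-26, bit 25 set, bits 24-6 cleared, and function 0x20 in bits 5-0. A quick standalone check that those fields assemble to the expected 32-bit encoding:

#include <cstdint>
#include <cstdio>

int main() {
  // Fields exactly as WAIT_FM lays them out.
  uint32_t Inst = (0x10u << 26) // Inst{31-26} = 0x10
                | (1u << 25)    // Inst{25}    = 1
                                // Inst{24-6}  = 0
                | 0x20u;        // Inst{5-0}   = 0x20
  std::printf("wait encodes as 0x%08X\n", (unsigned)Inst); // 0x42000020
  return 0;
}
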
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index ebdbaa4..07c37d8 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -168,10 +168,10 @@ def HasMips64r2 : Predicate<"Subtarget.hasMips64r2()">,
AssemblerPredicate<"FeatureMips64r2">;
def IsN64 : Predicate<"Subtarget.isABI_N64()">,
AssemblerPredicate<"FeatureN64">;
-def NotN64 : Predicate<"!Subtarget.isABI_N64()">,
- AssemblerPredicate<"!FeatureN64">;
def InMips16Mode : Predicate<"Subtarget.inMips16Mode()">,
AssemblerPredicate<"FeatureMips16">;
+def HasCnMips : Predicate<"Subtarget.hasCnMips()">,
+ AssemblerPredicate<"FeatureCnMips">;
def RelocStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">,
AssemblerPredicate<"FeatureMips32">;
def RelocPIC : Predicate<"TM.getRelocationModel() == Reloc::PIC_">,
@@ -187,6 +187,7 @@ def NotInMicroMips : Predicate<"!Subtarget.inMicroMipsMode()">,
AssemblerPredicate<"!FeatureMicroMips">;
def IsLE : Predicate<"Subtarget.isLittle()">;
def IsBE : Predicate<"!Subtarget.isLittle()">;
+def IsNotNaCl : Predicate<"!Subtarget.isTargetNaCl()">;
class MipsPat<dag pattern, dag result> : Pat<pattern, result> {
let Predicates = [HasStdEnc];
@@ -235,19 +236,31 @@ include "MipsInstrFormats.td"
// Mips Operand, Complex Patterns and Transformations Definitions.
//===----------------------------------------------------------------------===//
+def MipsJumpTargetAsmOperand : AsmOperandClass {
+ let Name = "JumpTarget";
+ let ParserMethod = "ParseJumpTarget";
+ let PredicateMethod = "isImm";
+ let RenderMethod = "addImmOperands";
+}
+
// Instruction operand types
def jmptarget : Operand<OtherVT> {
let EncoderMethod = "getJumpTargetOpValue";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
}
def brtarget : Operand<OtherVT> {
let EncoderMethod = "getBranchTargetOpValue";
let OperandType = "OPERAND_PCREL";
let DecoderMethod = "DecodeBranchTarget";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
}
def calltarget : Operand<iPTR> {
let EncoderMethod = "getJumpTargetOpValue";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
}
+def simm10 : Operand<i32>;
+
def simm16 : Operand<i32> {
let DecoderMethod= "DecodeSimm16";
}
@@ -265,6 +278,11 @@ def simm16_64 : Operand<i64> {
let DecoderMethod = "DecodeSimm16";
}
+// Zero
+def uimmz : Operand<i32> {
+ let PrintMethod = "printUnsignedImm";
+}
+
// Unsigned Operand
def uimm5 : Operand<i32> {
let PrintMethod = "printUnsignedImm";
@@ -292,18 +310,11 @@ def MipsInvertedImmoperand : AsmOperandClass {
let ParserMethod = "parseInvNum";
}
-def PtrRegAsmOperand : AsmOperandClass {
- let Name = "PtrReg";
- let ParserMethod = "parsePtrReg";
-}
-
-
def InvertedImOperand : Operand<i32> {
let ParserMatchClass = MipsInvertedImmoperand;
}
-// Address operand
-def mem : Operand<iPTR> {
+class mem_generic : Operand<iPTR> {
let PrintMethod = "printMemOperand";
let MIOperandInfo = (ops ptr_rc, simm16);
let EncoderMethod = "getMemEncoding";
@@ -311,6 +322,15 @@ def mem : Operand<iPTR> {
let OperandType = "OPERAND_MEMORY";
}
+// Address operand
+def mem : mem_generic;
+
+// MSA specific address operand
+def mem_msa : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, simm10);
+ let EncoderMethod = "getMSAMemEncoding";
+}
+
def mem_ea : Operand<iPTR> {
let PrintMethod = "printMemOperandEA";
let MIOperandInfo = (ops ptr_rc, simm16);
@@ -321,7 +341,7 @@ def mem_ea : Operand<iPTR> {
def PtrRC : Operand<iPTR> {
let MIOperandInfo = (ops ptr_rc);
let DecoderMethod = "DecodePtrRegisterClass";
- let ParserMatchClass = PtrRegAsmOperand;
+ let ParserMatchClass = GPR32AsmOperand;
}
// size operand of ext instruction
@@ -349,6 +369,9 @@ def HI16 : SDNodeXForm<imm, [{
// Plus 1.
def Plus1 : SDNodeXForm<imm, [{ return getImm(N, N->getSExtValue() + 1); }]>;
+// Node immediate is zero (e.g. insve.d)
+def immz : PatLeaf<(imm), [{ return N->getSExtValue() == 0; }]>;
+
// Node immediate fits as 16-bit sign extended on target immediate.
// e.g. addi, andi
def immSExt8 : PatLeaf<(imm), [{ return isInt<8>(N->getSExtValue()); }]>;
@@ -400,6 +423,8 @@ def addrRegReg :
def addrDefault :
ComplexPattern<iPTR, 2, "selectAddrDefault", [frameindex]>;
+def addrimm10 : ComplexPattern<iPTR, 2, "selectIntAddrMSA", [frameindex]>;
+
//===----------------------------------------------------------------------===//
// Instructions specific format
//===----------------------------------------------------------------------===//
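
The simm10 operand and mem_msa address operand defined in the hunk above give the MSA loads and stores a signed 10-bit immediate field, i.e. offsets in [-512, 511] before any scaling the MSA encoding applies. A hypothetical range checker (not part of the patch) just to make that window concrete:

#include <cstdint>
#include <iostream>

// A signed 10-bit immediate, as used by the simm10/mem_msa operands above,
// covers [-512, 511]. This checker is illustrative only.
bool isSImm10(int64_t V) { return V >= -512 && V <= 511; }

int main() {
  for (int64_t V : {-513, -512, 0, 511, 512})
    std::cout << V << (isSImm10(V) ? " fits" : " does not fit")
              << " in simm10\n";
  return 0;
}
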
@@ -413,6 +438,7 @@ class ArithLogicR<string opstr, RegisterOperand RO, bit isComm = 0,
[(set RO:$rd, (OpNode RO:$rs, RO:$rt))], Itin, FrmR, opstr> {
let isCommutable = isComm;
let isReMaterializable = 1;
+ let TwoOperandAliasConstraint = "$rd = $rs";
}
// Arithmetic and logical instructions with 2 register operands.
@@ -429,9 +455,9 @@ class ArithLogicI<string opstr, Operand Od, RegisterOperand RO,
}
// Arithmetic Multiply ADD/SUB
-class MArithR<string opstr, bit isComm = 0> :
+class MArithR<string opstr, InstrItinClass itin, bit isComm = 0> :
InstSE<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
- !strconcat(opstr, "\t$rs, $rt"), [], IIImult, FrmR, opstr> {
+ !strconcat(opstr, "\t$rs, $rt"), [], itin, FrmR, opstr> {
let Defs = [HI0, LO0];
let Uses = [HI0, LO0];
let isCommutable = isComm;
@@ -441,28 +467,30 @@ class MArithR<string opstr, bit isComm = 0> :
class LogicNOR<string opstr, RegisterOperand RO>:
InstSE<(outs RO:$rd), (ins RO:$rs, RO:$rt),
!strconcat(opstr, "\t$rd, $rs, $rt"),
- [(set RO:$rd, (not (or RO:$rs, RO:$rt)))], IIArith, FrmR, opstr> {
+ [(set RO:$rd, (not (or RO:$rs, RO:$rt)))], II_NOR, FrmR, opstr> {
let isCommutable = 1;
}
// Shifts
class shift_rotate_imm<string opstr, Operand ImmOpnd,
- RegisterOperand RO, SDPatternOperator OpNode = null_frag,
+ RegisterOperand RO, InstrItinClass itin,
+ SDPatternOperator OpNode = null_frag,
SDPatternOperator PF = null_frag> :
InstSE<(outs RO:$rd), (ins RO:$rt, ImmOpnd:$shamt),
!strconcat(opstr, "\t$rd, $rt, $shamt"),
- [(set RO:$rd, (OpNode RO:$rt, PF:$shamt))], IIArith, FrmR, opstr>;
+ [(set RO:$rd, (OpNode RO:$rt, PF:$shamt))], itin, FrmR, opstr>;
-class shift_rotate_reg<string opstr, RegisterOperand RO,
+class shift_rotate_reg<string opstr, RegisterOperand RO, InstrItinClass itin,
SDPatternOperator OpNode = null_frag>:
InstSE<(outs RO:$rd), (ins RO:$rt, GPR32Opnd:$rs),
!strconcat(opstr, "\t$rd, $rt, $rs"),
- [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))], IIArith, FrmR, opstr>;
+ [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))], itin, FrmR,
+ opstr>;
// Load Upper Immediate
class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>:
InstSE<(outs RO:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"),
- [], IIArith, FrmI, opstr>, IsAsCheapAsAMove {
+ [], II_LUI, FrmI, opstr>, IsAsCheapAsAMove {
let neverHasSideEffects = 1;
let isReMaterializable = 1;
}
@@ -533,14 +561,14 @@ class SetCC_R<string opstr, PatFrag cond_op, RegisterOperand RO> :
InstSE<(outs GPR32Opnd:$rd), (ins RO:$rs, RO:$rt),
!strconcat(opstr, "\t$rd, $rs, $rt"),
[(set GPR32Opnd:$rd, (cond_op RO:$rs, RO:$rt))],
- IIslt, FrmR, opstr>;
+ II_SLT_SLTU, FrmR, opstr>;
class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type,
RegisterOperand RO>:
InstSE<(outs GPR32Opnd:$rt), (ins RO:$rs, Od:$imm16),
!strconcat(opstr, "\t$rt, $rs, $imm16"),
[(set GPR32Opnd:$rt, (cond_op RO:$rs, imm_type:$imm16))],
- IIslt, FrmI, opstr>;
+ II_SLTI_SLTIU, FrmI, opstr>;
// Jump
class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
@@ -603,7 +631,7 @@ let isCall=1, hasDelaySlot=1, Defs = [RA] in {
class JumpLinkReg<string opstr, RegisterOperand RO>:
InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
- [], IIBranch, FrmR, opstr>;
+ [], IIBranch, FrmR>;
class BGEZAL_FT<string opstr, DAGOperand opnd, RegisterOperand RO> :
InstSE<(outs), (ins RO:$rs, opnd:$offset),
@@ -611,6 +639,18 @@ let isCall=1, hasDelaySlot=1, Defs = [RA] in {
}
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
+ hasExtraSrcRegAllocReq = 1, Defs = [AT] in {
+ class TailCall<Instruction JumpInst> :
+ PseudoSE<(outs), (ins calltarget:$target), [], IIBranch>,
+ PseudoInstExpansion<(JumpInst jmptarget:$target)>;
+
+ class TailCallReg<RegisterOperand RO, Instruction JRInst,
+ RegisterOperand ResRO = RO> :
+ PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], IIBranch>,
+ PseudoInstExpansion<(JRInst ResRO:$rs)>;
+}
+
class BAL_BR_Pseudo<Instruction RealInst> :
PseudoSE<(outs), (ins brtarget:$offset), [], IIBranch>,
PseudoInstExpansion<(RealInst ZERO, brtarget:$offset)> {
@@ -624,36 +664,32 @@ class BAL_BR_Pseudo<Instruction RealInst> :
// Syscall
class SYS_FT<string opstr> :
InstSE<(outs), (ins uimm20:$code_),
- !strconcat(opstr, "\t$code_"), [], NoItinerary, FrmI>;
+ !strconcat(opstr, "\t$code_"), [], NoItinerary, FrmI, opstr>;
// Break
class BRK_FT<string opstr> :
InstSE<(outs), (ins uimm10:$code_1, uimm10:$code_2),
- !strconcat(opstr, "\t$code_1, $code_2"), [], NoItinerary, FrmOther>;
+ !strconcat(opstr, "\t$code_1, $code_2"), [], NoItinerary,
+ FrmOther, opstr>;
// (D)Eret
class ER_FT<string opstr> :
InstSE<(outs), (ins),
- opstr, [], NoItinerary, FrmOther>;
+ opstr, [], NoItinerary, FrmOther, opstr>;
// Interrupts
class DEI_FT<string opstr, RegisterOperand RO> :
InstSE<(outs RO:$rt), (ins),
- !strconcat(opstr, "\t$rt"), [], NoItinerary, FrmOther>;
+ !strconcat(opstr, "\t$rt"), [], NoItinerary, FrmOther, opstr>;
// Wait
class WAIT_FT<string opstr> :
- InstSE<(outs), (ins), opstr, [], NoItinerary, FrmOther> {
- let Inst{31-26} = 0x10;
- let Inst{25} = 1;
- let Inst{24-6} = 0;
- let Inst{5-0} = 0x20;
-}
+ InstSE<(outs), (ins), opstr, [], NoItinerary, FrmOther, opstr>;
// Sync
let hasSideEffects = 1 in
-class SYNC_FT :
+class SYNC_FT<string opstr> :
InstSE<(outs), (ins i32imm:$stype), "sync $stype", [(MipsSync imm:$stype)],
- NoItinerary, FrmOther>;
+ NoItinerary, FrmOther, opstr>;
let hasSideEffects = 1 in
class TEQ_FT<string opstr, RegisterOperand RO> :
@@ -690,12 +726,13 @@ class MultDivPseudo<Instruction RealInst, RegisterClass R0, RegisterOperand R1,
// Pseudo multiply add/sub instruction with explicit accumulator register
// operands.
-class MAddSubPseudo<Instruction RealInst, SDPatternOperator OpNode>
+class MAddSubPseudo<Instruction RealInst, SDPatternOperator OpNode,
+ InstrItinClass itin>
: PseudoSE<(outs ACC64:$ac),
(ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin),
[(set ACC64:$ac,
(OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin))],
- IIImult>,
+ itin>,
PseudoInstExpansion<(RealInst GPR32Opnd:$rs, GPR32Opnd:$rt)> {
string Constraints = "$acin = $ac";
}
@@ -710,21 +747,22 @@ class Div<string opstr, InstrItinClass itin, RegisterOperand RO,
// Move from Hi/Lo
class PseudoMFLOHI<RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode>
: PseudoSE<(outs DstRC:$rd), (ins SrcRC:$hilo),
- [(set DstRC:$rd, (OpNode SrcRC:$hilo))], IIHiLo>;
+ [(set DstRC:$rd, (OpNode SrcRC:$hilo))], II_MFHI_MFLO>;
class MoveFromLOHI<string opstr, RegisterOperand RO, Register UseReg>:
- InstSE<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"), [], IIHiLo, FrmR,
- opstr> {
+ InstSE<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"), [], II_MFHI_MFLO,
+ FrmR, opstr> {
let Uses = [UseReg];
let neverHasSideEffects = 1;
}
class PseudoMTLOHI<RegisterClass DstRC, RegisterClass SrcRC>
: PseudoSE<(outs DstRC:$lohi), (ins SrcRC:$lo, SrcRC:$hi),
- [(set DstRC:$lohi, (MipsMTLOHI SrcRC:$lo, SrcRC:$hi))], IIHiLo>;
+ [(set DstRC:$lohi, (MipsMTLOHI SrcRC:$lo, SrcRC:$hi))],
+ II_MTHI_MTLO>;
class MoveToLOHI<string opstr, RegisterOperand RO, list<Register> DefRegs>:
- InstSE<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), [], IIHiLo,
+ InstSE<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), [], II_MTHI_MTLO,
FrmR, opstr> {
let Defs = DefRegs;
let neverHasSideEffects = 1;
@@ -732,7 +770,8 @@ class MoveToLOHI<string opstr, RegisterOperand RO, list<Register> DefRegs>:
class EffectiveAddress<string opstr, RegisterOperand RO> :
InstSE<(outs RO:$rt), (ins mem_ea:$addr), !strconcat(opstr, "\t$rt, $addr"),
- [(set RO:$rt, addr:$addr)], NoItinerary, FrmI> {
+ [(set RO:$rt, addr:$addr)], NoItinerary, FrmI,
+ !strconcat(opstr, "_lea")> {
let isCodeGenOnly = 1;
let DecoderMethod = "DecodeMem";
}
@@ -740,19 +779,19 @@ class EffectiveAddress<string opstr, RegisterOperand RO> :
// Count Leading Ones/Zeros in Word
class CountLeading0<string opstr, RegisterOperand RO>:
InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
- [(set RO:$rd, (ctlz RO:$rs))], IIArith, FrmR, opstr>,
+ [(set RO:$rd, (ctlz RO:$rs))], II_CLZ, FrmR, opstr>,
Requires<[HasBitCount, HasStdEnc]>;
class CountLeading1<string opstr, RegisterOperand RO>:
InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
- [(set RO:$rd, (ctlz (not RO:$rs)))], IIArith, FrmR, opstr>,
+ [(set RO:$rd, (ctlz (not RO:$rs)))], II_CLO, FrmR, opstr>,
Requires<[HasBitCount, HasStdEnc]>;
-
// Sign Extend in Register.
-class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO> :
+class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO,
+ InstrItinClass itin> :
InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"),
- [(set RO:$rd, (sext_inreg RO:$rt, vt))], IIseb, FrmR, opstr> {
+ [(set RO:$rd, (sext_inreg RO:$rt, vt))], itin, FrmR, opstr> {
let Predicates = [HasSEInReg, HasStdEnc];
}
@@ -767,7 +806,7 @@ class SubwordSwap<string opstr, RegisterOperand RO>:
// Read Hardware
class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> :
InstSE<(outs CPURegOperand:$rt), (ins RO:$rd), "rdhwr\t$rt, $rd", [],
- IIArith, FrmR>;
+ II_RDHWR, FrmR>;
// Ext and Ins
class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
@@ -884,7 +923,7 @@ let isPseudo = 1, isCodeGenOnly = 1 in {
//===----------------------------------------------------------------------===//
/// Arithmetic Instructions (ALU Immediate)
-def ADDiu : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd, IIArith, immSExt16,
+def ADDiu : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU, immSExt16,
add>,
ADDI_FM<0x9>, IsAsCheapAsAMove;
def ADDi : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>, ADDI_FM<0x8>;
@@ -892,80 +931,82 @@ def SLTi : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
SLTI_FM<0xa>;
def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
SLTI_FM<0xb>;
-def ANDi : MMRel, ArithLogicI<"andi", uimm16, GPR32Opnd, IILogic, immZExt16,
+def ANDi : MMRel, ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI, immZExt16,
and>,
ADDI_FM<0xc>;
-def ORi : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd, IILogic, immZExt16,
+def ORi : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16,
or>,
ADDI_FM<0xd>;
-def XORi : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd, IILogic, immZExt16,
+def XORi : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI, immZExt16,
xor>,
ADDI_FM<0xe>;
def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16>, LUI_FM;
/// Arithmetic Instructions (3-Operand, R-Type)
-def ADDu : MMRel, ArithLogicR<"addu", GPR32Opnd, 1, IIArith, add>,
+def ADDu : MMRel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>,
ADD_FM<0, 0x21>;
-def SUBu : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, IIArith, sub>,
+def SUBu : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
ADD_FM<0, 0x23>;
let Defs = [HI0, LO0] in
-def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, IIImul, mul>,
+def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
ADD_FM<0x1c, 2>;
def ADD : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM<0, 0x20>;
def SUB : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM<0, 0x22>;
def SLT : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>;
def SLTu : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>, ADD_FM<0, 0x2b>;
-def AND : MMRel, ArithLogicR<"and", GPR32Opnd, 1, IILogic, and>,
+def AND : MMRel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
ADD_FM<0, 0x24>;
-def OR : MMRel, ArithLogicR<"or", GPR32Opnd, 1, IILogic, or>,
+def OR : MMRel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
ADD_FM<0, 0x25>;
-def XOR : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, IILogic, xor>,
+def XOR : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
ADD_FM<0, 0x26>;
def NOR : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>;
/// Shift Instructions
-def SLL : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, shl, immZExt5>,
- SRA_FM<0, 0>;
-def SRL : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, srl, immZExt5>,
- SRA_FM<2, 0>;
-def SRA : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, sra, immZExt5>,
- SRA_FM<3, 0>;
-def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, shl>, SRLV_FM<4, 0>;
-def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, srl>, SRLV_FM<6, 0>;
-def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, sra>, SRLV_FM<7, 0>;
+def SLL : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL, shl,
+ immZExt5>, SRA_FM<0, 0>;
+def SRL : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL, srl,
+ immZExt5>, SRA_FM<2, 0>;
+def SRA : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA, sra,
+ immZExt5>, SRA_FM<3, 0>;
+def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV, shl>,
+ SRLV_FM<4, 0>;
+def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV, srl>,
+ SRLV_FM<6, 0>;
+def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, sra>,
+ SRLV_FM<7, 0>;
// Rotate Instructions
let Predicates = [HasMips32r2, HasStdEnc] in {
- def ROTR : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, rotr,
- immZExt5>,
- SRA_FM<2, 1>;
- def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, rotr>,
+ def ROTR : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR, rotr,
+ immZExt5>, SRA_FM<2, 1>;
+ def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV, rotr>,
SRLV_FM<6, 1>;
}
/// Load and Store Instructions
/// aligned
-def LB : Load<"lb", GPR32Opnd, sextloadi8, IILoad>, MMRel, LW_FM<0x20>;
-def LBu : Load<"lbu", GPR32Opnd, zextloadi8, IILoad, addrDefault>, MMRel,
+def LB : Load<"lb", GPR32Opnd, sextloadi8, II_LB>, MMRel, LW_FM<0x20>;
+def LBu : Load<"lbu", GPR32Opnd, zextloadi8, II_LBU, addrDefault>, MMRel,
LW_FM<0x24>;
-def LH : Load<"lh", GPR32Opnd, sextloadi16, IILoad, addrDefault>, MMRel,
+def LH : Load<"lh", GPR32Opnd, sextloadi16, II_LH, addrDefault>, MMRel,
LW_FM<0x21>;
-def LHu : Load<"lhu", GPR32Opnd, zextloadi16, IILoad>, MMRel, LW_FM<0x25>;
-def LW : Load<"lw", GPR32Opnd, load, IILoad, addrDefault>, MMRel,
+def LHu : Load<"lhu", GPR32Opnd, zextloadi16, II_LHU>, MMRel, LW_FM<0x25>;
+def LW : Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel,
LW_FM<0x23>;
-def SB : Store<"sb", GPR32Opnd, truncstorei8, IIStore>, MMRel, LW_FM<0x28>;
-def SH : Store<"sh", GPR32Opnd, truncstorei16, IIStore>, MMRel, LW_FM<0x29>;
-def SW : Store<"sw", GPR32Opnd, store, IIStore>, MMRel, LW_FM<0x2b>;
+def SB : Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel, LW_FM<0x28>;
+def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>;
+def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>;
/// load/store left/right
let Predicates = [NotInMicroMips] in {
-def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, IILoad>, LW_FM<0x22>;
-def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, IILoad>, LW_FM<0x26>;
-def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, IIStore>, LW_FM<0x2a>;
-def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, IIStore>, LW_FM<0x2e>;
+def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>;
+def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>;
+def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>;
+def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>;
}
-def SYNC : SYNC_FT, SYNC_FM;
+def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM;
def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>;
def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>;
def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>;
@@ -980,21 +1021,23 @@ def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM<0xa>;
def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM<0xb>;
def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>;
-def BREAK : BRK_FT<"break">, BRK_FM<0xd>;
-def SYSCALL : SYS_FT<"syscall">, SYS_FM<0xc>;
+def BREAK : MMRel, BRK_FT<"break">, BRK_FM<0xd>;
+def SYSCALL : MMRel, SYS_FT<"syscall">, SYS_FM<0xc>;
def TRAP : TrapBase<BREAK>;
-def ERET : ER_FT<"eret">, ER_FM<0x18>;
-def DERET : ER_FT<"deret">, ER_FM<0x1f>;
+def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>;
+def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>;
-def EI : DEI_FT<"ei", GPR32Opnd>, EI_FM<1>;
-def DI : DEI_FT<"di", GPR32Opnd>, EI_FM<0>;
+def EI : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>;
+def DI : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>;
-def WAIT : WAIT_FT<"wait">;
+let Predicates = [NotInMicroMips] in {
+def WAIT : WAIT_FT<"wait">, WAIT_FM;
/// Load-linked, Store-conditional
def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>;
def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>;
+}
/// Jump and Branch Instructions
def J : MMRel, JumpFJ<jmptarget, "j", br, bb, "j">, FJ<2>,
@@ -1013,15 +1056,16 @@ def BLTZ : MMRel, CBranchZero<"bltz", brtarget, setlt, GPR32Opnd>,
def B : UncondBranch<BEQ>;
def JAL : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
-def JALR : MMRel, JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
+let Predicates = [NotInMicroMips, HasStdEnc] in {
+def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
+}
+def JALX : JumpLink<"jalx", calltarget>, FJ<0x1D>;
def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>;
def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>;
def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
-def TAILCALL : MMRel, JumpFJ<calltarget, "j", MipsTailCall, imm, "tcall">,
- FJ<2>, IsTailCall;
-def TAILCALL_R : MMRel, JumpFR<"tcallr", GPR32Opnd, MipsTailCall>, MTLO_FM<8>,
- IsTailCall;
+def TAILCALL : TailCall<J>;
+def TAILCALL_R : TailCallReg<GPR32Opnd, JR>;
def RET : MMRel, RetBase<"ret", GPR32Opnd>, MTLO_FM<8>;
@@ -1047,23 +1091,25 @@ let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1 in {
}
/// Multiply and Divide Instructions.
-def MULT : MMRel, Mult<"mult", IIImult, GPR32Opnd, [HI0, LO0]>,
+def MULT : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
MULT_FM<0, 0x18>;
-def MULTu : MMRel, Mult<"multu", IIImult, GPR32Opnd, [HI0, LO0]>,
+def MULTu : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
MULT_FM<0, 0x19>;
-def SDIV : MMRel, Div<"div", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+def SDIV : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
MULT_FM<0, 0x1a>;
-def UDIV : MMRel, Div<"divu", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+def UDIV : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
MULT_FM<0, 0x1b>;
def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>;
def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>;
+let Predicates = [NotInMicroMips] in {
def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>;
def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>;
+}
/// Sign Ext In Register Instructions.
-def SEB : MMRel, SignExtInReg<"seb", i8, GPR32Opnd>, SEB_FM<0x10, 0x20>;
-def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd>, SEB_FM<0x18, 0x20>;
+def SEB : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>, SEB_FM<0x10, 0x20>;
+def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>, SEB_FM<0x18, 0x20>;
/// Count Leading
def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>;
@@ -1079,29 +1125,29 @@ def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
// instructions. The same does not happen for stack address copies, so an
// add op with mem ComplexPattern is used and the stack address copy
// can be matched. It's similar to Sparc LEA_ADDRi
-def LEA_ADDiu : EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>;
+def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>;
// MADD*/MSUB*
-def MADD : MMRel, MArithR<"madd", 1>, MULT_FM<0x1c, 0>;
-def MADDU : MMRel, MArithR<"maddu", 1>, MULT_FM<0x1c, 1>;
-def MSUB : MMRel, MArithR<"msub">, MULT_FM<0x1c, 4>;
-def MSUBU : MMRel, MArithR<"msubu">, MULT_FM<0x1c, 5>;
+def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>;
+def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>;
+def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>;
+def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>;
let Predicates = [HasStdEnc, NotDSP] in {
-def PseudoMULT : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, IIImult>;
-def PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, IIImult>;
+def PseudoMULT : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, II_MULT>;
+def PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, II_MULTU>;
def PseudoMFHI : PseudoMFLOHI<GPR32, ACC64, MipsMFHI>;
def PseudoMFLO : PseudoMFLOHI<GPR32, ACC64, MipsMFLO>;
def PseudoMTLOHI : PseudoMTLOHI<ACC64, GPR32>;
-def PseudoMADD : MAddSubPseudo<MADD, MipsMAdd>;
-def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu>;
-def PseudoMSUB : MAddSubPseudo<MSUB, MipsMSub>;
-def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu>;
+def PseudoMADD : MAddSubPseudo<MADD, MipsMAdd, II_MADD>;
+def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu, II_MADDU>;
+def PseudoMSUB : MAddSubPseudo<MSUB, MipsMSub, II_MSUB>;
+def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu, II_MSUBU>;
}
-def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, IIIdiv,
+def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, II_DIV,
0, 1, 1>;
-def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, IIIdiv,
+def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU,
0, 1, 1>;
def RDHWR : ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
@@ -1115,12 +1161,18 @@ def MTC0 : MFC3OP<"mtc0", GPR32Opnd>, MFC3OP_FM<0x10, 4>;
def MFC2 : MFC3OP<"mfc2", GPR32Opnd>, MFC3OP_FM<0x12, 0>;
def MTC2 : MFC3OP<"mtc2", GPR32Opnd>, MFC3OP_FM<0x12, 4>;
+class Barrier<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary,
+ FrmOther>;
+def SSNOP : Barrier<"ssnop">, BARRIER_FM<1>;
+def EHB : Barrier<"ehb">, BARRIER_FM<3>;
+def PAUSE : Barrier<"pause">, BARRIER_FM<5>, Requires<[HasMips32r2]>;
+
//===----------------------------------------------------------------------===//
// Instruction aliases
//===----------------------------------------------------------------------===//
def : InstAlias<"move $dst, $src",
(ADDu GPR32Opnd:$dst, GPR32Opnd:$src,ZERO), 1>,
- Requires<[NotMips64]>;
+ Requires<[NotMips64, NotInMicroMips]>;
def : InstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>;
def : InstAlias<"addu $rs, $rt, $imm",
(ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
@@ -1129,7 +1181,9 @@ def : InstAlias<"add $rs, $rt, $imm",
def : InstAlias<"and $rs, $rt, $imm",
(ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
def : InstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
+let Predicates = [NotInMicroMips] in {
def : InstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
+}
def : InstAlias<"jal $rs", (JALR RA, GPR32Opnd:$rs), 0>;
def : InstAlias<"jal $rd,$rs", (JALR GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
def : InstAlias<"not $rt, $rs",
@@ -1419,3 +1473,4 @@ include "MipsMSAInstrInfo.td"
// Micromips
include "MicroMipsInstrFormats.td"
include "MicroMipsInstrInfo.td"
+include "MicroMipsInstrFPU.td"
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index 2efe578..2b6a874 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -134,7 +134,7 @@ void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
(!LastBr->isConditionalBranch() && !LastBr->isUnconditionalBranch()))
return;
- ReverseIter FirstBr = getNonDebugInstr(llvm::next(LastBr), End);
+ ReverseIter FirstBr = getNonDebugInstr(std::next(LastBr), End);
// MBB has only one branch instruction if FirstBr is not a branch
// instruction.
@@ -154,7 +154,7 @@ void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
NewMBB->removeSuccessor(Tgt);
MBB->addSuccessor(NewMBB);
MBB->addSuccessor(Tgt);
- MF->insert(llvm::next(MachineFunction::iterator(MBB)), NewMBB);
+ MF->insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
NewMBB->splice(NewMBB->end(), MBB, (++LastBr).base(), MBB->end());
}
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index b6dfadc..7c9a9ed 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -18,10 +18,10 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Target/Mangler.h"
using namespace llvm;
diff --git a/lib/Target/Mips/MipsMSAInstrFormats.td b/lib/Target/Mips/MipsMSAInstrFormats.td
index 875dc0b..6bd0366 100644
--- a/lib/Target/Mips/MipsMSAInstrFormats.td
+++ b/lib/Target/Mips/MipsMSAInstrFormats.td
@@ -15,6 +15,10 @@ class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
let Inst{31-26} = 0b011110;
}
+class MSA64Inst : MSAInst {
+ let Predicates = [HasMSA, HasMips64];
+}
+
class MSACBranch : MSAInst {
let Inst{31-26} = 0b010001;
}
@@ -23,7 +27,11 @@ class MSASpecial : MSAInst {
let Inst{31-26} = 0b000000;
}
-class PseudoMSA<dag outs, dag ins, list<dag> pattern,
+class MSA64Special : MSA64Inst {
+ let Inst{31-26} = 0b000000;
+}
+
+class MSAPseudo<dag outs, dag ins, list<dag> pattern,
InstrItinClass itin = IIPseudo>:
MipsPseudo<outs, ins, pattern, itin> {
let Predicates = [HasMSA];
@@ -92,6 +100,17 @@ class MSA_2R_FILL_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
let Inst{5-0} = minor;
}
+class MSA_2R_FILL_D_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSA64Inst {
+ bits<5> rs;
+ bits<5> wd;
+
+ let Inst{25-18} = major;
+ let Inst{17-16} = df;
+ let Inst{15-11} = rs;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
class MSA_2R_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
bits<5> ws;
bits<5> wd;
@@ -274,6 +293,19 @@ class MSA_ELM_COPY_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
let Inst{5-0} = minor;
}
+class MSA_ELM_COPY_D_FMT<bits<4> major, bits<6> minor>: MSA64Inst {
+ bits<4> n;
+ bits<5> ws;
+ bits<5> rd;
+
+ let Inst{25-22} = major;
+ let Inst{21-17} = 0b11100;
+ let Inst{16} = n{0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = rd;
+ let Inst{5-0} = minor;
+}
+
class MSA_ELM_INSERT_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
bits<6> n;
bits<5> rs;
@@ -313,6 +345,19 @@ class MSA_ELM_INSERT_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
let Inst{5-0} = minor;
}
+class MSA_ELM_INSERT_D_FMT<bits<4> major, bits<6> minor>: MSA64Inst {
+ bits<6> n;
+ bits<5> rs;
+ bits<5> wd;
+
+ let Inst{25-22} = major;
+ let Inst{21-17} = 0b11100;
+ let Inst{16} = n{0};
+ let Inst{15-11} = rs;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
class MSA_I5_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
bits<5> imm;
bits<5> ws;
@@ -404,3 +449,17 @@ class SPECIAL_LSA_FMT<bits<6> minor>: MSASpecial {
let Inst{7-6} = sa;
let Inst{5-0} = minor;
}
+
+class SPECIAL_DLSA_FMT<bits<6> minor>: MSA64Special {
+ bits<5> rs;
+ bits<5> rt;
+ bits<5> rd;
+ bits<2> sa;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-8} = 0b000;
+ let Inst{7-6} = sa;
+ let Inst{5-0} = minor;
+}
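
[Editorial note, not part of the patch] The new MSA64 encoding formats above are plain bit-field assignments; as a minimal, hypothetical C++ sketch of how the MSA_2R_FILL_D_FMT fields combine into a 32-bit word (MSAInst fixes Inst{31-26} to 0b011110, and the FILL_D_ENC constants come from the MipsMSAInstrInfo.td hunk below), consider:

#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the MSA_2R_FILL_D_FMT layout; illustrative only.
static uint32_t encodeMSA2RFillD(uint32_t major, uint32_t df, uint32_t rs,
                                 uint32_t wd, uint32_t minor) {
  return (0b011110u << 26) | ((major & 0xFF) << 18) | ((df & 0x3) << 16) |
         ((rs & 0x1F) << 11) | ((wd & 0x1F) << 6) | (minor & 0x3F);
}

int main() {
  // fill.d $w0, $5 with the FILL_D_ENC constants (major=0b11000000,
  // df=0b11, minor=0b011110).
  std::printf("0x%08x\n", encodeMSA2RFillD(0b11000000, 0b11, 5, 0, 0b011110));
  return 0;
}
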
diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
index 82c51a6..5722c6c 100644
--- a/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -27,6 +27,9 @@ def SDT_SHF : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
SDTCisVT<1, i32>, SDTCisSameAs<0, 2>]>;
def SDT_ILV : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
+def SDT_INSVE : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>, SDTCisSameAs<0, 3>,
+ SDTCisVT<4, i32>]>;
def MipsVAllNonZero : SDNode<"MipsISD::VALL_NONZERO", SDT_MipsVecCond>;
def MipsVAnyNonZero : SDNode<"MipsISD::VANY_NONZERO", SDT_MipsVecCond>;
@@ -50,6 +53,7 @@ def MipsILVL : SDNode<"MipsISD::ILVL", SDT_ILV>;
def MipsILVR : SDNode<"MipsISD::ILVR", SDT_ILV>;
def MipsPCKEV : SDNode<"MipsISD::PCKEV", SDT_ILV>;
def MipsPCKOD : SDNode<"MipsISD::PCKOD", SDT_ILV>;
+def MipsINSVE : SDNode<"MipsISD::INSVE", SDT_INSVE>;
def vsetcc : SDNode<"ISD::SETCC", SDT_VSetCC>;
def vfsetcc : SDNode<"ISD::SETCC", SDT_VFSetCC>;
@@ -69,7 +73,7 @@ def uimm2 : Operand<i32> {
// as the encoded value should be subtracted by one.
def uimm2LSAAsmOperand : AsmOperandClass {
let Name = "LSAImm";
- let ParserMethod = "parseLSAImm";
+ let ParserMethod = "ParseLSAImm";
let RenderMethod = "addImmOperands";
}
@@ -94,8 +98,6 @@ def uimm8 : Operand<i32> {
def simm5 : Operand<i32>;
-def simm10 : Operand<i32>;
-
def vsplat_uimm1 : Operand<vAny> {
let PrintMethod = "printUnsignedImm8";
}
@@ -137,6 +139,8 @@ def vextract_sext_i16 : PatFrag<(ops node:$vec, node:$idx),
(MipsVExtractSExt node:$vec, node:$idx, i16)>;
def vextract_sext_i32 : PatFrag<(ops node:$vec, node:$idx),
(MipsVExtractSExt node:$vec, node:$idx, i32)>;
+def vextract_sext_i64 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractSExt node:$vec, node:$idx, i64)>;
def vextract_zext_i8 : PatFrag<(ops node:$vec, node:$idx),
(MipsVExtractZExt node:$vec, node:$idx, i8)>;
@@ -144,6 +148,8 @@ def vextract_zext_i16 : PatFrag<(ops node:$vec, node:$idx),
(MipsVExtractZExt node:$vec, node:$idx, i16)>;
def vextract_zext_i32 : PatFrag<(ops node:$vec, node:$idx),
(MipsVExtractZExt node:$vec, node:$idx, i32)>;
+def vextract_zext_i64 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractZExt node:$vec, node:$idx, i64)>;
def vinsert_v16i8 : PatFrag<(ops node:$vec, node:$val, node:$idx),
(v16i8 (vector_insert node:$vec, node:$val, node:$idx))>;
@@ -151,6 +157,17 @@ def vinsert_v8i16 : PatFrag<(ops node:$vec, node:$val, node:$idx),
(v8i16 (vector_insert node:$vec, node:$val, node:$idx))>;
def vinsert_v4i32 : PatFrag<(ops node:$vec, node:$val, node:$idx),
(v4i32 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v2i64 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+ (v2i64 (vector_insert node:$vec, node:$val, node:$idx))>;
+
+def insve_v16i8 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+ (v16i8 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+def insve_v8i16 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+ (v8i16 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+def insve_v4i32 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+ (v4i32 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+def insve_v2i64 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+ (v2i64 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
class vfsetcc_type<ValueType ResTy, ValueType OpTy, CondCode CC> :
PatFrag<(ops node:$lhs, node:$rhs),
@@ -232,7 +249,7 @@ def vsplati32 : PatFrag<(ops node:$e0),
(v4i32 (build_vector node:$e0, node:$e0,
node:$e0, node:$e0))>;
def vsplati64 : PatFrag<(ops node:$e0),
- (v2i64 (build_vector:$v0 node:$e0, node:$e0))>;
+ (v2i64 (build_vector node:$e0, node:$e0))>;
def vsplatf32 : PatFrag<(ops node:$e0),
(v4f32 (build_vector node:$e0, node:$e0,
node:$e0, node:$e0))>;
@@ -614,10 +631,12 @@ class CLTI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000111>;
class COPY_S_B_ENC : MSA_ELM_COPY_B_FMT<0b0010, 0b011001>;
class COPY_S_H_ENC : MSA_ELM_COPY_H_FMT<0b0010, 0b011001>;
class COPY_S_W_ENC : MSA_ELM_COPY_W_FMT<0b0010, 0b011001>;
+class COPY_S_D_ENC : MSA_ELM_COPY_D_FMT<0b0010, 0b011001>;
class COPY_U_B_ENC : MSA_ELM_COPY_B_FMT<0b0011, 0b011001>;
class COPY_U_H_ENC : MSA_ELM_COPY_H_FMT<0b0011, 0b011001>;
class COPY_U_W_ENC : MSA_ELM_COPY_W_FMT<0b0011, 0b011001>;
+class COPY_U_D_ENC : MSA_ELM_COPY_D_FMT<0b0011, 0b011001>;
class CTCMSA_ENC : MSA_ELM_CTCMSA_FMT<0b0000111110, 0b011001>;
@@ -724,6 +743,7 @@ class FFQR_D_ENC : MSA_2RF_FMT<0b110011011, 0b1, 0b011110>;
class FILL_B_ENC : MSA_2R_FILL_FMT<0b11000000, 0b00, 0b011110>;
class FILL_H_ENC : MSA_2R_FILL_FMT<0b11000000, 0b01, 0b011110>;
class FILL_W_ENC : MSA_2R_FILL_FMT<0b11000000, 0b10, 0b011110>;
+class FILL_D_ENC : MSA_2R_FILL_D_FMT<0b11000000, 0b11, 0b011110>;
class FLOG2_W_ENC : MSA_2RF_FMT<0b110010111, 0b0, 0b011110>;
class FLOG2_D_ENC : MSA_2RF_FMT<0b110010111, 0b1, 0b011110>;
@@ -851,6 +871,7 @@ class ILVR_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010100>;
class INSERT_B_ENC : MSA_ELM_INSERT_B_FMT<0b0100, 0b011001>;
class INSERT_H_ENC : MSA_ELM_INSERT_H_FMT<0b0100, 0b011001>;
class INSERT_W_ENC : MSA_ELM_INSERT_W_FMT<0b0100, 0b011001>;
+class INSERT_D_ENC : MSA_ELM_INSERT_D_FMT<0b0100, 0b011001>;
class INSVE_B_ENC : MSA_ELM_B_FMT<0b0101, 0b011001>;
class INSVE_H_ENC : MSA_ELM_H_FMT<0b0101, 0b011001>;
@@ -868,6 +889,7 @@ class LDI_W_ENC : MSA_I10_FMT<0b110, 0b10, 0b000111>;
class LDI_D_ENC : MSA_I10_FMT<0b110, 0b11, 0b000111>;
class LSA_ENC : SPECIAL_LSA_FMT<0b000101>;
+class DLSA_ENC : SPECIAL_DLSA_FMT<0b010101>;
class MADD_Q_H_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011100>;
class MADD_Q_W_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011100>;
@@ -1221,8 +1243,12 @@ class MSA_BIT_BINSXI_DESC_BASE<string instr_asm, ValueType Ty,
dag OutOperandList = (outs ROWD:$wd);
dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, vsplat_uimm8:$m);
string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
- list<dag> Pattern = [(set ROWD:$wd, (vselect (Ty Mask:$m), (Ty ROWD:$wd_in),
- ROWS:$ws))];
+ // Note that binsxi and vselect treat the condition operand the opposite
+ // way from each other.
+ // (vselect cond, if_set, if_clear)
+ // (BSEL_V cond, if_clear, if_set)
+ list<dag> Pattern = [(set ROWD:$wd, (vselect (Ty Mask:$m), (Ty ROWD:$ws),
+ ROWS:$wd_in))];
InstrItinClass Itinerary = itin;
string Constraints = "$wd = $wd_in";
}
@@ -1261,20 +1287,22 @@ class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass Itinerary = itin;
}
-class MSA_ELM_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
- InstrItinClass itin = NoItinerary> {
+class MSA_ELM_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWS:$ws, uimm4:$n);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, uimm4:$n);
string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
- list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$n))];
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+ immZExt4:$n))];
+ string Constraints = "$wd = $wd_in";
InstrItinClass Itinerary = itin;
}
class MSA_COPY_PSEUDO_BASE<SDPatternOperator OpNode, ValueType VecTy,
RegisterClass RCD, RegisterClass RCWS> :
- MipsPseudo<(outs RCD:$wd), (ins RCWS:$ws, uimm4:$n),
- [(set RCD:$wd, (OpNode (VecTy RCWS:$ws), immZExt4:$n))]> {
+ MSAPseudo<(outs RCD:$wd), (ins RCWS:$ws, uimm4:$n),
+ [(set RCD:$wd, (OpNode (VecTy RCWS:$ws), immZExt4:$n))]> {
bit usesCustomInserter = 1;
}
@@ -1300,17 +1328,6 @@ class MSA_I8_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass Itinerary = itin;
}
-// This class is deprecated and will be removed in the next few patches
-class MSA_I8_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
- InstrItinClass itin = NoItinerary> {
- dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWS:$ws, uimm8:$u8);
- string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
- list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt8:$u8))];
- InstrItinClass Itinerary = itin;
-}
-
class MSA_I8_SHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
RegisterOperand ROWS = ROWD,
InstrItinClass itin = NoItinerary> {
@@ -1355,8 +1372,8 @@ class MSA_2R_FILL_DESC_BASE<string instr_asm, ValueType VT,
class MSA_2R_FILL_PSEUDO_BASE<ValueType VT, SDPatternOperator OpNode,
RegisterClass RCWD, RegisterClass RCWS = RCWD> :
- MipsPseudo<(outs RCWD:$wd), (ins RCWS:$fs),
- [(set RCWD:$wd, (OpNode RCWS:$fs))]> {
+ MSAPseudo<(outs RCWD:$wd), (ins RCWS:$fs),
+ [(set RCWD:$wd, (OpNode RCWS:$fs))]> {
let usesCustomInserter = 1;
}
@@ -1398,9 +1415,9 @@ class MSA_3R_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWS:$ws, GPR32:$rt);
+ dag InOperandList = (ins ROWS:$ws, GPR32Opnd:$rt);
string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
- list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, GPR32:$rt))];
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, GPR32Opnd:$rt))];
InstrItinClass Itinerary = itin;
}
@@ -1421,10 +1438,12 @@ class MSA_3R_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWS:$ws, GPR32:$rt);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, GPR32Opnd:$rt);
string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
- list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, GPR32:$rt))];
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+ GPR32Opnd:$rt))];
InstrItinClass Itinerary = itin;
+ string Constraints = "$wd = $wd_in";
}
class MSA_3R_4R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -1434,8 +1453,8 @@ class MSA_3R_4R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
dag OutOperandList = (outs ROWD:$wd);
dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
- list<dag> Pattern = [(set ROWD:$wd,
- (OpNode ROWD:$wd_in, ROWS:$ws, ROWT:$wt))];
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+ ROWT:$wt))];
InstrItinClass Itinerary = itin;
string Constraints = "$wd = $wd_in";
}
@@ -1479,8 +1498,8 @@ class MSA_INSERT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
class MSA_INSERT_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
RegisterOperand ROWD, RegisterOperand ROFS> :
- MipsPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, uimm6:$n, ROFS:$fs),
- [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs,
+ MSAPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, uimm6:$n, ROFS:$fs),
+ [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs,
immZExt6:$n))]> {
bit usesCustomInserter = 1;
string Constraints = "$wd = $wd_in";
@@ -1490,11 +1509,12 @@ class MSA_INSVE_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs ROWD:$wd);
- dag InOperandList = (ins ROWD:$wd_in, uimm6:$n, ROWS:$ws);
- string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[0]");
+ dag InOperandList = (ins ROWD:$wd_in, uimm6:$n, ROWS:$ws, uimmz:$n2);
+ string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[$n2]");
list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in,
immZExt6:$n,
- ROWS:$ws))];
+ ROWS:$ws,
+ immz:$n2))];
InstrItinClass Itinerary = itin;
string Constraints = "$wd = $wd_in";
}
@@ -1525,8 +1545,8 @@ class MSA_ELM_SPLAT_DESC_BASE<string instr_asm, SplatComplexPattern SplatImm,
class MSA_VEC_PSEUDO_BASE<SDPatternOperator OpNode, RegisterOperand ROWD,
RegisterOperand ROWS = ROWD,
RegisterOperand ROWT = ROWD> :
- MipsPseudo<(outs ROWD:$wd), (ins ROWS:$ws, ROWT:$wt),
- [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))]>;
+ MSAPseudo<(outs ROWD:$wd), (ins ROWS:$ws, ROWT:$wt),
+ [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))]>;
class ADD_A_B_DESC : MSA_3R_DESC_BASE<"add_a.b", int_mips_add_a_b, MSA128BOpnd>,
IsCommutable;
@@ -1735,10 +1755,14 @@ class BNEG_H_DESC : MSA_3R_DESC_BASE<"bneg.h", vbneg_h, MSA128HOpnd>;
class BNEG_W_DESC : MSA_3R_DESC_BASE<"bneg.w", vbneg_w, MSA128WOpnd>;
class BNEG_D_DESC : MSA_3R_DESC_BASE<"bneg.d", vbneg_d, MSA128DOpnd>;
-class BNEGI_B_DESC : MSA_BIT_B_DESC_BASE<"bnegi.b", xor, vsplat_uimm_pow2, MSA128BOpnd>;
-class BNEGI_H_DESC : MSA_BIT_H_DESC_BASE<"bnegi.h", xor, vsplat_uimm_pow2, MSA128HOpnd>;
-class BNEGI_W_DESC : MSA_BIT_W_DESC_BASE<"bnegi.w", xor, vsplat_uimm_pow2, MSA128WOpnd>;
-class BNEGI_D_DESC : MSA_BIT_D_DESC_BASE<"bnegi.d", xor, vsplat_uimm_pow2, MSA128DOpnd>;
+class BNEGI_B_DESC : MSA_BIT_B_DESC_BASE<"bnegi.b", xor, vsplat_uimm_pow2,
+ MSA128BOpnd>;
+class BNEGI_H_DESC : MSA_BIT_H_DESC_BASE<"bnegi.h", xor, vsplat_uimm_pow2,
+ MSA128HOpnd>;
+class BNEGI_W_DESC : MSA_BIT_W_DESC_BASE<"bnegi.w", xor, vsplat_uimm_pow2,
+ MSA128WOpnd>;
+class BNEGI_D_DESC : MSA_BIT_D_DESC_BASE<"bnegi.d", xor, vsplat_uimm_pow2,
+ MSA128DOpnd>;
class BNZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bnz.b", MSA128BOpnd>;
class BNZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bnz.h", MSA128HOpnd>;
@@ -1752,9 +1776,13 @@ class BSEL_V_DESC {
dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
MSA128BOpnd:$wt);
string AsmString = "bsel.v\t$wd, $ws, $wt";
+ // Note that vselect and BSEL_V treat the condition operand the opposite way
+ // from each other.
+ // (vselect cond, if_set, if_clear)
+ // (BSEL_V cond, if_clear, if_set)
list<dag> Pattern = [(set MSA128BOpnd:$wd,
- (vselect MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
- MSA128BOpnd:$wt))];
+ (vselect MSA128BOpnd:$wd_in, MSA128BOpnd:$wt,
+ MSA128BOpnd:$ws))];
InstrItinClass Itinerary = NoItinerary;
string Constraints = "$wd = $wd_in";
}
@@ -1764,9 +1792,13 @@ class BSELI_B_DESC {
dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
vsplat_uimm8:$u8);
string AsmString = "bseli.b\t$wd, $ws, $u8";
+ // Note that vselect and BSEL_V treat the condition operand the opposite way
+ // from each other.
+ // (vselect cond, if_set, if_clear)
+ // (BSEL_V cond, if_clear, if_set)
list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wd_in,
- MSA128BOpnd:$ws,
- vsplati8_uimm8:$u8))];
+ vsplati8_uimm8:$u8,
+ MSA128BOpnd:$ws))];
InstrItinClass Itinerary = NoItinerary;
string Constraints = "$wd = $wd_in";
}
@@ -1880,6 +1912,8 @@ class COPY_S_H_DESC : MSA_COPY_DESC_BASE<"copy_s.h", vextract_sext_i16, v8i16,
GPR32Opnd, MSA128HOpnd>;
class COPY_S_W_DESC : MSA_COPY_DESC_BASE<"copy_s.w", vextract_sext_i32, v4i32,
GPR32Opnd, MSA128WOpnd>;
+class COPY_S_D_DESC : MSA_COPY_DESC_BASE<"copy_s.d", vextract_sext_i64, v2i64,
+ GPR64Opnd, MSA128DOpnd>;
class COPY_U_B_DESC : MSA_COPY_DESC_BASE<"copy_u.b", vextract_zext_i8, v16i8,
GPR32Opnd, MSA128BOpnd>;
@@ -1887,6 +1921,8 @@ class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16,
GPR32Opnd, MSA128HOpnd>;
class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32,
GPR32Opnd, MSA128WOpnd>;
+class COPY_U_D_DESC : MSA_COPY_DESC_BASE<"copy_u.d", vextract_zext_i64, v2i64,
+ GPR64Opnd, MSA128DOpnd>;
class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32, FGR32,
MSA128W>;
@@ -2047,11 +2083,11 @@ class FEXP2_W_DESC : MSA_3RF_DESC_BASE<"fexp2.w", mul_fexp2, MSA128WOpnd>;
class FEXP2_D_DESC : MSA_3RF_DESC_BASE<"fexp2.d", mul_fexp2, MSA128DOpnd>;
let usesCustomInserter = 1 in {
class FEXP2_W_1_PSEUDO_DESC :
- MipsPseudo<(outs MSA128W:$wd), (ins MSA128W:$ws),
- [(set MSA128W:$wd, (fexp2 MSA128W:$ws))]>;
+ MSAPseudo<(outs MSA128W:$wd), (ins MSA128W:$ws),
+ [(set MSA128W:$wd, (fexp2 MSA128W:$ws))]>;
class FEXP2_D_1_PSEUDO_DESC :
- MipsPseudo<(outs MSA128D:$wd), (ins MSA128D:$ws),
- [(set MSA128D:$wd, (fexp2 MSA128D:$ws))]>;
+ MSAPseudo<(outs MSA128D:$wd), (ins MSA128D:$ws),
+ [(set MSA128D:$wd, (fexp2 MSA128D:$ws))]>;
}
class FEXUPL_W_DESC : MSA_2RF_DESC_BASE<"fexupl.w", int_mips_fexupl_w,
@@ -2086,6 +2122,8 @@ class FILL_H_DESC : MSA_2R_FILL_DESC_BASE<"fill.h", v8i16, vsplati16,
MSA128HOpnd, GPR32Opnd>;
class FILL_W_DESC : MSA_2R_FILL_DESC_BASE<"fill.w", v4i32, vsplati32,
MSA128WOpnd, GPR32Opnd>;
+class FILL_D_DESC : MSA_2R_FILL_DESC_BASE<"fill.d", v2i64, vsplati64,
+ MSA128DOpnd, GPR64Opnd>;
class FILL_FW_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v4f32, vsplatf32, MSA128W,
FGR32>;
@@ -2259,24 +2297,26 @@ class INSERT_H_DESC : MSA_INSERT_DESC_BASE<"insert.h", vinsert_v8i16,
MSA128HOpnd, GPR32Opnd>;
class INSERT_W_DESC : MSA_INSERT_DESC_BASE<"insert.w", vinsert_v4i32,
MSA128WOpnd, GPR32Opnd>;
+class INSERT_D_DESC : MSA_INSERT_DESC_BASE<"insert.d", vinsert_v2i64,
+ MSA128DOpnd, GPR64Opnd>;
class INSERT_FW_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v4f32,
MSA128WOpnd, FGR32Opnd>;
class INSERT_FD_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v2f64,
MSA128DOpnd, FGR64Opnd>;
-class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", int_mips_insve_b,
+class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8,
MSA128BOpnd>;
-class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", int_mips_insve_h,
+class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16,
MSA128HOpnd>;
-class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", int_mips_insve_w,
+class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32,
MSA128WOpnd>;
-class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", int_mips_insve_d,
+class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64,
MSA128DOpnd>;
class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
ValueType TyNode, RegisterOperand ROWD,
- Operand MemOpnd = mem, ComplexPattern Addr = addrRegImm,
+ Operand MemOpnd = mem_msa, ComplexPattern Addr = addrimm10,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs ROWD:$wd);
dag InOperandList = (ins MemOpnd:$addr);
@@ -2296,16 +2336,21 @@ class LDI_H_DESC : MSA_I10_LDI_DESC_BASE<"ldi.h", MSA128HOpnd>;
class LDI_W_DESC : MSA_I10_LDI_DESC_BASE<"ldi.w", MSA128WOpnd>;
class LDI_D_DESC : MSA_I10_LDI_DESC_BASE<"ldi.d", MSA128DOpnd>;
-class LSA_DESC {
- dag OutOperandList = (outs GPR32Opnd:$rd);
- dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, LSAImm:$sa);
- string AsmString = "lsa\t$rd, $rs, $rt, $sa";
- list<dag> Pattern = [(set GPR32Opnd:$rd, (add GPR32Opnd:$rs,
- (shl GPR32Opnd:$rt,
+class LSA_DESC_BASE<string instr_asm, RegisterOperand RORD,
+ RegisterOperand RORS = RORD, RegisterOperand RORT = RORD,
+ InstrItinClass itin = NoItinerary > {
+ dag OutOperandList = (outs RORD:$rd);
+ dag InOperandList = (ins RORS:$rs, RORT:$rt, LSAImm:$sa);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $sa");
+ list<dag> Pattern = [(set RORD:$rd, (add RORT:$rt,
+ (shl RORS:$rs,
immZExt2Lsa:$sa)))];
- InstrItinClass Itinerary = NoItinerary;
+ InstrItinClass Itinerary = itin;
}
+class LSA_DESC : LSA_DESC_BASE<"lsa", GPR32Opnd>;
+class DLSA_DESC : LSA_DESC_BASE<"dlsa", GPR64Opnd>;
+
class MADD_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.h", int_mips_madd_q_h,
MSA128HOpnd>;
class MADD_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.w", int_mips_madd_q_w,
@@ -2502,10 +2547,14 @@ class SLD_H_DESC : MSA_3R_SLD_DESC_BASE<"sld.h", int_mips_sld_h, MSA128HOpnd>;
class SLD_W_DESC : MSA_3R_SLD_DESC_BASE<"sld.w", int_mips_sld_w, MSA128WOpnd>;
class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>;
-class SLDI_B_DESC : MSA_ELM_DESC_BASE<"sldi.b", int_mips_sldi_b, MSA128BOpnd>;
-class SLDI_H_DESC : MSA_ELM_DESC_BASE<"sldi.h", int_mips_sldi_h, MSA128HOpnd>;
-class SLDI_W_DESC : MSA_ELM_DESC_BASE<"sldi.w", int_mips_sldi_w, MSA128WOpnd>;
-class SLDI_D_DESC : MSA_ELM_DESC_BASE<"sldi.d", int_mips_sldi_d, MSA128DOpnd>;
+class SLDI_B_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.b", int_mips_sldi_b,
+ MSA128BOpnd>;
+class SLDI_H_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.h", int_mips_sldi_h,
+ MSA128HOpnd>;
+class SLDI_W_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.w", int_mips_sldi_w,
+ MSA128WOpnd>;
+class SLDI_D_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.d", int_mips_sldi_d,
+ MSA128DOpnd>;
class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>;
class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>;
@@ -2597,7 +2646,7 @@ class SRLRI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srlri.d", int_mips_srlri_d,
class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
ValueType TyNode, RegisterOperand ROWD,
- Operand MemOpnd = mem, ComplexPattern Addr = addrRegImm,
+ Operand MemOpnd = mem_msa, ComplexPattern Addr = addrimm10,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs);
dag InOperandList = (ins ROWD:$wd, MemOpnd:$addr);
@@ -2810,8 +2859,12 @@ def BNZ_V : BNZ_V_ENC, BNZ_V_DESC;
def BSEL_V : BSEL_V_ENC, BSEL_V_DESC;
class MSA_BSEL_PSEUDO_BASE<RegisterOperand RO, ValueType Ty> :
- MipsPseudo<(outs RO:$wd), (ins RO:$wd_in, RO:$ws, RO:$wt),
- [(set RO:$wd, (Ty (vselect RO:$wd_in, RO:$ws, RO:$wt)))]>,
+ MSAPseudo<(outs RO:$wd), (ins RO:$wd_in, RO:$ws, RO:$wt),
+ [(set RO:$wd, (Ty (vselect RO:$wd_in, RO:$wt, RO:$ws)))]>,
+ // Note that vselect and BSEL_V treat the condition operand the opposite way
+ // from each other.
+ // (vselect cond, if_set, if_clear)
+ // (BSEL_V cond, if_clear, if_set)
PseudoInstExpansion<(BSEL_V MSA128BOpnd:$wd, MSA128BOpnd:$wd_in,
MSA128BOpnd:$ws, MSA128BOpnd:$wt)> {
let Constraints = "$wd_in = $wd";
@@ -2897,10 +2950,12 @@ def CLTI_U_D : CLTI_U_D_ENC, CLTI_U_D_DESC;
def COPY_S_B : COPY_S_B_ENC, COPY_S_B_DESC;
def COPY_S_H : COPY_S_H_ENC, COPY_S_H_DESC;
def COPY_S_W : COPY_S_W_ENC, COPY_S_W_DESC;
+def COPY_S_D : COPY_S_D_ENC, COPY_S_D_DESC;
def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC;
def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC;
def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC;
+def COPY_U_D : COPY_U_D_ENC, COPY_U_D_DESC;
def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC;
def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC;
@@ -3012,6 +3067,7 @@ def FFQR_D : FFQR_D_ENC, FFQR_D_DESC;
def FILL_B : FILL_B_ENC, FILL_B_DESC;
def FILL_H : FILL_H_ENC, FILL_H_DESC;
def FILL_W : FILL_W_ENC, FILL_W_DESC;
+def FILL_D : FILL_D_ENC, FILL_D_DESC;
def FILL_FW_PSEUDO : FILL_FW_PSEUDO_DESC;
def FILL_FD_PSEUDO : FILL_FD_PSEUDO_DESC;
@@ -3141,14 +3197,19 @@ def ILVR_D : ILVR_D_ENC, ILVR_D_DESC;
def INSERT_B : INSERT_B_ENC, INSERT_B_DESC;
def INSERT_H : INSERT_H_ENC, INSERT_H_DESC;
def INSERT_W : INSERT_W_ENC, INSERT_W_DESC;
+def INSERT_D : INSERT_D_ENC, INSERT_D_DESC;
// INSERT_FW_PSEUDO defined after INSVE_W
// INSERT_FD_PSEUDO defined after INSVE_D
-def INSVE_B : INSVE_B_ENC, INSVE_B_DESC;
-def INSVE_H : INSVE_H_ENC, INSVE_H_DESC;
-def INSVE_W : INSVE_W_ENC, INSVE_W_DESC;
-def INSVE_D : INSVE_D_ENC, INSVE_D_DESC;
+// There is a fourth operand that is not present in the encoding. Use a
+// custom decoder to get a chance to add it.
+let DecoderMethod = "DecodeINSVE_DF" in {
+ def INSVE_B : INSVE_B_ENC, INSVE_B_DESC;
+ def INSVE_H : INSVE_H_ENC, INSVE_H_DESC;
+ def INSVE_W : INSVE_W_ENC, INSVE_W_DESC;
+ def INSVE_D : INSVE_D_ENC, INSVE_D_DESC;
+}
def INSERT_FW_PSEUDO : INSERT_FW_PSEUDO_DESC;
def INSERT_FD_PSEUDO : INSERT_FD_PSEUDO_DESC;
@@ -3164,6 +3225,7 @@ def LDI_W : LDI_W_ENC, LDI_W_DESC;
def LDI_D : LDI_D_ENC, LDI_D_DESC;
def LSA : LSA_ENC, LSA_DESC;
+def DLSA : DLSA_ENC, DLSA_DESC;
def MADD_Q_H : MADD_Q_H_ENC, MADD_Q_H_DESC;
def MADD_Q_W : MADD_Q_W_ENC, MADD_Q_W_DESC;
@@ -3464,46 +3526,23 @@ class MSAPat<dag pattern, dag result, list<Predicate> pred = [HasMSA]> :
def : MSAPat<(extractelt (v4i32 MSA128W:$ws), immZExt4:$idx),
(COPY_S_W MSA128W:$ws, immZExt4:$idx)>;
-def : MSAPat<(v16i8 (load addr:$addr)), (LD_B addr:$addr)>;
-def : MSAPat<(v8i16 (load addr:$addr)), (LD_H addr:$addr)>;
-def : MSAPat<(v4i32 (load addr:$addr)), (LD_W addr:$addr)>;
-def : MSAPat<(v2i64 (load addr:$addr)), (LD_D addr:$addr)>;
-def : MSAPat<(v8f16 (load addr:$addr)), (LD_H addr:$addr)>;
-def : MSAPat<(v4f32 (load addr:$addr)), (LD_W addr:$addr)>;
-def : MSAPat<(v2f64 (load addr:$addr)), (LD_D addr:$addr)>;
-
-def : MSAPat<(v8f16 (load addrRegImm:$addr)), (LD_H addrRegImm:$addr)>;
-def : MSAPat<(v4f32 (load addrRegImm:$addr)), (LD_W addrRegImm:$addr)>;
-def : MSAPat<(v2f64 (load addrRegImm:$addr)), (LD_D addrRegImm:$addr)>;
-
-def : MSAPat<(store (v16i8 MSA128B:$ws), addr:$addr),
- (ST_B MSA128B:$ws, addr:$addr)>;
-def : MSAPat<(store (v8i16 MSA128H:$ws), addr:$addr),
- (ST_H MSA128H:$ws, addr:$addr)>;
-def : MSAPat<(store (v4i32 MSA128W:$ws), addr:$addr),
- (ST_W MSA128W:$ws, addr:$addr)>;
-def : MSAPat<(store (v2i64 MSA128D:$ws), addr:$addr),
- (ST_D MSA128D:$ws, addr:$addr)>;
-def : MSAPat<(store (v8f16 MSA128H:$ws), addr:$addr),
- (ST_H MSA128H:$ws, addr:$addr)>;
-def : MSAPat<(store (v4f32 MSA128W:$ws), addr:$addr),
- (ST_W MSA128W:$ws, addr:$addr)>;
-def : MSAPat<(store (v2f64 MSA128D:$ws), addr:$addr),
- (ST_D MSA128D:$ws, addr:$addr)>;
-
-def ST_FH : MSAPat<(store (v8f16 MSA128H:$ws), addrRegImm:$addr),
- (ST_H MSA128H:$ws, addrRegImm:$addr)>;
-def ST_FW : MSAPat<(store (v4f32 MSA128W:$ws), addrRegImm:$addr),
- (ST_W MSA128W:$ws, addrRegImm:$addr)>;
-def ST_FD : MSAPat<(store (v2f64 MSA128D:$ws), addrRegImm:$addr),
- (ST_D MSA128D:$ws, addrRegImm:$addr)>;
+def : MSAPat<(v8f16 (load addrimm10:$addr)), (LD_H addrimm10:$addr)>;
+def : MSAPat<(v4f32 (load addrimm10:$addr)), (LD_W addrimm10:$addr)>;
+def : MSAPat<(v2f64 (load addrimm10:$addr)), (LD_D addrimm10:$addr)>;
+
+def ST_FH : MSAPat<(store (v8f16 MSA128H:$ws), addrimm10:$addr),
+ (ST_H MSA128H:$ws, addrimm10:$addr)>;
+def ST_FW : MSAPat<(store (v4f32 MSA128W:$ws), addrimm10:$addr),
+ (ST_W MSA128W:$ws, addrimm10:$addr)>;
+def ST_FD : MSAPat<(store (v2f64 MSA128D:$ws), addrimm10:$addr),
+ (ST_D MSA128D:$ws, addrimm10:$addr)>;
class MSA_FABS_PSEUDO_DESC_BASE<RegisterOperand ROWD,
RegisterOperand ROWS = ROWD,
InstrItinClass itin = NoItinerary> :
- MipsPseudo<(outs ROWD:$wd),
- (ins ROWS:$ws),
- [(set ROWD:$wd, (fabs ROWS:$ws))]> {
+ MSAPseudo<(outs ROWD:$wd),
+ (ins ROWS:$ws),
+ [(set ROWD:$wd, (fabs ROWS:$ws))]> {
InstrItinClass Itinerary = itin;
}
def FABS_W : MSA_FABS_PSEUDO_DESC_BASE<MSA128WOpnd>,
@@ -3518,7 +3557,7 @@ class MSABitconvertPat<ValueType DstVT, ValueType SrcVT,
MSAPat<(DstVT (bitconvert SrcVT:$src)),
(COPY_TO_REGCLASS SrcVT:$src, DstRC), preds>;
-// These are endian-independant because the element size doesnt change
+// These are endian-independent because the element size doesn't change
def : MSABitconvertPat<v8i16, v8f16, MSA128H>;
def : MSABitconvertPat<v4i32, v4f32, MSA128W>;
def : MSABitconvertPat<v2i64, v2f64, MSA128D>;
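
[Editorial note, not part of the patch] The operand-order note repeated in the BINSXI, BSEL_V, BSELI and BSEL pseudo descriptions above is easy to get backwards. A small scalar model in C++ (names are illustrative, not from the patch) shows why the two data sources are swapped when one form is rewritten as the other:

#include <cassert>
#include <cstdint>

// vselect takes (cond, if_set, if_clear); BSEL_V takes (cond, if_clear, if_set).
static uint8_t vselect(uint8_t cond, uint8_t if_set, uint8_t if_clear) {
  return (cond & if_set) | (~cond & if_clear);
}
static uint8_t bsel_v(uint8_t cond, uint8_t if_clear, uint8_t if_set) {
  return (cond & if_set) | (~cond & if_clear);
}

int main() {
  uint8_t c = 0b10101010, a = 0xFF, b = 0x00;
  // The results agree only when the last two operands are swapped.
  assert(vselect(c, a, b) == bsel_v(c, b, a));
  return 0;
}
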
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index 43bf682..3e14c8c 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -14,16 +14,19 @@
#ifndef MIPS_MACHINE_FUNCTION_INFO_H
#define MIPS_MACHINE_FUNCTION_INFO_H
+#include "Mips16HardFloatInfo.h"
#include "MipsSubtarget.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/ValueMap.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/ValueMap.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
+#include <map>
+#include <string>
#include <utility>
namespace llvm {
@@ -50,10 +53,9 @@ private:
/// Mips target-specific information for each MachineFunction.
class MipsFunctionInfo : public MachineFunctionInfo {
public:
- MipsFunctionInfo(MachineFunction& MF)
- : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0),
- VarArgsFrameIndex(0), CallsEhReturn(false)
- {}
+ MipsFunctionInfo(MachineFunction &MF)
+ : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0),
+ VarArgsFrameIndex(0), CallsEhReturn(false), SaveS2(false) {}
~MipsFunctionInfo();
@@ -92,6 +94,12 @@ public:
/// representing a GOT entry for a global function.
MachinePointerInfo callPtrInfo(const GlobalValue *Val);
+ void setSaveS2() { SaveS2 = true; }
+ bool hasSaveS2() const { return SaveS2; }
+
+ std::map<const char *, const llvm::Mips16HardFloatInfo::FuncSignature *>
+ StubsNeeded;
+
private:
virtual void anchor();
@@ -126,6 +134,9 @@ private:
/// Frame objects for spilling eh data registers.
int EhDataRegFI[4];
+ // saveS2
+ bool SaveS2;
+
/// MipsCallEntry maps.
StringMap<const MipsCallEntry *> ExternalCallEntries;
ValueMap<const GlobalValue *, const MipsCallEntry *> GlobalCallEntries;
diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
new file mode 100644
index 0000000..db270f3
--- /dev/null
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -0,0 +1,297 @@
+//===--------- MipsOptimizePICCall.cpp - Optimize PIC Calls ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates unnecessary instructions that set up $gp and replaces
+// instructions that load target function addresses with copy instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "optimize-mips-pic-call"
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+static cl::opt<bool> LoadTargetFromGOT("mips-load-target-from-got",
+ cl::init(true),
+ cl::desc("Load target address from GOT"),
+ cl::Hidden);
+
+static cl::opt<bool> EraseGPOpnd("mips-erase-gp-opnd",
+ cl::init(true), cl::desc("Erase GP Operand"),
+ cl::Hidden);
+
+namespace {
+typedef std::pair<unsigned, unsigned> CntRegP;
+typedef RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<const Value *, CntRegP> >
+AllocatorTy;
+typedef ScopedHashTable<const Value *, CntRegP, DenseMapInfo<const Value *>,
+ AllocatorTy> ScopedHTType;
+
+class MBBInfo {
+public:
+ MBBInfo(MachineDomTreeNode *N);
+ const MachineDomTreeNode *getNode() const;
+ bool isVisited() const;
+ void preVisit(ScopedHTType &ScopedHT);
+ void postVisit();
+
+private:
+ MachineDomTreeNode *Node;
+ ScopedHTType::ScopeTy *HTScope;
+};
+
+class OptimizePICCall : public MachineFunctionPass {
+public:
+ OptimizePICCall(TargetMachine &tm) : MachineFunctionPass(ID) {}
+
+ virtual const char *getPassName() const { return "Mips OptimizePICCall"; }
+
+ bool runOnMachineFunction(MachineFunction &F);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ /// \brief Visit MBB.
+ bool visitNode(MBBInfo &MBBI);
+
+ /// \brief Test if MI jumps to a function via a register.
+ ///
+ /// Also, return the virtual register containing the target function's address
+ /// and the underlying object in Reg and Val respectively, if the function's
+ /// address can be resolved lazily.
+ bool isCallViaRegister(MachineInstr &MI, unsigned &Reg,
+ const Value *&Val) const;
+
+ /// \brief Return the number of instructions that dominate the current
+ /// instruction and load the function address from object Entry.
+ unsigned getCount(const Value *Entry);
+
+ /// \brief Return the destination virtual register of the last instruction
+ /// that loads from object Entry.
+ unsigned getReg(const Value *Entry);
+
+ /// \brief Update ScopedHT.
+ void incCntAndSetReg(const Value *Entry, unsigned Reg);
+
+ ScopedHTType ScopedHT;
+ static char ID;
+};
+
+char OptimizePICCall::ID = 0;
+} // end of anonymous namespace
+
+/// Return the first MachineOperand of MI if it is a used virtual register.
+static MachineOperand *getCallTargetRegOpnd(MachineInstr &MI) {
+ if (MI.getNumOperands() == 0)
+ return 0;
+
+ MachineOperand &MO = MI.getOperand(0);
+
+ if (!MO.isReg() || !MO.isUse() ||
+ !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ return 0;
+
+ return &MO;
+}
+
+/// Return type of register Reg.
+static MVT::SimpleValueType getRegTy(unsigned Reg, MachineFunction &MF) {
+ const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(Reg);
+ assert(RC->vt_end() - RC->vt_begin() == 1);
+ return *RC->vt_begin();
+}
+
+/// Do the following transformation:
+///
+/// jalr $vreg
+/// =>
+/// copy $t9, $vreg
+/// jalr $t9
+static void setCallTargetReg(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I) {
+ MachineFunction &MF = *MBB->getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ unsigned SrcReg = I->getOperand(0).getReg();
+ unsigned DstReg = getRegTy(SrcReg, MF) == MVT::i32 ? Mips::T9 : Mips::T9_64;
+ BuildMI(*MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), DstReg)
+ .addReg(SrcReg);
+ I->getOperand(0).setReg(DstReg);
+}
+
+/// Search MI's operands for register GP and erase it.
+static void eraseGPOpnd(MachineInstr &MI) {
+ if (!EraseGPOpnd)
+ return;
+
+ MachineFunction &MF = *MI.getParent()->getParent();
+ MVT::SimpleValueType Ty = getRegTy(MI.getOperand(0).getReg(), MF);
+ unsigned Reg = Ty == MVT::i32 ? Mips::GP : Mips::GP_64;
+
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+ if (MO.isReg() && MO.getReg() == Reg) {
+ MI.RemoveOperand(I);
+ return;
+ }
+ }
+
+ llvm_unreachable(0);
+}
+
+MBBInfo::MBBInfo(MachineDomTreeNode *N) : Node(N), HTScope(0) {}
+
+const MachineDomTreeNode *MBBInfo::getNode() const { return Node; }
+
+bool MBBInfo::isVisited() const { return HTScope; }
+
+void MBBInfo::preVisit(ScopedHTType &ScopedHT) {
+ HTScope = new ScopedHTType::ScopeTy(ScopedHT);
+}
+
+void MBBInfo::postVisit() {
+ delete HTScope;
+}
+
+// OptimizePICCall methods.
+bool OptimizePICCall::runOnMachineFunction(MachineFunction &F) {
+ if (F.getTarget().getSubtarget<MipsSubtarget>().inMips16Mode())
+ return false;
+
+ // Do a pre-order traversal of the dominator tree.
+ MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+ bool Changed = false;
+
+ SmallVector<MBBInfo, 8> WorkList(1, MBBInfo(MDT->getRootNode()));
+
+ while (!WorkList.empty()) {
+ MBBInfo &MBBI = WorkList.back();
+
+ // If this MBB has already been visited, destroy the scope for the MBB and
+ // pop it from the work list.
+ if (MBBI.isVisited()) {
+ MBBI.postVisit();
+ WorkList.pop_back();
+ continue;
+ }
+
+ // Visit the MBB and add its children to the work list.
+ MBBI.preVisit(ScopedHT);
+ Changed |= visitNode(MBBI);
+ const MachineDomTreeNode *Node = MBBI.getNode();
+ const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
+ WorkList.append(Children.begin(), Children.end());
+ }
+
+ return Changed;
+}
+
+bool OptimizePICCall::visitNode(MBBInfo &MBBI) {
+ bool Changed = false;
+ MachineBasicBlock *MBB = MBBI.getNode()->getBlock();
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+ ++I) {
+ unsigned Reg;
+ const Value *Entry;
+
+ // Skip instructions that are not call instructions via registers.
+ if (!isCallViaRegister(*I, Reg, Entry))
+ continue;
+
+ Changed = true;
+ unsigned N = getCount(Entry);
+
+ if (N != 0) {
+ // If a function has been called more than twice, we do not have to emit a
+ // load instruction to get the function address from the GOT, but can
+ // instead reuse the address that has been loaded before.
+ if (N >= 2 && !LoadTargetFromGOT)
+ getCallTargetRegOpnd(*I)->setReg(getReg(Entry));
+
+ // Erase the $gp operand if this isn't the first time a function has
+ // been called. $gp needs to be set up only if the function call can go
+ // through a lazy binding stub.
+ eraseGPOpnd(*I);
+ }
+
+ if (Entry)
+ incCntAndSetReg(Entry, Reg);
+
+ setCallTargetReg(MBB, I);
+ }
+
+ return Changed;
+}
+
+bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg,
+ const Value *&Val) const {
+ if (!MI.isCall())
+ return false;
+
+ MachineOperand *MO = getCallTargetRegOpnd(MI);
+
+ // Return if MI is not a function call via a register.
+ if (!MO)
+ return false;
+
+ // Get the instruction that loads the function address from the GOT.
+ Reg = MO->getReg();
+ Val = 0;
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineInstr *DefMI = MRI.getVRegDef(Reg);
+
+ assert(DefMI);
+
+ // See if DefMI is an instruction that loads from a GOT entry that holds the
+ // address of a lazy binding stub.
+ if (!DefMI->mayLoad() || DefMI->getNumOperands() < 3)
+ return true;
+
+ unsigned Flags = DefMI->getOperand(2).getTargetFlags();
+
+ if (Flags != MipsII::MO_GOT_CALL && Flags != MipsII::MO_CALL_LO16)
+ return true;
+
+ // Return the underlying object for the GOT entry in Val.
+ assert(DefMI->hasOneMemOperand());
+ Val = (*DefMI->memoperands_begin())->getValue();
+ return true;
+}
+
+unsigned OptimizePICCall::getCount(const Value *Entry) {
+ return ScopedHT.lookup(Entry).first;
+}
+
+unsigned OptimizePICCall::getReg(const Value *Entry) {
+ unsigned Reg = ScopedHT.lookup(Entry).second;
+ assert(Reg);
+ return Reg;
+}
+
+void OptimizePICCall::incCntAndSetReg(const Value *Entry, unsigned Reg) {
+ CntRegP P = ScopedHT.lookup(Entry);
+ ScopedHT.insert(Entry, std::make_pair(P.first + 1, Reg));
+}
+
+/// Return an OptimizePICCall object.
+FunctionPass *llvm::createMipsOptimizePICCallPass(MipsTargetMachine &TM) {
+ return new OptimizePICCall(TM);
+}
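
[Editorial note, not part of the patch] The new pass keys a ScopedHashTable by the GOT entry's underlying Value and opens one scope per dominator-tree node, so a cached call-target register is only reused in blocks dominated by the block that loaded it. The self-contained sketch below (hypothetical types, not the pass itself) models the same idea with an explicit snapshot per node:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in for the dominator-tree nodes walked by the pass.
struct Node {
  std::string Callee;          // call target "loaded" in this block
  std::vector<Node> Children;  // dominated blocks
};

// Counts recorded while visiting a node stay visible in its children and are
// dropped when the walk returns, mimicking ScopedHashTable scopes.
static void visit(const Node &N, std::map<std::string, unsigned> &Counts) {
  std::map<std::string, unsigned> Saved = Counts;  // enter scope
  unsigned Seen = Counts[N.Callee]++;
  std::printf("%s: %u dominating load(s), %s\n", N.Callee.c_str(), Seen,
              Seen ? "reuse register" : "load from GOT");
  for (const Node &C : N.Children)
    visit(C, Counts);
  Counts = Saved;                                  // leave scope
}

int main() {
  // f dominates a second call to f (reuse) and a call to g (fresh load).
  Node Root{"f", {Node{"f", {}}, Node{"g", {}}}};
  std::map<std::string, unsigned> Counts;
  visit(Root, Counts);
  return 0;
}
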
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 3105b02..d7fc93b 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -24,9 +24,9 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/DebugInfo.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -133,6 +133,13 @@ getReservedRegs(const MachineFunction &MF) const {
for (unsigned I = 0; I < array_lengthof(ReservedGPR32); ++I)
Reserved.set(ReservedGPR32[I]);
+ // Reserve registers for the NaCl sandbox.
+ if (Subtarget.isTargetNaCl()) {
+ Reserved.set(Mips::T6); // Reserved for control flow mask.
+ Reserved.set(Mips::T7); // Reserved for memory access mask.
+ Reserved.set(Mips::T8); // Reserved for thread pointer.
+ }
+
for (unsigned I = 0; I < array_lengthof(ReservedGPR64); ++I)
Reserved.set(ReservedGPR64[I]);
@@ -179,10 +186,13 @@ getReservedRegs(const MachineFunction &MF) const {
// Reserve RA if in mips16 mode.
if (Subtarget.inMips16Mode()) {
+ const MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
Reserved.set(Mips::RA);
Reserved.set(Mips::RA_64);
Reserved.set(Mips::T0);
Reserved.set(Mips::T1);
+ if (MF.getFunction()->hasFnAttribute("saveS2") || MipsFI->hasSaveS2())
+ Reserved.set(Mips::S2);
}
// Reserve GP if small section is used.
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 3173d09..834e6c5 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -209,7 +209,8 @@ let Namespace = "Mips" in {
def PC : Register<"pc">;
// Hardware register $29
- def HWR29 : MipsReg<29, "29">;
+ foreach I = 0-31 in
+ def HWR#I : MipsReg<#I, ""#I>;
// Accum registers
foreach I = 0-3 in
@@ -245,6 +246,15 @@ let Namespace = "Mips" in {
def MSARequest : MipsReg<5, "5">;
def MSAMap : MipsReg<6, "6">;
def MSAUnmap : MipsReg<7, "7">;
+
+ // Octeon multiplier and product registers
+ def MPL0 : MipsReg<0, "mpl0">;
+ def MPL1 : MipsReg<1, "mpl1">;
+ def MPL2 : MipsReg<2, "mpl2">;
+ def P0 : MipsReg<0, "p0">;
+ def P1 : MipsReg<1, "p1">;
+ def P2 : MipsReg<2, "p2">;
+
}
//===----------------------------------------------------------------------===//
@@ -355,7 +365,8 @@ def LO64 : RegisterClass<"Mips", [i64], 64, (add LO0_64)>;
def HI64 : RegisterClass<"Mips", [i64], 64, (add HI0_64)>;
// Hardware registers
-def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>, Unallocatable;
+def HWRegs : RegisterClass<"Mips", [i32], 32, (sequence "HWR%u", 0, 31)>,
+ Unallocatable;
// Accumulator Registers
def ACC64 : RegisterClass<"Mips", [untyped], 64, (add AC0)> {
@@ -376,89 +387,77 @@ def DSPCC : RegisterClass<"Mips", [v4i8, v2i16], 32, (add DSPCCond)>;
def COP2 : RegisterClass<"Mips", [i32], 32, (sequence "COP2%u", 0, 31)>,
Unallocatable;
+// Octeon multiplier and product registers
+def OCTEON_MPL : RegisterClass<"Mips", [i64], 64, (add MPL0, MPL1, MPL2)>,
+ Unallocatable;
+def OCTEON_P : RegisterClass<"Mips", [i64], 64, (add P0, P1, P2)>,
+ Unallocatable;
+
// Register Operands.
class MipsAsmRegOperand : AsmOperandClass {
- let RenderMethod = "addRegAsmOperands";
-}
-def GPR32AsmOperand : MipsAsmRegOperand {
- let Name = "GPR32Asm";
- let ParserMethod = "parseGPR32";
+ let ParserMethod = "ParseAnyRegister";
}
def GPR64AsmOperand : MipsAsmRegOperand {
- let Name = "GPR64Asm";
- let ParserMethod = "parseGPR64";
+ let Name = "GPR64AsmReg";
+ let PredicateMethod = "isGPRAsmReg";
}
-def ACC64DSPAsmOperand : MipsAsmRegOperand {
- let Name = "ACC64DSPAsm";
- let ParserMethod = "parseACC64DSP";
+def GPR32AsmOperand : MipsAsmRegOperand {
+ let Name = "GPR32AsmReg";
+ let PredicateMethod = "isGPRAsmReg";
}
-def LO32DSPAsmOperand : MipsAsmRegOperand {
- let Name = "LO32DSPAsm";
- let ParserMethod = "parseLO32DSP";
+def ACC64DSPAsmOperand : MipsAsmRegOperand {
+ let Name = "ACC64DSPAsmReg";
+ let PredicateMethod = "isACCAsmReg";
}
def HI32DSPAsmOperand : MipsAsmRegOperand {
- let Name = "HI32DSPAsm";
- let ParserMethod = "parseHI32DSP";
+ let Name = "HI32DSPAsmReg";
+ let PredicateMethod = "isACCAsmReg";
+}
+
+def LO32DSPAsmOperand : MipsAsmRegOperand {
+ let Name = "LO32DSPAsmReg";
+ let PredicateMethod = "isACCAsmReg";
}
def CCRAsmOperand : MipsAsmRegOperand {
- let Name = "CCRAsm";
- let ParserMethod = "parseCCRRegs";
+ let Name = "CCRAsmReg";
}
def AFGR64AsmOperand : MipsAsmRegOperand {
- let Name = "AFGR64Asm";
- let ParserMethod = "parseAFGR64Regs";
+ let Name = "AFGR64AsmReg";
+ let PredicateMethod = "isFGRAsmReg";
}
def FGR64AsmOperand : MipsAsmRegOperand {
- let Name = "FGR64Asm";
- let ParserMethod = "parseFGR64Regs";
+ let Name = "FGR64AsmReg";
+ let PredicateMethod = "isFGRAsmReg";
}
def FGR32AsmOperand : MipsAsmRegOperand {
- let Name = "FGR32Asm";
- let ParserMethod = "parseFGR32Regs";
+ let Name = "FGR32AsmReg";
+ let PredicateMethod = "isFGRAsmReg";
}
def FGRH32AsmOperand : MipsAsmRegOperand {
- let Name = "FGRH32Asm";
- let ParserMethod = "parseFGRH32Regs";
+ let Name = "FGRH32AsmReg";
+ let PredicateMethod = "isFGRAsmReg";
}
def FCCRegsAsmOperand : MipsAsmRegOperand {
- let Name = "FCCRegsAsm";
- let ParserMethod = "parseFCCRegs";
-}
-
-def MSA128BAsmOperand : MipsAsmRegOperand {
- let Name = "MSA128BAsm";
- let ParserMethod = "parseMSA128BRegs";
-}
-
-def MSA128HAsmOperand : MipsAsmRegOperand {
- let Name = "MSA128HAsm";
- let ParserMethod = "parseMSA128HRegs";
-}
-
-def MSA128WAsmOperand : MipsAsmRegOperand {
- let Name = "MSA128WAsm";
- let ParserMethod = "parseMSA128WRegs";
+ let Name = "FCCAsmReg";
}
-def MSA128DAsmOperand : MipsAsmRegOperand {
- let Name = "MSA128DAsm";
- let ParserMethod = "parseMSA128DRegs";
+def MSA128AsmOperand : MipsAsmRegOperand {
+ let Name = "MSA128AsmReg";
}
-def MSA128CRAsmOperand : MipsAsmRegOperand {
- let Name = "MSA128CRAsm";
- let ParserMethod = "parseMSA128CtrlRegs";
+def MSACtrlAsmOperand : MipsAsmRegOperand {
+ let Name = "MSACtrlAsmReg";
}
def GPR32Opnd : RegisterOperand<GPR32> {
@@ -478,13 +477,11 @@ def CCROpnd : RegisterOperand<CCR> {
}
def HWRegsAsmOperand : MipsAsmRegOperand {
- let Name = "HWRegsAsm";
- let ParserMethod = "parseHWRegs";
+ let Name = "HWRegsAsmReg";
}
def COP2AsmOperand : MipsAsmRegOperand {
- let Name = "COP2Asm";
- let ParserMethod = "parseCOP2";
+ let Name = "COP2AsmReg";
}
def HWRegsOpnd : RegisterOperand<HWRegs> {
@@ -528,22 +525,22 @@ def COP2Opnd : RegisterOperand<COP2> {
}
def MSA128BOpnd : RegisterOperand<MSA128B> {
- let ParserMatchClass = MSA128BAsmOperand;
+ let ParserMatchClass = MSA128AsmOperand;
}
def MSA128HOpnd : RegisterOperand<MSA128H> {
- let ParserMatchClass = MSA128HAsmOperand;
+ let ParserMatchClass = MSA128AsmOperand;
}
def MSA128WOpnd : RegisterOperand<MSA128W> {
- let ParserMatchClass = MSA128WAsmOperand;
+ let ParserMatchClass = MSA128AsmOperand;
}
def MSA128DOpnd : RegisterOperand<MSA128D> {
- let ParserMatchClass = MSA128DAsmOperand;
+ let ParserMatchClass = MSA128AsmOperand;
}
def MSA128CROpnd : RegisterOperand<MSACtrl> {
- let ParserMatchClass = MSA128CRAsmOperand;
+ let ParserMatchClass = MSACtrlAsmOperand;
}
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 33ed4b3..0343a47 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -299,11 +299,10 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
TII.adjustStackPtr(SP, -StackSize, MBB, MBBI);
// emit ".cfi_def_cfa_offset StackSize"
- MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl,
- TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel);
- MMI.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(AdjustSPLabel, -StackSize));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
@@ -315,10 +314,6 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
// Iterate over list of callee-saved registers and emit .cfi_offset
// directives.
- MCSymbol *CSLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl,
- TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel);
-
for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
E = CSI.end(); I != E; ++I) {
int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
@@ -335,14 +330,21 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
if (!STI.isLittle())
std::swap(Reg0, Reg1);
- MMI.addFrameInst(
- MCCFIInstruction::createOffset(CSLabel, Reg0, Offset));
- MMI.addFrameInst(
- MCCFIInstruction::createOffset(CSLabel, Reg1, Offset + 4));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg0, Offset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg1, Offset + 4));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
} else {
// Reg is either in GPR32 or FGR32.
- MMI.addFrameInst(MCCFIInstruction::createOffset(
- CSLabel, MRI->getDwarfRegNum(Reg, 1), Offset));
+ unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(Reg, 1), Offset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
}
}
@@ -360,13 +362,13 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
}
// Emit .cfi_offset directives for eh data registers.
- MCSymbol *CSLabel2 = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl,
- TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel2);
for (int I = 0; I < 4; ++I) {
int64_t Offset = MFI->getObjectOffset(MipsFI->getEhDataRegFI(I));
unsigned Reg = MRI->getDwarfRegNum(ehDataReg(I), true);
- MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel2, Reg, Offset));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, Offset));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
}
@@ -376,11 +378,10 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO);
// emit ".cfi_def_cfa_register $fp"
- MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl,
- TII.get(TargetOpcode::PROLOG_LABEL)).addSym(SetFPLabel);
- MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
- SetFPLabel, MRI->getDwarfRegNum(FP, true)));
+ unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+ nullptr, MRI->getDwarfRegNum(FP, true)));
+ BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
}
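
The recurring change in this file is the switch from label-based CFI emission (a PROLOG_LABEL tied to a temporary MCSymbol) to index-based emission: each frame instruction is registered with MachineModuleInfo and a CFI_INSTRUCTION pseudo refers to it by index. A minimal sketch of the new pattern, using only the calls that appear in the hunks above and assuming the same includes as this file; the helper name is ours, not part of the patch:

// Illustrative helper: emit ".cfi_def_cfa_offset" without a prologue label.
static void emitDefCfaOffset(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MBBI, DebugLoc DL,
                             const TargetInstrInfo &TII,
                             MachineModuleInfo &MMI, int64_t StackSize) {
  // Register the frame instruction with MMI; nullptr replaces the old
  // temporary label, and the returned index identifies the instruction.
  unsigned CFIIndex = MMI.addFrameInst(
      MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
  // The CFI_INSTRUCTION pseudo carries that index as its only operand.
  BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
}
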
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 737660e..5b20a6c 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -13,8 +13,8 @@
#define DEBUG_TYPE "mips-isel"
#include "MipsSEISelDAGToDAG.h"
-#include "Mips.h"
#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
#include "MipsAnalyzeImmediate.h"
#include "MipsMachineFunction.h"
#include "MipsRegisterInfo.h"
@@ -24,11 +24,11 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
-#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -104,7 +104,7 @@ bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
// Replace uses with ZeroReg.
for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
E = MRI->use_end(); U != E;) {
- MachineOperand &MO = U.getOperand();
+ MachineOperand &MO = *U;
unsigned OpNo = U.getOperandNo();
MachineInstr *MI = MO.getParent();
++U;
@@ -248,18 +248,49 @@ SDNode *MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
SDValue(AddCarry, 0));
}
+/// Match frameindex
+bool MipsSEDAGToDAGISel::selectAddrFrameIndex(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ EVT ValTy = Addr.getValueType();
+
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+ Offset = CurDAG->getTargetConstant(0, ValTy);
+ return true;
+ }
+ return false;
+}
+
+/// Match frameindex+offset and frameindex|offset
+bool MipsSEDAGToDAGISel::selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base,
+ SDValue &Offset,
+ unsigned OffsetBits) const {
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+ if (isIntN(OffsetBits, CN->getSExtValue())) {
+ EVT ValTy = Addr.getValueType();
+
+ // If the first operand is a FI, get the TargetFI Node
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+ (Addr.getOperand(0)))
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+ else
+ Base = Addr.getOperand(0);
+
+ Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
+ return true;
+ }
+ }
+ return false;
+}
+
/// ComplexPattern used on MipsInstrInfo
/// Used on Mips Load/Store instructions
bool MipsSEDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
SDValue &Offset) const {
- EVT ValTy = Addr.getValueType();
-
// if Address is FI, get the TargetFrameIndex.
- if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
- Offset = CurDAG->getTargetConstant(0, ValTy);
+ if (selectAddrFrameIndex(Addr, Base, Offset))
return true;
- }
// on PIC code Load GA
if (Addr.getOpcode() == MipsISD::Wrapper) {
@@ -275,21 +306,8 @@ bool MipsSEDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
}
// Addresses of the form FI+const or FI|const
- if (CurDAG->isBaseWithConstantOffset(Addr)) {
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
- if (isInt<16>(CN->getSExtValue())) {
-
- // If the first operand is a FI, get the TargetFI Node
- if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
- (Addr.getOperand(0)))
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
- else
- Base = Addr.getOperand(0);
-
- Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
- return true;
- }
- }
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 16))
+ return true;
// Operand is a result from an ADD.
if (Addr.getOpcode() == ISD::ADD) {
@@ -343,27 +361,25 @@ bool MipsSEDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
selectAddrDefault(Addr, Base, Offset);
}
-/// Used on microMIPS Load/Store unaligned instructions (12-bit offset)
-bool MipsSEDAGToDAGISel::selectAddrRegImm12(SDValue Addr, SDValue &Base,
+bool MipsSEDAGToDAGISel::selectAddrRegImm10(SDValue Addr, SDValue &Base,
SDValue &Offset) const {
- EVT ValTy = Addr.getValueType();
+ if (selectAddrFrameIndex(Addr, Base, Offset))
+ return true;
- // Addresses of the form FI+const or FI|const
- if (CurDAG->isBaseWithConstantOffset(Addr)) {
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
- if (isInt<12>(CN->getSExtValue())) {
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 10))
+ return true;
- // If the first operand is a FI then get the TargetFI Node
- if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
- (Addr.getOperand(0)))
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
- else
- Base = Addr.getOperand(0);
+ return false;
+}
- Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
- return true;
- }
- }
+/// Used on microMIPS Load/Store unaligned instructions (12-bit offset)
+bool MipsSEDAGToDAGISel::selectAddrRegImm12(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (selectAddrFrameIndex(Addr, Base, Offset))
+ return true;
+
+ if (selectAddrFrameIndexOffset(Addr, Base, Offset, 12))
+ return true;
return false;
}
@@ -374,6 +390,17 @@ bool MipsSEDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
selectAddrDefault(Addr, Base, Offset);
}
+bool MipsSEDAGToDAGISel::selectIntAddrMSA(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (selectAddrRegImm10(Addr, Base, Offset))
+ return true;
+
+ if (selectAddrDefault(Addr, Base, Offset))
+ return true;
+
+ return false;
+}
+
// Select constant vector splats.
//
// Returns true and sets Imm if:
@@ -630,7 +657,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
case ISD::ConstantFP: {
ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node);
if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
- if (Subtarget.hasMips64()) {
+ if (Subtarget.isGP64bit()) {
SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
Mips::ZERO_64, MVT::i64);
Result = CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero);
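
The addressing-mode selectors above are refactored around two shared helpers: selectAddrFrameIndex matches a bare frame index, and selectAddrFrameIndexOffset matches frameindex+offset (or frameindex|offset) for a given signed immediate width. To illustrate how they compose, here is a hypothetical selector for a 9-bit offset; the function name and the width are ours, added only to show the intended use of the helpers:

// Hypothetical example: a selector for a 9-bit signed offset would simply
// chain the two helpers, exactly like selectAddrRegImm10/12 above.
bool MipsSEDAGToDAGISel::selectAddrRegImm9(SDValue Addr, SDValue &Base,
                                           SDValue &Offset) const {
  // Bare frame index: Base = TargetFrameIndex, Offset = 0.
  if (selectAddrFrameIndex(Addr, Base, Offset))
    return true;
  // Base plus a constant that fits in 9 signed bits.
  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 9))
    return true;
  return false;
}
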
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index dc52064..ba84a6d 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -40,6 +40,10 @@ private:
SDNode *selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
SDLoc DL, SDNode *Node) const;
+ bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const;
+ bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset,
+ unsigned OffsetBits) const;
+
virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
@@ -52,12 +56,18 @@ private:
virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
+ virtual bool selectAddrRegImm10(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
virtual bool selectAddrRegImm12(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
+ virtual bool selectIntAddrMSA(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
/// \brief Select constant vector splats.
virtual bool selectVSplat(SDNode *N, APInt &Imm) const;
/// \brief Select constant vector splats whose value fits in a given integer.
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 809adc0..0dac0b7 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -38,7 +38,7 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
// Set up the register classes
addRegisterClass(MVT::i32, &Mips::GPR32RegClass);
- if (HasMips64)
+ if (isGP64bit())
addRegisterClass(MVT::i64, &Mips::GPR64RegClass);
if (Subtarget->hasDSP() || Subtarget->hasMSA()) {
@@ -117,10 +117,14 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::MULHS, MVT::i32, Custom);
setOperationAction(ISD::MULHU, MVT::i32, Custom);
- if (HasMips64) {
+ if (Subtarget->hasCnMips())
+ setOperationAction(ISD::MUL, MVT::i64, Legal);
+ else if (hasMips64())
+ setOperationAction(ISD::MUL, MVT::i64, Custom);
+
+ if (hasMips64()) {
setOperationAction(ISD::MULHS, MVT::i64, Custom);
setOperationAction(ISD::MULHU, MVT::i64, Custom);
- setOperationAction(ISD::MUL, MVT::i64, Custom);
}
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
@@ -244,7 +248,9 @@ addMSAFloatType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
}
bool
-MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+ unsigned,
+ bool *Fast) const {
MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
switch (SVT) {
@@ -675,7 +681,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
}
// Transform the DAG into an equivalent VSELECT.
- return DAG.getNode(ISD::VSELECT, SDLoc(N), Ty, Cond, IfClr, IfSet);
+ return DAG.getNode(ISD::VSELECT, SDLoc(N), Ty, Cond, IfSet, IfClr);
}
return SDValue();
@@ -1077,14 +1083,7 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
- // T9 should contain the address of the callee function if
- // -reloction-model=pic or it is an indirect call.
- if (IsPICCall || !GlobalOrExternal) {
- unsigned T9Reg = IsN64 ? Mips::T9_64 : Mips::T9;
- RegsToPass.push_front(std::make_pair(T9Reg, Callee));
- } else
- Ops.push_back(Callee);
-
+ Ops.push_back(Callee);
MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal,
InternalLinkage, CLI, Callee, Chain);
}
@@ -1464,25 +1463,27 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_binsli_h:
case Intrinsic::mips_binsli_w:
case Intrinsic::mips_binsli_d: {
+ // binsli_x(IfClear, IfSet, nbits) -> (vselect LBitsMask, IfSet, IfClear)
EVT VecTy = Op->getValueType(0);
EVT EltTy = VecTy.getVectorElementType();
APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(),
Op->getConstantOperandVal(3));
return DAG.getNode(ISD::VSELECT, DL, VecTy,
- DAG.getConstant(Mask, VecTy, true), Op->getOperand(1),
- Op->getOperand(2));
+ DAG.getConstant(Mask, VecTy, true), Op->getOperand(2),
+ Op->getOperand(1));
}
case Intrinsic::mips_binsri_b:
case Intrinsic::mips_binsri_h:
case Intrinsic::mips_binsri_w:
case Intrinsic::mips_binsri_d: {
+ // binsri_x(IfClear, IfSet, nbits) -> (vselect RBitsMask, IfSet, IfClear)
EVT VecTy = Op->getValueType(0);
EVT EltTy = VecTy.getVectorElementType();
APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(),
Op->getConstantOperandVal(3));
return DAG.getNode(ISD::VSELECT, DL, VecTy,
- DAG.getConstant(Mask, VecTy, true), Op->getOperand(1),
- Op->getOperand(2));
+ DAG.getConstant(Mask, VecTy, true), Op->getOperand(2),
+ Op->getOperand(1));
}
case Intrinsic::mips_bmnz_v:
return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
@@ -1525,13 +1526,15 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(MipsISD::VANY_NONZERO, DL, Op->getValueType(0),
Op->getOperand(1));
case Intrinsic::mips_bsel_v:
+ // bsel_v(Mask, IfClear, IfSet) -> (vselect Mask, IfSet, IfClear)
return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
- Op->getOperand(1), Op->getOperand(2),
- Op->getOperand(3));
+ Op->getOperand(1), Op->getOperand(3),
+ Op->getOperand(2));
case Intrinsic::mips_bseli_b:
+  // bseli_b(Mask, IfClear, IfSet) -> (vselect Mask, IfSet, IfClear)
return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
- Op->getOperand(1), Op->getOperand(2),
- lowerMSASplatImm(Op, 3, DAG));
+ Op->getOperand(1), lowerMSASplatImm(Op, 3, DAG),
+ Op->getOperand(2));
case Intrinsic::mips_bset_b:
case Intrinsic::mips_bset_h:
case Intrinsic::mips_bset_w:
@@ -1623,25 +1626,34 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_copy_s_w:
return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
case Intrinsic::mips_copy_s_d:
- // Don't lower directly into VEXTRACT_SEXT_ELT since i64 might be illegal.
- // Instead lower to the generic EXTRACT_VECTOR_ELT node and let the type
- // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
- Op->getOperand(1), Op->getOperand(2));
+ if (hasMips64())
+ // Lower directly into VEXTRACT_SEXT_ELT since i64 is legal on Mips64.
+ return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
+ else {
+ // Lower into the generic EXTRACT_VECTOR_ELT node and let the type
+ // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
+ Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ }
case Intrinsic::mips_copy_u_b:
case Intrinsic::mips_copy_u_h:
case Intrinsic::mips_copy_u_w:
return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
case Intrinsic::mips_copy_u_d:
- // Don't lower directly into VEXTRACT_ZEXT_ELT since i64 might be illegal.
- // Instead lower to the generic EXTRACT_VECTOR_ELT node and let the type
- // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
- //
- // Note: When i64 is illegal, this results in copy_s.w instructions instead
- // of copy_u.w instructions. This makes no difference to the behaviour
- // since i64 is only illegal when the register file is 32-bit.
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
- Op->getOperand(1), Op->getOperand(2));
+ if (hasMips64())
+ // Lower directly into VEXTRACT_ZEXT_ELT since i64 is legal on Mips64.
+ return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
+ else {
+ // Lower into the generic EXTRACT_VECTOR_ELT node and let the type
+ // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+ // Note: When i64 is illegal, this results in copy_s.w instructions
+ // instead of copy_u.w instructions. This makes no difference to the
+ // behaviour since i64 is only illegal when the register file is 32-bit.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
+ Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ }
case Intrinsic::mips_div_s_b:
case Intrinsic::mips_div_s_h:
case Intrinsic::mips_div_s_w:
@@ -1798,12 +1810,20 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_insert_d:
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
Op->getOperand(1), Op->getOperand(3), Op->getOperand(2));
+ case Intrinsic::mips_insve_b:
+ case Intrinsic::mips_insve_h:
+ case Intrinsic::mips_insve_w:
+ case Intrinsic::mips_insve_d:
+ return DAG.getNode(MipsISD::INSVE, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2), Op->getOperand(3),
+ DAG.getConstant(0, MVT::i32));
case Intrinsic::mips_ldi_b:
case Intrinsic::mips_ldi_h:
case Intrinsic::mips_ldi_w:
case Intrinsic::mips_ldi_d:
return lowerMSASplatImm(Op, 1, DAG);
- case Intrinsic::mips_lsa: {
+ case Intrinsic::mips_lsa:
+ case Intrinsic::mips_dlsa: {
EVT ResTy = Op->getValueType(0);
return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
DAG.getNode(ISD::SHL, SDLoc(Op), ResTy,
@@ -2553,7 +2573,14 @@ static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
else
llvm_unreachable("shuffle vector mask references neither vector operand?");
- return DAG.getNode(MipsISD::VSHF, DL, ResTy, MaskVec, Op0, Op1);
+  // VECTOR_SHUFFLE concatenates the vectors in a vectorwise fashion.
+ // <0b00, 0b01> + <0b10, 0b11> -> <0b00, 0b01, 0b10, 0b11>
+ // VSHF concatenates the vectors in a bitwise fashion:
+ // <0b00, 0b01> + <0b10, 0b11> ->
+ // 0b0100 + 0b1110 -> 0b01001110
+ // <0b10, 0b11, 0b00, 0b01>
+ // We must therefore swap the operands to get the correct result.
+ return DAG.getNode(MipsISD::VSHF, DL, ResTy, MaskVec, Op1, Op0);
}
// Lower VECTOR_SHUFFLE into one of a number of instructions depending on the
@@ -2616,7 +2643,7 @@ emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
const TargetRegisterClass *RC = &Mips::GPR32RegClass;
DebugLoc DL = MI->getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = llvm::next(MachineFunction::iterator(BB));
+ MachineFunction::iterator It = std::next(MachineFunction::iterator(BB));
MachineFunction *F = BB->getParent();
MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -2626,7 +2653,7 @@ emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
F->insert(It, Sink);
// Transfer the remainder of BB and its successor edges to Sink.
- Sink->splice(Sink->begin(), BB, llvm::next(MachineBasicBlock::iterator(MI)),
+ Sink->splice(Sink->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
BB->end());
Sink->transferSuccessorsAndUpdatePHIs(BB);
@@ -2681,7 +2708,7 @@ emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
const TargetRegisterClass *RC = &Mips::GPR32RegClass;
DebugLoc DL = MI->getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = llvm::next(MachineFunction::iterator(BB));
+ MachineFunction::iterator It = std::next(MachineFunction::iterator(BB));
MachineFunction *F = BB->getParent();
MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -2691,7 +2718,7 @@ emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
F->insert(It, Sink);
// Transfer the remainder of BB and its successor edges to Sink.
- Sink->splice(Sink->begin(), BB, llvm::next(MachineBasicBlock::iterator(MI)),
+ Sink->splice(Sink->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
BB->end());
Sink->transferSuccessorsAndUpdatePHIs(BB);
@@ -2750,7 +2777,7 @@ emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
else {
unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
- BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(1);
+ BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane);
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
}
@@ -2817,7 +2844,8 @@ MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI,
BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_W), Wd)
.addReg(Wd_in)
.addImm(Lane)
- .addReg(Wt);
+ .addReg(Wt)
+ .addImm(0);
MI->eraseFromParent(); // The pseudo instruction is gone now.
return BB;
@@ -2850,7 +2878,8 @@ MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI,
BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_D), Wd)
.addReg(Wd_in)
.addImm(Lane)
- .addReg(Wt);
+ .addReg(Wt)
+ .addImm(0);
MI->eraseFromParent(); // The pseudo instruction is gone now.
return BB;
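
Several of the fixes above hinge on the operand order of ISD::VSELECT: the value chosen where the mask is set comes first, then the value for clear bits, so bsel_v(Mask, IfClear, IfSet) and the bins[lr]i intrinsics must pass IfSet before IfClear. A standalone element-wise model of that rule (ours, for illustration only; not LLVM code):

#include <array>
#include <cstddef>
#include <cstdio>

// Element-wise model of vselect(Mask, IfSet, IfClear).
template <std::size_t N>
std::array<int, N> vselect(const std::array<bool, N> &Mask,
                           const std::array<int, N> &IfSet,
                           const std::array<int, N> &IfClear) {
  std::array<int, N> R{};
  for (std::size_t I = 0; I != N; ++I)
    R[I] = Mask[I] ? IfSet[I] : IfClear[I];
  return R;
}

int main() {
  std::array<bool, 4> Mask = {true, false, true, false};
  std::array<int, 4> IfClear = {10, 20, 30, 40};
  std::array<int, 4> IfSet = {1, 2, 3, 4};
  // bsel_v(Mask, IfClear, IfSet) must lower to vselect(Mask, IfSet, IfClear),
  // giving {1, 20, 3, 40}; the old operand order produced {10, 2, 30, 4}.
  for (int V : vselect(Mask, IfSet, IfClear))
    std::printf("%d ", V);
  std::printf("\n");
  return 0;
}
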
diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h
index c5210d9..079fbf6 100644
--- a/lib/Target/Mips/MipsSEISelLowering.h
+++ b/lib/Target/Mips/MipsSEISelLowering.h
@@ -30,7 +30,8 @@ namespace llvm {
void addMSAFloatType(MVT::SimpleValueType Ty,
const TargetRegisterClass *RC);
- virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+ virtual bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS = 0,
+ bool *Fast = 0) const override;
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index 02931a3..094ee29 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -84,19 +84,25 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
unsigned Opc = 0, ZeroReg = 0;
+ bool isMicroMips = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode();
if (Mips::GPR32RegClass.contains(DestReg)) { // Copy to CPU Reg.
- if (Mips::GPR32RegClass.contains(SrcReg))
- Opc = Mips::ADDu, ZeroReg = Mips::ZERO;
- else if (Mips::CCRRegClass.contains(SrcReg))
+ if (Mips::GPR32RegClass.contains(SrcReg)) {
+ if (isMicroMips)
+ Opc = Mips::MOVE16_MM;
+ else
+ Opc = Mips::ADDu, ZeroReg = Mips::ZERO;
+ } else if (Mips::CCRRegClass.contains(SrcReg))
Opc = Mips::CFC1;
else if (Mips::FGR32RegClass.contains(SrcReg))
Opc = Mips::MFC1;
- else if (Mips::HI32RegClass.contains(SrcReg))
- Opc = Mips::MFHI, SrcReg = 0;
- else if (Mips::LO32RegClass.contains(SrcReg))
- Opc = Mips::MFLO, SrcReg = 0;
- else if (Mips::HI32DSPRegClass.contains(SrcReg))
+ else if (Mips::HI32RegClass.contains(SrcReg)) {
+ Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI;
+ SrcReg = 0;
+ } else if (Mips::LO32RegClass.contains(SrcReg)) {
+ Opc = isMicroMips ? Mips::MFLO16_MM : Mips::MFLO;
+ SrcReg = 0;
+ } else if (Mips::HI32DSPRegClass.contains(SrcReg))
Opc = Mips::MFHI_DSP;
else if (Mips::LO32DSPRegClass.contains(SrcReg))
Opc = Mips::MFLO_DSP;
@@ -259,6 +265,8 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
MachineBasicBlock &MBB = *MI->getParent();
+ bool isMicroMips = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode();
+ unsigned Opc;
switch(MI->getDesc().getOpcode()) {
default:
@@ -267,10 +275,12 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
expandRetRA(MBB, MI, Mips::RET);
break;
case Mips::PseudoMFHI:
- expandPseudoMFHiLo(MBB, MI, Mips::MFHI);
+ Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI;
+ expandPseudoMFHiLo(MBB, MI, Opc);
break;
case Mips::PseudoMFLO:
- expandPseudoMFHiLo(MBB, MI, Mips::MFLO);
+ Opc = isMicroMips ? Mips::MFLO16_MM : Mips::MFLO;
+ expandPseudoMFHiLo(MBB, MI, Opc);
break;
case Mips::PseudoMFHI64:
expandPseudoMFHiLo(MBB, MI, Mips::MFHI64);
@@ -481,7 +491,8 @@ void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB,
DebugLoc DL = I->getDebugLoc();
bool DstIsLarger, SrcIsLarger;
- tie(DstIsLarger, SrcIsLarger) = compareOpndSize(CvtOpc, *MBB.getParent());
+ std::tie(DstIsLarger, SrcIsLarger) =
+ compareOpndSize(CvtOpc, *MBB.getParent());
if (DstIsLarger)
TmpReg = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
@@ -505,9 +516,21 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
unsigned SubIdx = N ? Mips::sub_hi : Mips::sub_lo;
unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx);
- if (SubIdx == Mips::sub_hi && FP64)
- BuildMI(MBB, I, dl, get(Mips::MFHC1), DstReg).addReg(SubReg);
- else
+ if (SubIdx == Mips::sub_hi && FP64) {
+ // FIXME: The .addReg(SrcReg, RegState::Implicit) is a white lie used to
+ // temporarily work around a widespread bug in the -mfp64 support.
+ // The problem is that none of the 32-bit fpu ops mention the fact
+ // that they clobber the upper 32-bits of the 64-bit FPR. Fixing that
+ // requires a major overhaul of the FPU implementation which can't
+ // be done right now due to time constraints.
+ // MFHC1 is one of two instructions that are affected since they are
+ // the only instructions that don't read the lower 32-bits.
+ // We therefore pretend that it reads the bottom 32-bits to
+ // artificially create a dependency and prevent the scheduler
+ // changing the behaviour of the code.
+ BuildMI(MBB, I, dl, get(Mips::MFHC1), DstReg).addReg(SubReg).addReg(
+ SrcReg, RegState::Implicit);
+ } else
BuildMI(MBB, I, dl, get(Mips::MFC1), DstReg).addReg(SubReg);
}
@@ -530,10 +553,22 @@ void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_lo))
.addReg(LoReg);
- if (FP64)
+ if (FP64) {
+ // FIXME: The .addReg(DstReg, RegState::Implicit) is a white lie used to
+ // temporarily work around a widespread bug in the -mfp64 support.
+ // The problem is that none of the 32-bit fpu ops mention the fact
+ // that they clobber the upper 32-bits of the 64-bit FPR. Fixing that
+ // requires a major overhaul of the FPU implementation which can't
+ // be done right now due to time constraints.
+ // MTHC1 is one of two instructions that are affected since they are
+ // the only instructions that don't read the lower 32-bits.
+ // We therefore pretend that it reads the bottom 32-bits to
+ // artificially create a dependency and prevent the scheduler
+ // changing the behaviour of the code.
BuildMI(MBB, I, dl, get(Mips::MTHC1), TRI.getSubReg(DstReg, Mips::sub_hi))
- .addReg(HiReg);
- else
+ .addReg(HiReg)
+ .addReg(DstReg, RegState::Implicit);
+ } else
BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_hi))
.addReg(HiReg);
}
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 2d44084..2ac082f 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -24,9 +24,8 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/DebugInfo.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
@@ -62,21 +61,42 @@ MipsSERegisterInfo::intRegClass(unsigned Size) const {
return &Mips::GPR64RegClass;
}
-/// Determine whether a given opcode is an MSA load/store (supporting 10-bit
-/// offsets) or a non-MSA load/store (supporting 16-bit offsets).
-static inline bool isMSALoadOrStore(const unsigned Opcode) {
+/// Get the size of the offset supported by the given load/store.
+/// The result includes the effects of any scale factors applied to the
+/// instruction immediate.
+static inline unsigned getLoadStoreOffsetSizeInBits(const unsigned Opcode) {
switch (Opcode) {
case Mips::LD_B:
+ case Mips::ST_B:
+ return 10;
case Mips::LD_H:
+ case Mips::ST_H:
+ return 10 + 1 /* scale factor */;
case Mips::LD_W:
+ case Mips::ST_W:
+ return 10 + 2 /* scale factor */;
case Mips::LD_D:
- case Mips::ST_B:
+ case Mips::ST_D:
+ return 10 + 3 /* scale factor */;
+ default:
+ return 16;
+ }
+}
+
+/// Get the scale factor applied to the immediate in the given load/store.
+static inline unsigned getLoadStoreOffsetAlign(const unsigned Opcode) {
+ switch (Opcode) {
+ case Mips::LD_H:
case Mips::ST_H:
+ return 2;
+ case Mips::LD_W:
case Mips::ST_W:
+ return 4;
+ case Mips::LD_D:
case Mips::ST_D:
- return true;
+ return 8;
default:
- return false;
+ return 1;
}
}
@@ -131,13 +151,16 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
if (!MI.isDebugValue()) {
// Make sure Offset fits within the field available.
- // For MSA instructions, this is a 10-bit signed immediate, otherwise it is
- // a 16-bit signed immediate.
- unsigned OffsetBitSize = isMSALoadOrStore(MI.getOpcode()) ? 10 : 16;
-
- if (OffsetBitSize == 10 && !isInt<10>(Offset) && isInt<16>(Offset)) {
- // If we have an offset that needs to fit into a signed 10-bit immediate
- // and doesn't, but does fit into 16-bits then use an ADDiu
+ // For MSA instructions, this is a 10-bit signed immediate (scaled by
+ // element size), otherwise it is a 16-bit signed immediate.
+ unsigned OffsetBitSize = getLoadStoreOffsetSizeInBits(MI.getOpcode());
+ unsigned OffsetAlign = getLoadStoreOffsetAlign(MI.getOpcode());
+
+ if (OffsetBitSize < 16 && isInt<16>(Offset) &&
+ (!isIntN(OffsetBitSize, Offset) ||
+ OffsetToAlignment(Offset, OffsetAlign) != 0)) {
+ // If we have an offset that needs to fit into a signed n-bit immediate
+ // (where n < 16) and doesn't, but does fit into 16-bits then use an ADDiu
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = II->getDebugLoc();
unsigned ADDiu = Subtarget.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
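
The new check treats an MSA load/store immediate as a signed 10-bit field scaled by the element size, so a byte offset only fits if it is a multiple of the element size and lies inside the scaled range; anything else that still fits in 16 bits falls into the ADDiu path shown at the end of the hunk. A standalone illustration of that test (ours; the patch itself uses isIntN and OffsetToAlignment):

#include <cstdint>
#include <cstdio>

// Does a byte offset fit a signed immediate of OffsetBits bits whose low
// bits are implied by the element size? For ld.w/st.w: 10 + 2 bits, align 4.
static bool fitsScaledImm(int64_t Offset, unsigned OffsetBits, unsigned Align) {
  if (Offset % Align != 0)                       // must be element-aligned
    return false;
  int64_t Lo = -(int64_t(1) << (OffsetBits - 1));
  int64_t Hi = (int64_t(1) << (OffsetBits - 1)) - 1;
  return Offset >= Lo && Offset <= Hi;
}

int main() {
  std::printf("%d\n", fitsScaledImm(2044, 10 + 2, 4)); // 1: 511 * 4, aligned
  std::printf("%d\n", fitsScaledImm(2046, 10 + 2, 4)); // 0: not 4-byte aligned
  std::printf("%d\n", fitsScaledImm(2048, 10 + 2, 4)); // 0: out of range
  return 0;
}
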
diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td
index 2779064..ea98199 100644
--- a/lib/Target/Mips/MipsSchedule.td
+++ b/lib/Target/Mips/MipsSchedule.td
@@ -17,60 +17,295 @@ def IMULDIV : FuncUnit;
// Instruction Itinerary classes used for Mips
//===----------------------------------------------------------------------===//
def IIAlu : InstrItinClass;
-def IIArith : InstrItinClass;
-def IILogic : InstrItinClass;
-def IILoad : InstrItinClass;
-def IIStore : InstrItinClass;
-def IIXfer : InstrItinClass;
def IIBranch : InstrItinClass;
-def IIHiLo : InstrItinClass;
-def IIImul : InstrItinClass;
-def IIImult : InstrItinClass;
-def IIIdiv : InstrItinClass;
-def IIseb : InstrItinClass;
-def IIslt : InstrItinClass;
-def IIFcvt : InstrItinClass;
-def IIFmove : InstrItinClass;
-def IIFcmp : InstrItinClass;
-def IIFadd : InstrItinClass;
-def IIFmulSingle : InstrItinClass;
-def IIFmulDouble : InstrItinClass;
-def IIFdivSingle : InstrItinClass;
-def IIFdivDouble : InstrItinClass;
-def IIFsqrtSingle : InstrItinClass;
-def IIFsqrtDouble : InstrItinClass;
-def IIFrecipFsqrtStep : InstrItinClass;
-def IIFLoad : InstrItinClass;
-def IIFStore : InstrItinClass;
-def IIFmoveC1 : InstrItinClass;
def IIPseudo : InstrItinClass;
+def II_ABS : InstrItinClass;
+def II_ADDI : InstrItinClass;
+def II_ADDIU : InstrItinClass;
+def II_ADDU : InstrItinClass;
+def II_ADD_D : InstrItinClass;
+def II_ADD_S : InstrItinClass;
+def II_AND : InstrItinClass;
+def II_ANDI : InstrItinClass;
+def II_BADDU : InstrItinClass;
+def II_CEIL : InstrItinClass;
+def II_CFC1 : InstrItinClass;
+def II_CLO : InstrItinClass;
+def II_CLZ : InstrItinClass;
+def II_CTC1 : InstrItinClass;
+def II_CVT : InstrItinClass;
+def II_C_CC_D : InstrItinClass; // Any c.<cc>.d instruction
+def II_C_CC_S : InstrItinClass; // Any c.<cc>.s instruction
+def II_DADDIU : InstrItinClass;
+def II_DADDU : InstrItinClass;
+def II_DADD : InstrItinClass;
+def II_DDIV : InstrItinClass;
+def II_DDIVU : InstrItinClass;
+def II_DIV : InstrItinClass;
+def II_DIVU : InstrItinClass;
+def II_DIV_D : InstrItinClass;
+def II_DIV_S : InstrItinClass;
+def II_DMFC1 : InstrItinClass;
+def II_DMTC1 : InstrItinClass;
+def II_DMUL : InstrItinClass;
+def II_DMULT : InstrItinClass;
+def II_DMULTU : InstrItinClass;
+def II_DROTR : InstrItinClass;
+def II_DROTR32 : InstrItinClass;
+def II_DROTRV : InstrItinClass;
+def II_DSLL : InstrItinClass;
+def II_DSLL32 : InstrItinClass;
+def II_DSLLV : InstrItinClass;
+def II_DSRA : InstrItinClass;
+def II_DSRA32 : InstrItinClass;
+def II_DSRAV : InstrItinClass;
+def II_DSRL : InstrItinClass;
+def II_DSRL32 : InstrItinClass;
+def II_DSRLV : InstrItinClass;
+def II_DSUBU : InstrItinClass;
+def II_DSUB : InstrItinClass;
+def II_FLOOR : InstrItinClass;
+def II_LB : InstrItinClass;
+def II_LBU : InstrItinClass;
+def II_LD : InstrItinClass;
+def II_LDC1 : InstrItinClass;
+def II_LDL : InstrItinClass;
+def II_LDR : InstrItinClass;
+def II_LDXC1 : InstrItinClass;
+def II_LH : InstrItinClass;
+def II_LHU : InstrItinClass;
+def II_LUI : InstrItinClass;
+def II_LUXC1 : InstrItinClass;
+def II_LW : InstrItinClass;
+def II_LWC1 : InstrItinClass;
+def II_LWL : InstrItinClass;
+def II_LWR : InstrItinClass;
+def II_LWU : InstrItinClass;
+def II_LWXC1 : InstrItinClass;
+def II_MADD : InstrItinClass;
+def II_MADDU : InstrItinClass;
+def II_MADD_D : InstrItinClass;
+def II_MADD_S : InstrItinClass;
+def II_MFC1 : InstrItinClass;
+def II_MFHC1 : InstrItinClass;
+def II_MFHI_MFLO : InstrItinClass; // mfhi and mflo
+def II_MOVF : InstrItinClass;
+def II_MOVF_D : InstrItinClass;
+def II_MOVF_S : InstrItinClass;
+def II_MOVN : InstrItinClass;
+def II_MOVN_D : InstrItinClass;
+def II_MOVN_S : InstrItinClass;
+def II_MOVT : InstrItinClass;
+def II_MOVT_D : InstrItinClass;
+def II_MOVT_S : InstrItinClass;
+def II_MOVZ : InstrItinClass;
+def II_MOVZ_D : InstrItinClass;
+def II_MOVZ_S : InstrItinClass;
+def II_MOV_D : InstrItinClass;
+def II_MOV_S : InstrItinClass;
+def II_MSUB : InstrItinClass;
+def II_MSUBU : InstrItinClass;
+def II_MSUB_D : InstrItinClass;
+def II_MSUB_S : InstrItinClass;
+def II_MTC1 : InstrItinClass;
+def II_MTHC1 : InstrItinClass;
+def II_MTHI_MTLO : InstrItinClass; // mthi and mtlo
+def II_MUL : InstrItinClass;
+def II_MULT : InstrItinClass;
+def II_MULTU : InstrItinClass;
+def II_MUL_D : InstrItinClass;
+def II_MUL_S : InstrItinClass;
+def II_NEG : InstrItinClass;
+def II_NMADD_D : InstrItinClass;
+def II_NMADD_S : InstrItinClass;
+def II_NMSUB_D : InstrItinClass;
+def II_NMSUB_S : InstrItinClass;
+def II_NOR : InstrItinClass;
+def II_OR : InstrItinClass;
+def II_ORI : InstrItinClass;
+def II_POP : InstrItinClass;
+def II_RDHWR : InstrItinClass;
+def II_RESTORE : InstrItinClass;
+def II_ROTR : InstrItinClass;
+def II_ROTRV : InstrItinClass;
+def II_ROUND : InstrItinClass;
+def II_SAVE : InstrItinClass;
+def II_SB : InstrItinClass;
+def II_SD : InstrItinClass;
+def II_SDC1 : InstrItinClass;
+def II_SDL : InstrItinClass;
+def II_SDR : InstrItinClass;
+def II_SDXC1 : InstrItinClass;
+def II_SEB : InstrItinClass;
+def II_SEH : InstrItinClass;
+def II_SEQ_SNE : InstrItinClass; // seq and sne
+def II_SEQI_SNEI : InstrItinClass; // seqi and snei
+def II_SH : InstrItinClass;
+def II_SLL : InstrItinClass;
+def II_SLLV : InstrItinClass;
+def II_SLTI_SLTIU : InstrItinClass; // slti and sltiu
+def II_SLT_SLTU : InstrItinClass; // slt and sltu
+def II_SQRT_D : InstrItinClass;
+def II_SQRT_S : InstrItinClass;
+def II_SRA : InstrItinClass;
+def II_SRAV : InstrItinClass;
+def II_SRL : InstrItinClass;
+def II_SRLV : InstrItinClass;
+def II_SUBU : InstrItinClass;
+def II_SUB_D : InstrItinClass;
+def II_SUB_S : InstrItinClass;
+def II_SUXC1 : InstrItinClass;
+def II_SW : InstrItinClass;
+def II_SWC1 : InstrItinClass;
+def II_SWL : InstrItinClass;
+def II_SWR : InstrItinClass;
+def II_SWXC1 : InstrItinClass;
+def II_TRUNC : InstrItinClass;
+def II_XOR : InstrItinClass;
+def II_XORI : InstrItinClass;
+
//===----------------------------------------------------------------------===//
// Mips Generic instruction itineraries.
//===----------------------------------------------------------------------===//
def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<IIAlu , [InstrStage<1, [ALU]>]>,
- InstrItinData<IIArith , [InstrStage<1, [ALU]>]>,
- InstrItinData<IILogic , [InstrStage<1, [ALU]>]>,
- InstrItinData<IILoad , [InstrStage<3, [ALU]>]>,
- InstrItinData<IIStore , [InstrStage<1, [ALU]>]>,
- InstrItinData<IIXfer , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_ADDI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ADDIU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ADDU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_AND , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_BADDU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SLL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SRA , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SRL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ROTR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SLLV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SRAV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SRLV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ROTRV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CLO , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CLZ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DADDIU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DADDU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DADD , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSLL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSRL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSRA , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSLLV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSRLV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSRAV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSUBU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DSUB , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DROTR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DROTRV , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_LUI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVF , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVN , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVN_S , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVN_D , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOVZ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_NOR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_OR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_POP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_RDHWR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SUBU , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_XOR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ANDI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ORI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_XORI , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_LB , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LBU , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LH , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LHU , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LW , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWL , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWR , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LD , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LDL , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LDR , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_RESTORE , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_SB , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SH , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SW , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SDL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SDR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SD , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SAVE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SEQ_SNE , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SEQI_SNEI , [InstrStage<1, [ALU]>]>,
InstrItinData<IIBranch , [InstrStage<1, [ALU]>]>,
- InstrItinData<IIHiLo , [InstrStage<1, [IMULDIV]>]>,
- InstrItinData<IIImul , [InstrStage<17, [IMULDIV]>]>,
- InstrItinData<IIIdiv , [InstrStage<38, [IMULDIV]>]>,
- InstrItinData<IIFcvt , [InstrStage<1, [ALU]>]>,
- InstrItinData<IIFmove , [InstrStage<2, [ALU]>]>,
- InstrItinData<IIFcmp , [InstrStage<3, [ALU]>]>,
- InstrItinData<IIFadd , [InstrStage<4, [ALU]>]>,
- InstrItinData<IIFmulSingle , [InstrStage<7, [ALU]>]>,
- InstrItinData<IIFmulDouble , [InstrStage<8, [ALU]>]>,
- InstrItinData<IIFdivSingle , [InstrStage<23, [ALU]>]>,
- InstrItinData<IIFdivDouble , [InstrStage<36, [ALU]>]>,
- InstrItinData<IIFsqrtSingle , [InstrStage<54, [ALU]>]>,
- InstrItinData<IIFsqrtDouble , [InstrStage<12, [ALU]>]>,
- InstrItinData<IIFrecipFsqrtStep , [InstrStage<5, [ALU]>]>,
- InstrItinData<IIFLoad , [InstrStage<3, [ALU]>]>,
- InstrItinData<IIFStore , [InstrStage<1, [ALU]>]>,
- InstrItinData<IIFmoveC1 , [InstrStage<2, [ALU]>]>
+ InstrItinData<II_DMUL , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DMULT , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DMULTU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MADD , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MADDU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MFHI_MFLO , [InstrStage<1, [IMULDIV]>]>,
+ InstrItinData<II_MSUB , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MSUBU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MTHI_MTLO , [InstrStage<1, [IMULDIV]>]>,
+ InstrItinData<II_MUL , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MULT , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MULTU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MSUB , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_MSUBU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DIV , [InstrStage<38, [IMULDIV]>]>,
+ InstrItinData<II_DIVU , [InstrStage<38, [IMULDIV]>]>,
+ InstrItinData<II_DDIV , [InstrStage<38, [IMULDIV]>]>,
+ InstrItinData<II_DDIVU , [InstrStage<38, [IMULDIV]>]>,
+ InstrItinData<II_CEIL , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_CVT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ABS , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_FLOOR , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_NEG , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_ROUND , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_TRUNC , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_MOV_D , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOV_S , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_CFC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_CTC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOVF_D , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOVF_S , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOVT_D , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOVT_S , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOVZ_D , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MOVZ_S , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_C_CC_S , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_C_CC_D , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_ADD_D , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_ADD_S , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_SUB_D , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_SUB_S , [InstrStage<4, [ALU]>]>,
+ InstrItinData<II_MUL_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_MADD_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_MSUB_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_NMADD_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_NMSUB_S , [InstrStage<7, [ALU]>]>,
+ InstrItinData<II_MUL_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_MADD_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_MSUB_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_NMADD_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_NMSUB_D , [InstrStage<8, [ALU]>]>,
+ InstrItinData<II_DIV_S , [InstrStage<23, [ALU]>]>,
+ InstrItinData<II_DIV_D , [InstrStage<36, [ALU]>]>,
+ InstrItinData<II_SQRT_S , [InstrStage<54, [ALU]>]>,
+ InstrItinData<II_SQRT_D , [InstrStage<12, [ALU]>]>,
+ InstrItinData<II_LDC1 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWC1 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LDXC1 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LWXC1 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_LUXC1 , [InstrStage<3, [ALU]>]>,
+ InstrItinData<II_SDC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SDXC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SWXC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_SUXC1 , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DMFC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_DMTC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MFC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MFHC1 , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_MTHC1 , [InstrStage<2, [ALU]>]>
]>;
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 0a81072..143b945 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -14,10 +14,10 @@
#define DEBUG_TYPE "mips-subtarget"
#include "MipsMachineFunction.h"
-#include "MipsSubtarget.h"
-#include "MipsTargetMachine.h"
#include "Mips.h"
#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
@@ -55,9 +55,23 @@ Mips16HardFloat("mips16-hard-float", cl::NotHidden,
static cl::opt<bool>
Mips16ConstantIslands(
- "mips16-constant-islands", cl::Hidden,
- cl::desc("MIPS: mips16 constant islands enable. experimental feature"),
- cl::init(false));
+ "mips16-constant-islands", cl::NotHidden,
+ cl::desc("MIPS: mips16 constant islands enable."),
+ cl::init(true));
+
+/// Select the Mips CPU for the given triple and cpu name.
+/// FIXME: Merge with the copy in MipsMCTargetDesc.cpp
+static inline StringRef selectMipsCPU(StringRef TT, StringRef CPU) {
+ if (CPU.empty() || CPU == "generic") {
+ Triple TheTriple(TT);
+ if (TheTriple.getArch() == Triple::mips ||
+ TheTriple.getArch() == Triple::mipsel)
+ CPU = "mips32";
+ else
+ CPU = "mips64";
+ }
+ return CPU;
+}
void MipsSubtarget::anchor() { }
@@ -67,32 +81,44 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
MipsGenSubtargetInfo(TT, CPU, FS),
MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little),
IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false),
- IsLinux(true), HasSEInReg(false), HasCondMov(false), HasSwap(false),
- HasBitCount(false), HasFPIdx(false),
+ HasCnMips(false), IsLinux(true), HasSEInReg(false), HasCondMov(false),
+ HasSwap(false), HasBitCount(false), HasFPIdx(false),
InMips16Mode(false), InMips16HardFloat(Mips16HardFloat),
InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false),
- RM(_RM), OverrideMode(NoOverride), TM(_TM)
+ RM(_RM), OverrideMode(NoOverride), TM(_TM), TargetTriple(TT)
{
std::string CPUName = CPU;
- if (CPUName.empty())
- CPUName = "mips32";
+ CPUName = selectMipsCPU(TT, CPUName);
// Parse features string.
ParseSubtargetFeatures(CPUName, FS);
+ if (InMips16Mode && !TM->Options.UseSoftFloat) {
+ // Hard float for mips16 means essentially to compile as soft float
+ // but to use a runtime library for soft float that is written with
+ // native mips32 floating point instructions (those runtime routines
+ // run in mips32 hard float mode).
+ TM->Options.UseSoftFloat = true;
+ TM->Options.FloatABIType = FloatABI::Soft;
+ InMips16HardFloat = true;
+ }
+
PreviousInMips16Mode = InMips16Mode;
// Initialize scheduling itinerary for the specified CPU.
InstrItins = getInstrItineraryForCPU(CPUName);
- // Set MipsABI if it hasn't been set yet.
- if (MipsABI == UnknownABI)
- MipsABI = hasMips64() ? N64 : O32;
+ // Assert exactly one ABI was chosen.
+ assert(MipsABI != UnknownABI);
+ assert((((getFeatureBits() & Mips::FeatureO32) != 0) +
+ ((getFeatureBits() & Mips::FeatureEABI) != 0) +
+ ((getFeatureBits() & Mips::FeatureN32) != 0) +
+ ((getFeatureBits() & Mips::FeatureN64) != 0)) == 1);
// Check if Architecture and ABI are compatible.
- assert(((!hasMips64() && (isABI_O32() || isABI_EABI())) ||
- (hasMips64() && (isABI_N32() || isABI_N64()))) &&
+ assert(((!isGP64bit() && (isABI_O32() || isABI_EABI())) ||
+ (isGP64bit() && (isABI_N32() || isABI_N64()))) &&
"Invalid Arch & ABI pair.");
if (hasMSA() && !isFP64bit())
@@ -117,8 +143,8 @@ MipsSubtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel,
RegClassVector &CriticalPathRCs) const {
Mode = TargetSubtargetInfo::ANTIDEP_NONE;
CriticalPathRCs.clear();
- CriticalPathRCs.push_back(hasMips64() ?
- &Mips::GPR64RegClass : &Mips::GPR32RegClass);
+ CriticalPathRCs.push_back(isGP64bit() ? &Mips::GPR64RegClass
+ : &Mips::GPR32RegClass);
return OptLevel >= CodeGenOpt::Aggressive;
}
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 6b2ab12..2166b93 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -14,11 +14,9 @@
#ifndef MIPSSUBTARGET_H
#define MIPSSUBTARGET_H
-#include "MCTargetDesc/MipsReginfo.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-
#include <string>
#define GET_SUBTARGETINFO_HEADER
@@ -39,10 +37,7 @@ public:
};
protected:
-
- enum MipsArchEnum {
- Mips32, Mips32r2, Mips64, Mips64r2
- };
+ enum MipsArchEnum { Mips32, Mips32r2, Mips4, Mips64, Mips64r2 };
// Mips architecture version
MipsArchEnum MipsArchVersion;
@@ -67,6 +62,9 @@ protected:
// HasVFPU - Processor has a vector floating point unit.
bool HasVFPU;
+ // CPU supports cnMIPS (Cavium Networks Octeon CPU).
+ bool HasCnMips;
+
// isLinux - Target system is Linux. If false we consider ELFOS for now.
bool IsLinux;
@@ -118,9 +116,6 @@ protected:
InstrItineraryData InstrItins;
- // The instance to the register info section object
- MipsReginfo MRI;
-
// Relocation Model
Reloc::Model RM;
@@ -130,6 +125,7 @@ protected:
MipsTargetMachine *TM;
+ Triple TargetTriple;
public:
virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
AntiDepBreakMode& Mode,
@@ -158,6 +154,8 @@ public:
bool hasMips64() const { return MipsArchVersion >= Mips64; }
bool hasMips64r2() const { return MipsArchVersion == Mips64r2; }
+ bool hasCnMips() const { return HasCnMips; }
+
bool isLittle() const { return IsLittle; }
bool isFP64bit() const { return IsFP64bit; }
bool isNotFP64bit() const { return !IsFP64bit; }
@@ -212,6 +210,9 @@ public:
bool os16() const { return Os16;};
+ bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+ bool isNotTargetNaCl() const { return !TargetTriple.isOSNaCl(); }
+
// for now constant islands are on for the whole compilation unit but we only
// really use them if in addition we are in mips16 mode
//
@@ -219,9 +220,6 @@ static bool useConstantIslands();
unsigned stackAlignment() const { return hasMips64() ? 16 : 8; }
- // Grab MipsRegInfo object
- const MipsReginfo &getMReginfo() const { return MRI; }
-
// Grab relocation model
Reloc::Model getRelocationModel() const {return RM;}
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 5046c1b..e9053c8 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -13,25 +13,25 @@
#include "MipsTargetMachine.h"
#include "Mips.h"
+#include "Mips16FrameLowering.h"
+#include "Mips16HardFloat.h"
+#include "Mips16ISelDAGToDAG.h"
+#include "Mips16ISelLowering.h"
+#include "Mips16InstrInfo.h"
#include "MipsFrameLowering.h"
#include "MipsInstrInfo.h"
#include "MipsModuleISelDAGToDAG.h"
#include "MipsOs16.h"
#include "MipsSEFrameLowering.h"
-#include "MipsSEInstrInfo.h"
-#include "MipsSEISelLowering.h"
#include "MipsSEISelDAGToDAG.h"
-#include "Mips16FrameLowering.h"
-#include "Mips16HardFloat.h"
-#include "Mips16InstrInfo.h"
-#include "Mips16ISelDAGToDAG.h"
-#include "Mips16ISelLowering.h"
+#include "MipsSEISelLowering.h"
+#include "MipsSEInstrInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/PassManager.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
@@ -45,8 +45,36 @@ extern "C" void LLVMInitializeMipsTarget() {
RegisterTargetMachine<MipselTargetMachine> B(TheMips64elTarget);
}
-// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment
-// The stack is always 8 byte aligned
+static std::string computeDataLayout(const MipsSubtarget &ST) {
+ std::string Ret = "";
+
+ // There are both little and big endian mips.
+ if (ST.isLittle())
+ Ret += "e";
+ else
+ Ret += "E";
+
+ Ret += "-m:m";
+
+ // Pointers are 32 bit on some ABIs.
+ if (!ST.isABI_N64())
+ Ret += "-p:32:32";
+
+  // 8 and 16 bit integers only need to have natural alignment, but try to
+ // align them to 32 bits. 64 bit integers have natural alignment.
+ Ret += "-i8:8:32-i16:16:32-i64:64";
+
+ // 32 bit registers are always available and the stack is at least 64 bit
+ // aligned. On N64 64 bit registers are also available and the stack is
+ // 128 bit aligned.
+ if (ST.isABI_N64() || ST.isABI_N32())
+ Ret += "-n32:64-S128";
+ else
+ Ret += "-n32-S64";
+
+ return Ret;
+}
+
// On function prologue, the stack is created by decrementing
// its pointer. Once decremented, all references are done with positive
// offset from the stack/frame pointer, using StackGrowsUp enables
@@ -60,15 +88,7 @@ MipsTargetMachine(const Target &T, StringRef TT,
bool isLittle)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS, isLittle, RM, this),
- DL(isLittle ?
- (Subtarget.isABI_N64() ?
- "e-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-"
- "n32:64-S128" :
- "e-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32-S64") :
- (Subtarget.isABI_N64() ?
- "E-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-"
- "n32:64-S128" :
- "E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32-S64")),
+ DL(computeDataLayout(Subtarget)),
InstrInfo(MipsInstrInfo::create(*this)),
FrameLowering(MipsFrameLowering::create(*this, Subtarget)),
TLInfo(MipsTargetLowering::create(*this)), TSInfo(*this),
@@ -153,7 +173,11 @@ public:
virtual void addIRPasses();
virtual bool addInstSelector();
+ virtual void addMachineSSAOptimization();
virtual bool addPreEmitPass();
+
+ virtual bool addPreRegAlloc();
+
};
} // namespace
@@ -182,6 +206,20 @@ bool MipsPassConfig::addInstSelector() {
return false;
}
+void MipsPassConfig::addMachineSSAOptimization() {
+ addPass(createMipsOptimizePICCallPass(getMipsTargetMachine()));
+ TargetPassConfig::addMachineSSAOptimization();
+}
+
+bool MipsPassConfig::addPreRegAlloc() {
+ if (getOptLevel() == CodeGenOpt::None) {
+ addPass(createMipsOptimizePICCallPass(getMipsTargetMachine()));
+ return true;
+ }
+ else
+ return false;
+}
+
void MipsTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
if (Subtarget.allowMixed16_32()) {
DEBUG(errs() << "No ");
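
For a concrete sense of what computeDataLayout produces, here is a standalone restatement of its string building with two example outputs (ours, derived directly from the function above; only the endianness and N64/N32 switches are modelled):

#include <cstdio>
#include <string>

static std::string mipsDataLayout(bool IsLittle, bool IsN64, bool IsN32) {
  std::string Ret = IsLittle ? "e" : "E";     // endianness
  Ret += "-m:m";                              // mangling mode, as in the patch
  if (!IsN64)
    Ret += "-p:32:32";                        // 32-bit pointers except on N64
  Ret += "-i8:8:32-i16:16:32-i64:64";         // integer alignment preferences
  Ret += (IsN64 || IsN32) ? "-n32:64-S128" : "-n32-S64";
  return Ret;
}

int main() {
  // Little-endian O32: e-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64
  std::printf("%s\n", mipsDataLayout(true, false, false).c_str());
  // Big-endian N64:    E-m:m-i8:8:32-i16:16:32-i64:64-n32:64-S128
  std::printf("%s\n", mipsDataLayout(false, true, false).c_str());
  return 0;
}
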
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index 4c748c5..13f9408 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -37,21 +37,6 @@ void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
ELF::SHF_WRITE |ELF::SHF_ALLOC,
SectionKind::getBSS());
-
- // Register info information
- const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
- if (Subtarget.isABI_N64() || Subtarget.isABI_N32())
- ReginfoSection =
- getContext().getELFSection(".MIPS.options",
- ELF::SHT_MIPS_OPTIONS,
- ELF::SHF_ALLOC |ELF::SHF_MIPS_NOSTRIP,
- SectionKind::getMetadata());
- else
- ReginfoSection =
- getContext().getELFSection(".reginfo",
- ELF::SHT_MIPS_REGINFO,
- ELF::SHF_ALLOC,
- SectionKind::getMetadata());
}
// An address must be loaded from a small section if its size is less than the
@@ -103,7 +88,7 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
const MCSection *MipsTargetObjectFile::
SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
- Mangler *Mang, const TargetMachine &TM) const {
+ Mangler &Mang, const TargetMachine &TM) const {
// TODO: Could also support "weak" symbols as well with ".gnu.linkonce.s.*"
// sections?
diff --git a/lib/Target/Mips/MipsTargetObjectFile.h b/lib/Target/Mips/MipsTargetObjectFile.h
index c0e9140..2bf5a75 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.h
+++ b/lib/Target/Mips/MipsTargetObjectFile.h
@@ -17,10 +17,9 @@ namespace llvm {
class MipsTargetObjectFile : public TargetLoweringObjectFileELF {
const MCSection *SmallDataSection;
const MCSection *SmallBSSSection;
- const MCSection *ReginfoSection;
public:
- void Initialize(MCContext &Ctx, const TargetMachine &TM);
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
/// IsGlobalInSmallSection - Return true if this global address should be
@@ -31,12 +30,8 @@ namespace llvm {
const TargetMachine &TM) const;
const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
- SectionKind Kind,
- Mangler *Mang,
- const TargetMachine &TM) const;
-
- // TODO: Classify globals as mips wishes.
- const MCSection *getReginfoSection() const { return ReginfoSection; }
+ SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const override;
};
} // end namespace llvm
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
index 96966fd..5f4b74b 100644
--- a/lib/Target/Mips/MipsTargetStreamer.h
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -18,8 +18,33 @@ class MipsTargetStreamer : public MCTargetStreamer {
virtual void anchor();
public:
- virtual void emitMipsHackELFFlags(unsigned Flags) = 0;
- virtual void emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val) = 0;
+ MipsTargetStreamer(MCStreamer &S);
+ virtual void emitDirectiveSetMicroMips() = 0;
+ virtual void emitDirectiveSetNoMicroMips() = 0;
+ virtual void emitDirectiveSetMips16() = 0;
+ virtual void emitDirectiveSetNoMips16() = 0;
+
+ virtual void emitDirectiveSetReorder() = 0;
+ virtual void emitDirectiveSetNoReorder() = 0;
+ virtual void emitDirectiveSetMacro() = 0;
+ virtual void emitDirectiveSetNoMacro() = 0;
+ virtual void emitDirectiveSetAt() = 0;
+ virtual void emitDirectiveSetNoAt() = 0;
+ virtual void emitDirectiveEnd(StringRef Name) = 0;
+
+ virtual void emitDirectiveEnt(const MCSymbol &Symbol) = 0;
+ virtual void emitDirectiveAbiCalls() = 0;
+ virtual void emitDirectiveOptionPic0() = 0;
+ virtual void emitDirectiveOptionPic2() = 0;
+ virtual void emitFrame(unsigned StackReg, unsigned StackSize,
+ unsigned ReturnReg) = 0;
+ virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) = 0;
+ virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) = 0;
+
+ virtual void emitDirectiveSetMips32R2() = 0;
+ virtual void emitDirectiveSetMips64() = 0;
+ virtual void emitDirectiveSetMips64R2() = 0;
+ virtual void emitDirectiveSetDsp() = 0;
};
// This part is for ascii assembly output
@@ -27,18 +52,76 @@ class MipsTargetAsmStreamer : public MipsTargetStreamer {
formatted_raw_ostream &OS;
public:
- MipsTargetAsmStreamer(formatted_raw_ostream &OS);
- virtual void emitMipsHackELFFlags(unsigned Flags);
- virtual void emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val);
+ MipsTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+ virtual void emitDirectiveSetMicroMips();
+ virtual void emitDirectiveSetNoMicroMips();
+ virtual void emitDirectiveSetMips16();
+ virtual void emitDirectiveSetNoMips16();
+
+ virtual void emitDirectiveSetReorder();
+ virtual void emitDirectiveSetNoReorder();
+ virtual void emitDirectiveSetMacro();
+ virtual void emitDirectiveSetNoMacro();
+ virtual void emitDirectiveSetAt();
+ virtual void emitDirectiveSetNoAt();
+ virtual void emitDirectiveEnd(StringRef Name);
+
+ virtual void emitDirectiveEnt(const MCSymbol &Symbol);
+ virtual void emitDirectiveAbiCalls();
+ virtual void emitDirectiveOptionPic0();
+ virtual void emitDirectiveOptionPic2();
+ virtual void emitFrame(unsigned StackReg, unsigned StackSize,
+ unsigned ReturnReg);
+ virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff);
+ virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff);
+
+ virtual void emitDirectiveSetMips32R2();
+ virtual void emitDirectiveSetMips64();
+ virtual void emitDirectiveSetMips64R2();
+ virtual void emitDirectiveSetDsp();
};
// This part is for ELF object output
class MipsTargetELFStreamer : public MipsTargetStreamer {
+ bool MicroMipsEnabled;
+ const MCSubtargetInfo &STI;
+ bool Pic;
+
public:
+ bool isMicroMipsEnabled() const { return MicroMipsEnabled; }
MCELFStreamer &getStreamer();
- virtual void emitMipsHackELFFlags(unsigned Flags);
- virtual void emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val);
+ MipsTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+
+ virtual void emitLabel(MCSymbol *Symbol) override;
+ virtual void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
+ void finish() override;
+
+ virtual void emitDirectiveSetMicroMips();
+ virtual void emitDirectiveSetNoMicroMips();
+ virtual void emitDirectiveSetMips16();
+ virtual void emitDirectiveSetNoMips16();
+
+ virtual void emitDirectiveSetReorder();
+ virtual void emitDirectiveSetNoReorder();
+ virtual void emitDirectiveSetMacro();
+ virtual void emitDirectiveSetNoMacro();
+ virtual void emitDirectiveSetAt();
+ virtual void emitDirectiveSetNoAt();
+ virtual void emitDirectiveEnd(StringRef Name);
+
+ virtual void emitDirectiveEnt(const MCSymbol &Symbol);
+ virtual void emitDirectiveAbiCalls();
+ virtual void emitDirectiveOptionPic0();
+ virtual void emitDirectiveOptionPic2();
+ virtual void emitFrame(unsigned StackReg, unsigned StackSize,
+ unsigned ReturnReg);
+ virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff);
+ virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff);
+
+ virtual void emitDirectiveSetMips32R2();
+ virtual void emitDirectiveSetMips64();
+ virtual void emitDirectiveSetMips64R2();
+ virtual void emitDirectiveSetDsp();
};
}
-
#endif
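Note on the MipsTargetStreamer change above: the two emitMipsHack* hooks are replaced by one virtual per assembler directive, so the asm printer now requests ".set noreorder", ".ent", ".frame" and friends through named methods instead of patching ELF flags after the fact. A minimal sketch of what an override in the ASCII streamer plausibly looks like, assuming it simply prints the directive text (the real definitions live in MipsTargetStreamer.cpp and are not reproduced here):

// Hypothetical illustration only; the actual body is defined in
// MipsTargetStreamer.cpp. The ASCII streamer writes the directive text
// to its formatted_raw_ostream member (OS, declared in the class above).
void MipsTargetAsmStreamer::emitDirectiveSetNoReorder() {
  OS << "\t.set\tnoreorder\n";
}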
diff --git a/lib/Target/Mips/TargetInfo/Android.mk b/lib/Target/Mips/TargetInfo/Android.mk
index 8db12d3..173d05b 100644
--- a/lib/Target/Mips/TargetInfo/Android.mk
+++ b/lib/Target/Mips/TargetInfo/Android.mk
@@ -28,6 +28,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -43,3 +44,4 @@ LOCAL_C_INCLUDES += $(LOCAL_PATH)/..
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
\ No newline at end of file
diff --git a/lib/Target/Mips/TargetInfo/CMakeLists.txt b/lib/Target/Mips/TargetInfo/CMakeLists.txt
index 4172d00..3347a99 100644
--- a/lib/Target/Mips/TargetInfo/CMakeLists.txt
+++ b/lib/Target/Mips/TargetInfo/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMMipsInfo
MipsTargetInfo.cpp
)
-
-add_dependencies(LLVMMipsInfo MipsCommonTableGen)
diff --git a/lib/Target/Mips/TargetInfo/LLVMBuild.txt b/lib/Target/Mips/TargetInfo/LLVMBuild.txt
index 2d42568..6235bfc 100644
--- a/lib/Target/Mips/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/Mips/TargetInfo/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = MipsInfo
parent = Mips
-required_libraries = MC Support Target
+required_libraries = Support
add_to_library_groups = Mips
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index 4f1324c..029118a 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -16,7 +16,6 @@ set(NVPTXCodeGen_sources
NVPTXRegisterInfo.cpp
NVPTXSubtarget.cpp
NVPTXTargetMachine.cpp
- NVPTXSplitBBatBar.cpp
NVPTXLowerAggrCopies.cpp
NVPTXutil.cpp
NVPTXAllocaHoisting.cpp
@@ -24,14 +23,13 @@ set(NVPTXCodeGen_sources
NVPTXUtilities.cpp
NVVMReflect.cpp
NVPTXGenericToNVVM.cpp
+ NVPTXAssignValidGlobalNames.cpp
NVPTXPrologEpilogPass.cpp
NVPTXMCExpr.cpp
)
add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})
-add_dependencies(LLVMNVPTXCodeGen NVPTXCommonTableGen intrinsics_gen)
-
add_subdirectory(TargetInfo)
add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/NVPTX/InstPrinter/CMakeLists.txt b/lib/Target/NVPTX/InstPrinter/CMakeLists.txt
index ae4c751..bb6c8ab 100644
--- a/lib/Target/NVPTX/InstPrinter/CMakeLists.txt
+++ b/lib/Target/NVPTX/InstPrinter/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMNVPTXAsmPrinter
NVPTXInstPrinter.cpp
)
-
-add_dependencies(LLVMNVPTXAsmPrinter NVPTXCommonTableGen)
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
index d5be0e4..cf165be 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -13,13 +13,13 @@
#define DEBUG_TYPE "asm-printer"
#include "InstPrinter/NVPTXInstPrinter.h"
-#include "NVPTX.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTX.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include <cctype>
diff --git a/lib/Target/NVPTX/LLVMBuild.txt b/lib/Target/NVPTX/LLVMBuild.txt
index e2d6ed2..e805aba 100644
--- a/lib/Target/NVPTX/LLVMBuild.txt
+++ b/lib/Target/NVPTX/LLVMBuild.txt
@@ -28,5 +28,5 @@ has_asmprinter = 1
type = Library
name = NVPTXCodeGen
parent = NVPTX
-required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXDesc NVPTXInfo SelectionDAG Support Target TransformUtils
+required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXAsmPrinter NVPTXDesc NVPTXInfo SelectionDAG Support Target
add_to_library_groups = NVPTX
diff --git a/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt b/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt
index a030d9f..dbbf235 100644
--- a/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt
@@ -2,8 +2,3 @@ add_llvm_library(LLVMNVPTXDesc
NVPTXMCAsmInfo.cpp
NVPTXMCTargetDesc.cpp
)
-
-add_dependencies(LLVMNVPTXDesc NVPTXCommonTableGen)
-
-# Hack: we need to include 'main' target directory to grab private headers
-#include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index f2784b8..366341a 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -33,8 +33,6 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const StringRef &TT) {
CommentString = "//";
- PrivateGlobalPrefix = "$L__";
-
HasSetDirective = false;
HasSingleParameterDotFile = false;
@@ -49,7 +47,6 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const StringRef &TT) {
Data16bitsDirective = " .b16 ";
Data32bitsDirective = " .b32 ";
Data64bitsDirective = " .b64 ";
- PrivateGlobalPrefix = "";
ZeroDirective = " .b8";
AsciiDirective = " .b8";
AscizDirective = " .b8";
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index 871bac9..3cf6e4b 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "NVPTXMCTargetDesc.h"
-#include "NVPTXMCAsmInfo.h"
#include "InstPrinter/NVPTXInstPrinter.h"
+#include "NVPTXMCAsmInfo.h"
#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 490b49d..8cbdd47 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -61,6 +61,7 @@ inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) {
FunctionPass *
createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel);
+ModulePass *createNVPTXAssignValidGlobalNamesPass();
ModulePass *createGenericToNVVMPass();
ModulePass *createNVVMReflectPass();
ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td
index 6183a75..d78b4e8 100644
--- a/lib/Target/NVPTX/NVPTX.td
+++ b/lib/Target/NVPTX/NVPTX.td
@@ -57,12 +57,6 @@ def : Proc<"sm_35", [SM35]>;
def NVPTXInstrInfo : InstrInfo {
}
-def NVPTXAsmWriter : AsmWriter {
- bit isMCAsmWriter = 1;
- string AsmWriterClassName = "InstPrinter";
-}
-
def NVPTX : Target {
let InstructionSet = NVPTXInstrInfo;
- let AssemblyWriters = [NVPTXAsmWriter];
}
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
index 19d73c5..22404b7 100644
--- a/lib/Target/NVPTX/NVPTXAllocaHoisting.h
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -31,7 +31,8 @@ public:
NVPTXAllocaHoisting() : FunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<DataLayout>();
+ AU.addRequired<DataLayoutPass>();
+ AU.addPreserved("stack-protector");
AU.addPreserved<MachineFunctionAnalysis>();
}
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 7552fe7..97e2cc6 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "NVPTXAsmPrinter.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
#include "MCTargetDesc/NVPTXMCAsmInfo.h"
#include "NVPTX.h"
#include "NVPTXInstrInfo.h"
@@ -20,19 +21,18 @@
#include "NVPTXRegisterInfo.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
-#include "InstPrinter/NVPTXInstPrinter.h"
#include "cl_common_defines.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/DebugInfo.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/MC/MCStreamer.h"
@@ -43,7 +43,6 @@
#include "llvm/Support/Path.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TimeValue.h"
-#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include <sstream>
using namespace llvm;
@@ -149,7 +148,7 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
std::string S;
raw_string_ostream OS(S);
OS << "Unsupported expression in static initializer: ";
- WriteAsOperand(OS, CE, /*PrintType=*/ false,
+ CE->printAsOperand(OS, /*PrintType=*/ false,
!AP.MF ? 0 : AP.MF->getFunction()->getParent());
report_fatal_error(OS.str());
}
@@ -308,7 +307,7 @@ void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst Inst;
lowerToMCInst(MI, Inst);
- OutStreamer.EmitInstruction(Inst);
+ EmitToStreamer(OutStreamer, Inst);
}
void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
@@ -430,7 +429,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
O << " (";
if (isABI) {
- if (Ty->isPrimitiveType() || Ty->isIntegerTy()) {
+ if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) {
unsigned size = 0;
if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) {
size = ITy->getBitWidth();
@@ -700,12 +699,11 @@ static bool usedInGlobalVarDef(const Constant *C) {
return true;
}
- for (Value::const_use_iterator ui = C->use_begin(), ue = C->use_end();
- ui != ue; ++ui) {
- const Constant *C = dyn_cast<Constant>(*ui);
- if (usedInGlobalVarDef(C))
- return true;
- }
+ for (const User *U : C->users())
+ if (const Constant *C = dyn_cast<Constant>(U))
+ if (usedInGlobalVarDef(C))
+ return true;
+
return false;
}
@@ -731,11 +729,10 @@ static bool usedInOneFunc(const User *U, Function const *&oneFunc) {
(md->getName().str() == "llvm.dbg.sp")))
return true;
- for (User::const_use_iterator ui = U->use_begin(), ue = U->use_end();
- ui != ue; ++ui) {
- if (usedInOneFunc(*ui, oneFunc) == false)
+ for (const User *UU : U->users())
+ if (usedInOneFunc(UU, oneFunc) == false)
return false;
- }
+
return true;
}
@@ -766,12 +763,11 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) {
static bool useFuncSeen(const Constant *C,
llvm::DenseMap<const Function *, bool> &seenMap) {
- for (Value::const_use_iterator ui = C->use_begin(), ue = C->use_end();
- ui != ue; ++ui) {
- if (const Constant *cu = dyn_cast<Constant>(*ui)) {
+ for (const User *U : C->users()) {
+ if (const Constant *cu = dyn_cast<Constant>(U)) {
if (useFuncSeen(cu, seenMap))
return true;
- } else if (const Instruction *I = dyn_cast<Instruction>(*ui)) {
+ } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
const BasicBlock *bb = I->getParent();
if (!bb)
continue;
@@ -798,10 +794,8 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
emitDeclaration(F, O);
continue;
}
- for (Value::const_use_iterator iter = F->use_begin(),
- iterEnd = F->use_end();
- iter != iterEnd; ++iter) {
- if (const Constant *C = dyn_cast<Constant>(*iter)) {
+ for (const User *U : F->users()) {
+ if (const Constant *C = dyn_cast<Constant>(U)) {
if (usedInGlobalVarDef(C)) {
// The use is in the initialization of a global variable
// that is a function pointer, so print a declaration
@@ -817,9 +811,9 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
}
}
- if (!isa<Instruction>(*iter))
+ if (!isa<Instruction>(U))
continue;
- const Instruction *instr = cast<Instruction>(*iter);
+ const Instruction *instr = cast<Instruction>(U);
const BasicBlock *bb = instr->getParent();
if (!bb)
continue;
@@ -844,10 +838,7 @@ void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
DbgFinder.processModule(M);
unsigned i = 1;
- for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
- E = DbgFinder.compile_unit_end();
- I != E; ++I) {
- DICompileUnit DIUnit(*I);
+ for (DICompileUnit DIUnit : DbgFinder.compile_units()) {
StringRef Filename(DIUnit.getFilename());
StringRef Dirname(DIUnit.getDirectory());
SmallString<128> FullPathName = Dirname;
@@ -862,10 +853,7 @@ void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
++i;
}
- for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
- E = DbgFinder.subprogram_end();
- I != E; ++I) {
- DISubprogram SP(*I);
+ for (DISubprogram SP : DbgFinder.subprograms()) {
StringRef Filename(SP.getFilename());
StringRef Dirname(SP.getDirectory());
SmallString<128> FullPathName = Dirname;
@@ -895,7 +883,7 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
.Initialize(OutContext, TM);
- Mang = new Mangler(&TM);
+ Mang = new Mangler(TM.getDataLayout());
// Emit header before any dwarf directives are emitted below.
emitHeader(M, OS1);
@@ -1207,7 +1195,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
else
O << " .align " << GVar->getAlignment();
- if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) {
+ if (ETy->isSingleValueType()) {
O << " .";
// Special case: ABI requires that we use .u8 for predicates
if (ETy->isIntegerTy(1))
@@ -1378,7 +1366,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
else
O << " .align " << GVar->getAlignment();
- if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) {
+ if (ETy->isSingleValueType()) {
O << " .";
O << getPTXFundamentalTypeStr(ETy);
O << " ";
@@ -1410,7 +1398,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
}
static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) {
- if (Ty->isPrimitiveType() || Ty->isIntegerTy() || isa<PointerType>(Ty))
+ if (Ty->isSingleValueType())
return TD->getPrefTypeAlignment(Ty);
const ArrayType *ATy = dyn_cast<ArrayType>(Ty);
@@ -1523,8 +1511,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
}
if (PAL.hasAttribute(paramIndex + 1, Attribute::ByVal) == false) {
- if (Ty->isVectorTy()) {
- // Just print .param .b8 .align <a> .param[size];
+ if (Ty->isAggregateType() || Ty->isVectorTy()) {
+ // Just print .param .align <a> .b8 .param[size];
// <a> = PAL.getparamalignment
// size = typeallocsize of element type
unsigned align = PAL.getParamAlignment(paramIndex + 1);
@@ -1580,7 +1568,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
continue;
}
// Non-kernel function, just print .param .b<size> for ABI
- // and .reg .b<size> for non ABY
+ // and .reg .b<size> for non-ABI
unsigned sz = 0;
if (isa<IntegerType>(Ty)) {
sz = cast<IntegerType>(Ty)->getBitWidth();
@@ -1604,7 +1592,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
Type *ETy = PTy->getElementType();
if (isABI || isKernelFunc) {
- // Just print .param .b8 .align <a> .param[size];
+ // Just print .param .align <a> .b8 .param[size];
// <a> = PAL.getparamalignment
// size = typeallocsize of element type
unsigned align = PAL.getParamAlignment(paramIndex + 1);
@@ -2087,21 +2075,6 @@ void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
O << *getSymbol(MO.getGlobal());
break;
- case MachineOperand::MO_ExternalSymbol: {
- const char *symbname = MO.getSymbolName();
- if (strstr(symbname, ".PARAM") == symbname) {
- unsigned index;
- sscanf(symbname + 6, "%u[];", &index);
- printParamName(index, O);
- } else if (strstr(symbname, ".HLPPARAM") == symbname) {
- unsigned index;
- sscanf(symbname + 9, "%u[];", &index);
- O << *CurrentFnSym << "_param_" << index << "_offset";
- } else
- O << symbname;
- break;
- }
-
case MachineOperand::MO_MachineBasicBlock:
O << *MO.getMBB()->getSymbol();
return;
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 3abe5d1..7162420 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -27,7 +27,6 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetMachine.h"
#include <fstream>
@@ -198,14 +197,11 @@ private:
void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const {}
- void printGlobalVariable(const GlobalVariable *GVar);
void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier,
raw_ostream &O);
void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
const char *Modifier = 0);
void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const;
- // definition autogenerated.
- void printInstruction(const MachineInstr *MI, raw_ostream &O);
void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O,
bool = false);
void printParamName(int paramIndex, raw_ostream &O);
diff --git a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
new file mode 100644
index 0000000..158c482
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -0,0 +1,84 @@
+//===-- NVPTXAssignValidGlobalNames.cpp - Assign valid names to globals ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Clean up the names of global variables in the module to not contain symbols
+// that are invalid in PTX.
+//
+// Currently NVPTX, like other backends, relies on generic symbol name
+// sanitizing done by MC. However, the ptxas assembler is more stringent and
+// disallows some additional characters in symbol names. This pass makes sure
+// such names do not reach MC at all.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+using namespace llvm;
+
+namespace {
+/// \brief NVPTXAssignValidGlobalNames
+class NVPTXAssignValidGlobalNames : public ModulePass {
+public:
+ static char ID;
+ NVPTXAssignValidGlobalNames() : ModulePass(ID) {}
+
+ virtual bool runOnModule(Module &M);
+
+ /// \brief Clean up the name to remove symbols invalid in PTX.
+ std::string cleanUpName(StringRef Name);
+};
+}
+
+char NVPTXAssignValidGlobalNames::ID = 0;
+
+namespace llvm {
+void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(NVPTXAssignValidGlobalNames, "nvptx-assign-valid-global-names",
+ "Assign valid PTX names to globals", false, false)
+
+bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) {
+ for (GlobalVariable &GV : M.globals()) {
+ // We are only allowed to rename local symbols.
+ if (GV.hasLocalLinkage()) {
+ // setName doesn't do extra work if the name does not change.
+ // Note: this does not create collisions - if setName is asked to set the
+ // name to something that already exists, it adds a proper postfix to
+ // avoid collisions.
+ GV.setName(cleanUpName(GV.getName()));
+ }
+ }
+
+ return true;
+}
+
+std::string NVPTXAssignValidGlobalNames::cleanUpName(StringRef Name) {
+ std::string ValidName;
+ raw_string_ostream ValidNameStream(ValidName);
+ for (unsigned I = 0, E = Name.size(); I != E; ++I) {
+ char C = Name[I];
+ if (C == '.' || C == '@') {
+ ValidNameStream << "_$_";
+ } else {
+ ValidNameStream << C;
+ }
+ }
+
+ return ValidNameStream.str();
+}
+
+ModulePass *llvm::createNVPTXAssignValidGlobalNamesPass() {
+ return new NVPTXAssignValidGlobalNames();
+}
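To make the renaming rule of the new pass concrete, here is a small standalone sketch, not part of the patch; sanitizePTXName is a hypothetical stand-in that mirrors the cleanUpName() logic shown above ('.' and '@' are rejected by ptxas, so both become "_$_"):

// Standalone illustration of the cleanUpName() rule above.
#include <iostream>
#include <string>

// Hypothetical helper mirroring NVPTXAssignValidGlobalNames::cleanUpName().
static std::string sanitizePTXName(const std::string &Name) {
  std::string Valid;
  for (char C : Name) {
    if (C == '.' || C == '@')
      Valid += "_$_";   // same replacement token the pass uses
    else
      Valid += C;
  }
  return Valid;
}

int main() {
  std::cout << sanitizePTXName("str.1") << '\n';   // prints "str_$_1"
  std::cout << sanitizePTXName("foo@bar") << '\n'; // prints "foo_$_bar"
  return 0;
}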
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 9fb0dd8..45f0734 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -13,20 +13,19 @@
//===----------------------------------------------------------------------===//
#include "NVPTX.h"
-#include "NVPTXUtilities.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
-
-#include "llvm/PassManager.h"
+#include "NVPTXUtilities.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
-#include "llvm/ADT/ValueMap.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/PassManager.h"
using namespace llvm;
@@ -64,7 +63,7 @@ private:
GVMapTy GVMap;
ConstantToValueMapTy ConstantToValueMap;
};
-}
+} // end namespace
char GenericToNVVM::ID = 0;
@@ -147,10 +146,8 @@ bool GenericToNVVM::runOnModule(Module &M) {
// variable initializers, as other uses have been already been removed
// while walking through the instructions in function definitions.
for (Value::use_iterator UI = GV->use_begin(), UE = GV->use_end();
- UI != UE;) {
- Use &U = (UI++).getUse();
- U.set(BitCastNewGV);
- }
+ UI != UE;)
+ (UI++)->set(BitCastNewGV);
std::string Name = GV->getName();
GV->removeDeadConstantUsers();
GV->eraseFromParent();
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 4b8b306..bd08d2d 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -162,6 +162,9 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
case NVPTXISD::StoreParamU32:
ResNode = SelectStoreParam(N);
break;
+ case ISD::ADDRSPACECAST:
+ ResNode = SelectAddrSpaceCast(N);
+ break;
default:
break;
}
@@ -191,6 +194,66 @@ static unsigned int getCodeAddrSpace(MemSDNode *N,
return NVPTX::PTXLdStInstCode::GENERIC;
}
+SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
+ SDValue Src = N->getOperand(0);
+ AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
+ unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
+ unsigned DstAddrSpace = CastN->getDestAddressSpace();
+
+ assert(SrcAddrSpace != DstAddrSpace &&
+ "addrspacecast must be between different address spaces");
+
+ if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
+ // Specific to generic
+ unsigned Opc;
+ switch (SrcAddrSpace) {
+ default: report_fatal_error("Bad address space in addrspacecast");
+ case ADDRESS_SPACE_GLOBAL:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_global_yes_64
+ : NVPTX::cvta_global_yes;
+ break;
+ case ADDRESS_SPACE_SHARED:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_shared_yes_64
+ : NVPTX::cvta_shared_yes;
+ break;
+ case ADDRESS_SPACE_CONST:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_const_yes_64
+ : NVPTX::cvta_const_yes;
+ break;
+ case ADDRESS_SPACE_LOCAL:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_local_yes_64
+ : NVPTX::cvta_local_yes;
+ break;
+ }
+ return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
+ } else {
+ // Generic to specific
+ if (SrcAddrSpace != 0)
+ report_fatal_error("Cannot cast between two non-generic address spaces");
+ unsigned Opc;
+ switch (DstAddrSpace) {
+ default: report_fatal_error("Bad address space in addrspacecast");
+ case ADDRESS_SPACE_GLOBAL:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_global_yes_64
+ : NVPTX::cvta_to_global_yes;
+ break;
+ case ADDRESS_SPACE_SHARED:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_shared_yes_64
+ : NVPTX::cvta_to_shared_yes;
+ break;
+ case ADDRESS_SPACE_CONST:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_const_yes_64
+ : NVPTX::cvta_to_const_yes;
+ break;
+ case ADDRESS_SPACE_LOCAL:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_local_yes_64
+ : NVPTX::cvta_to_local_yes;
+ break;
+ }
+ return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
+ }
+}
+
SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
SDLoc dl(N);
LoadSDNode *LD = cast<LoadSDNode>(N);
@@ -2014,7 +2077,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
} else {
EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
- VTs = CurDAG->getVTList(&EVTs[0], 5);
+ VTs = CurDAG->getVTList(&EVTs[0], array_lengthof(EVTs));
}
unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
@@ -2440,24 +2503,3 @@ bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
}
return true;
}
-
-// Return true if N is a undef or a constant.
-// If N was undef, return a (i8imm 0) in Retval
-// If N was imm, convert it to i8imm and return in Retval
-// Note: The convert to i8imm is required, otherwise the
-// pattern matcher inserts a bunch of IMOVi8rr to convert
-// the imm to i8imm, and this causes instruction selection
-// to fail.
-bool NVPTXDAGToDAGISel::UndefOrImm(SDValue Op, SDValue N, SDValue &Retval) {
- if (!(N.getOpcode() == ISD::UNDEF) && !(N.getOpcode() == ISD::Constant))
- return false;
-
- if (N.getOpcode() == ISD::UNDEF)
- Retval = CurDAG->getTargetConstant(0, MVT::i8);
- else {
- ConstantSDNode *cn = cast<ConstantSDNode>(N.getNode());
- unsigned retval = cn->getZExtValue();
- Retval = CurDAG->getTargetConstant(retval, MVT::i8);
- }
- return true;
-}
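The new SelectAddrSpaceCast hook above picks a cvta (or cvta.to) machine opcode per source/destination address space and pointer width. As a quick reference, a standalone sketch of the specific-to-generic direction of that mapping; the numeric address-space IDs (global=1, shared=3, const=4, local=5) follow the usual NVPTX convention and are an assumption here, since NVPTXBaseInfo.h is not shown in this diff:

// Illustration only: the PTX conversion instruction that corresponds to
// the machine opcode selected for a specific-to-generic addrspacecast.
#include <iostream>
#include <string>

// Assumed NVPTX address-space numbering (see NVPTXBaseInfo.h).
enum { AS_GENERIC = 0, AS_GLOBAL = 1, AS_SHARED = 3, AS_CONST = 4, AS_LOCAL = 5 };

static std::string cvtaForSrcSpace(unsigned SrcAS, bool Is64Bit) {
  std::string Mnemonic = "cvta.";
  switch (SrcAS) {
  case AS_GLOBAL: Mnemonic += "global"; break;
  case AS_SHARED: Mnemonic += "shared"; break;
  case AS_CONST:  Mnemonic += "const";  break;
  case AS_LOCAL:  Mnemonic += "local";  break;
  default:        return "<error: bad address space>";
  }
  return Mnemonic + (Is64Bit ? ".u64" : ".u32");
}

int main() {
  std::cout << cvtaForSrcSpace(AS_GLOBAL, true) << '\n';  // cvta.global.u64
  std::cout << cvtaForSrcSpace(AS_SHARED, false) << '\n'; // cvta.shared.u32
  return 0;
}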
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index d961e50..93ad169 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -67,6 +67,7 @@ private:
SDNode *SelectLoadParam(SDNode *N);
SDNode *SelectStoreRetval(SDNode *N);
SDNode *SelectStoreParam(SDNode *N);
+ SDNode *SelectAddrSpaceCast(SDNode *N);
inline SDValue getI32Imm(unsigned Imm) {
return CurDAG->getTargetConstant(Imm, MVT::i32);
@@ -91,7 +92,5 @@ private:
bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
- bool UndefOrImm(SDValue Op, SDValue N, SDValue &Retval);
-
};
}
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 6a8be75..8e25a65 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
@@ -29,7 +30,6 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/Support/CallSite.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -331,8 +331,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
-bool NVPTXTargetLowering::shouldSplitVectorElementType(EVT VT) const {
- return VT == MVT::i1;
+bool NVPTXTargetLowering::shouldSplitVectorType(EVT VT) const {
+ return VT.getScalarType() == MVT::i1;
}
SDValue
@@ -361,7 +361,7 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
O << "()";
} else {
O << "(";
- if (retTy->isPrimitiveType() || retTy->isIntegerTy()) {
+ if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
unsigned size = 0;
if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) {
size = ITy->getBitWidth();
@@ -856,8 +856,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// .param .align 16 .b8 retval0[<size-in-bytes>], or
// .param .b<size-in-bits> retval0
unsigned resultsz = TD->getTypeAllocSizeInBits(retTy);
- if (retTy->isPrimitiveType() || retTy->isIntegerTy() ||
- retTy->isPointerTy()) {
+ if (retTy->isSingleValueType()) {
// Scalar needs to be at least 32bit wide
if (resultsz < 32)
resultsz = 32;
@@ -1259,7 +1258,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
// Since StoreV2 is a target node, we cannot rely on DAG type legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // stored type to i16 and propogate the "real" type as the memory type.
+ // stored type to i16 and propagate the "real" type as the memory type.
bool NeedExt = false;
if (EltVT.getSizeInBits() < 16)
NeedExt = true;
@@ -2075,7 +2074,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
// Since LoadV2 is a target node, we cannot rely on DAG type legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // loaded type to i16 and propogate the "real" type as the memory type.
+ // loaded type to i16 and propagate the "real" type as the memory type.
bool NeedTrunc = false;
if (EltVT.getSizeInBits() < 16) {
EltVT = MVT::i16;
@@ -2162,7 +2161,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
// Since LDU/LDG are target nodes, we cannot rely on DAG type
// legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // loaded type to i16 and propogate the "real" type as the memory type.
+ // loaded type to i16 and propagate the "real" type as the memory type.
bool NeedTrunc = false;
if (EltVT.getSizeInBits() < 16) {
EltVT = MVT::i16;
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 66e708f..c1e8c21 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -141,7 +141,7 @@ public:
// PTX always uses 32-bit shift amounts
virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
- virtual bool shouldSplitVectorElementType(EVT VT) const;
+ virtual bool shouldSplitVectorType(EVT VT) const override;
private:
const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index b23f1e4..fbcd0e4 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -689,12 +689,24 @@ def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst),
[(set Float32Regs:$dst,
(fdiv Float32Regs:$a, Float32Regs:$b))]>,
Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.approx.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV32approxrr : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.approx.f32 \t$dst, $a, $b;",
[(set Float32Regs:$dst,
(fdiv Float32Regs:$a, Float32Regs:$b))]>,
Requires<[do_DIVF32_APPROX]>;
+def FDIV32approxri : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.approx.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
//
// F32 Semi-accurate reciprocal
//
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index 7c257b4..f0c3663 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -16,12 +16,12 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-#include "llvm/Support/InstIterator.h"
using namespace llvm;
@@ -104,7 +104,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
SmallVector<MemTransferInst *, 4> aggrMemcpys;
SmallVector<MemSetInst *, 4> aggrMemsets;
- DataLayout *TD = &getAnalysis<DataLayout>();
+ const DataLayout *DL = &getAnalysis<DataLayoutPass>().getDataLayout();
LLVMContext &Context = F.getParent()->getContext();
//
@@ -120,10 +120,10 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
if (load->hasOneUse() == false)
continue;
- if (TD->getTypeStoreSize(load->getType()) < MaxAggrCopySize)
+ if (DL->getTypeStoreSize(load->getType()) < MaxAggrCopySize)
continue;
- User *use = *(load->use_begin());
+ User *use = load->user_back();
if (StoreInst *store = dyn_cast<StoreInst>(use)) {
if (store->getOperand(0) != load) //getValueOperand
continue;
@@ -163,10 +163,10 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
//
for (unsigned i = 0, e = aggrLoads.size(); i != e; ++i) {
LoadInst *load = aggrLoads[i];
- StoreInst *store = dyn_cast<StoreInst>(*load->use_begin());
+ StoreInst *store = dyn_cast<StoreInst>(*load->user_begin());
Value *srcAddr = load->getOperand(0);
Value *dstAddr = store->getOperand(1);
- unsigned numLoads = TD->getTypeStoreSize(load->getType());
+ unsigned numLoads = DL->getTypeStoreSize(load->getType());
Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads);
convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(),
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
index 286e753..c9aa87d 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -28,7 +28,8 @@ struct NVPTXLowerAggrCopies : public FunctionPass {
NVPTXLowerAggrCopies() : FunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<DataLayout>();
+ AU.addRequired<DataLayoutPass>();
+ AU.addPreserved("stack-protector");
AU.addPreserved<MachineFunctionAnalysis>();
}
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 843ebed..d5b042a 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -14,14 +14,14 @@
//===----------------------------------------------------------------------===//
#include "NVPTX.h"
-#include "llvm/Pass.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
diff --git a/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
deleted file mode 100644
index b64c308..0000000
--- a/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-//===- NVPTXSplitBBatBar.cpp - Split BB at Barrier --*- C++ -*--===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// Split basic blocks so that a basic block that contains a barrier instruction
-// only contains the barrier instruction.
-//
-//===----------------------------------------------------------------------===//
-
-#include "NVPTXSplitBBatBar.h"
-#include "NVPTXUtilities.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/InstIterator.h"
-
-using namespace llvm;
-
-namespace llvm { FunctionPass *createSplitBBatBarPass(); }
-
-char NVPTXSplitBBatBar::ID = 0;
-
-bool NVPTXSplitBBatBar::runOnFunction(Function &F) {
-
- SmallVector<Instruction *, 4> SplitPoints;
- bool changed = false;
-
- // Collect all the split points in SplitPoints
- for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
- BasicBlock::iterator IB = BI->begin();
- BasicBlock::iterator II = IB;
- BasicBlock::iterator IE = BI->end();
-
- // Skit the first instruction. No splitting is needed at this
- // point even if this is a bar.
- while (II != IE) {
- if (IntrinsicInst *inst = dyn_cast<IntrinsicInst>(II)) {
- Intrinsic::ID id = inst->getIntrinsicID();
- // If this is a barrier, split at this instruction
- // and the next instruction.
- if (llvm::isBarrierIntrinsic(id)) {
- if (II != IB)
- SplitPoints.push_back(II);
- II++;
- if ((II != IE) && (!II->isTerminator())) {
- SplitPoints.push_back(II);
- II++;
- }
- continue;
- }
- }
- II++;
- }
- }
-
- for (unsigned i = 0; i != SplitPoints.size(); i++) {
- changed = true;
- Instruction *inst = SplitPoints[i];
- inst->getParent()->splitBasicBlock(inst, "bar_split");
- }
-
- return changed;
-}
-
-// This interface will most likely not be necessary, because this pass will
-// not be invoked by the driver, but will be used as a prerequisite to
-// another pass.
-FunctionPass *llvm::createSplitBBatBarPass() { return new NVPTXSplitBBatBar(); }
diff --git a/lib/Target/NVPTX/NVPTXSplitBBatBar.h b/lib/Target/NVPTX/NVPTXSplitBBatBar.h
deleted file mode 100644
index bdafba9..0000000
--- a/lib/Target/NVPTX/NVPTXSplitBBatBar.h
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h ---------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the NVIDIA specific declarations
-// for splitting basic blocks at barrier instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef NVPTX_SPLIT_BB_AT_BAR_H
-#define NVPTX_SPLIT_BB_AT_BAR_H
-
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/Pass.h"
-
-namespace llvm {
-
-// actual analysis class, which is a functionpass
-struct NVPTXSplitBBatBar : public FunctionPass {
- static char ID;
-
- NVPTXSplitBBatBar() : FunctionPass(ID) {}
- void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addPreserved<MachineFunctionAnalysis>();
- }
- virtual bool runOnFunction(Function &F);
-
- virtual const char *getPassName() const {
- return "Split basic blocks at barrier";
- }
-};
-
-extern FunctionPass *createSplitBBatBarPass();
-}
-
-#endif //NVPTX_SPLIT_BB_AT_BAR_H
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index 004be11..f99bebd 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -74,21 +74,6 @@ public:
unsigned getPTXVersion() const { return PTXVersion; }
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
-
- std::string getDataLayout() const {
- const char *p;
- if (is64Bit())
- p = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-"
- "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-"
- "n16:32:64";
- else
- p = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-"
- "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-"
- "n16:32:64";
-
- return std::string(p);
- }
-
};
} // End llvm namespace
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 46edd6d..7d7d793 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -16,16 +16,15 @@
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXLowerAggrCopies.h"
-#include "NVPTXSplitBBatBar.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Assembly/PrintModulePass.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCStreamer.h"
@@ -50,6 +49,7 @@ using namespace llvm;
namespace llvm {
void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
+void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
}
extern "C" void LLVMInitializeNVPTXTarget() {
@@ -61,6 +61,18 @@ extern "C" void LLVMInitializeNVPTXTarget() {
// but it's very NVPTX-specific.
initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());
+ initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
+}
+
+static std::string computeDataLayout(const NVPTXSubtarget &ST) {
+ std::string Ret = "e";
+
+ if (!ST.is64Bit())
+ Ret += "-p:32:32";
+
+ Ret += "-i64:64-v16:16-v32:32-n16:32:64";
+
+ return Ret;
}
NVPTXTargetMachine::NVPTXTargetMachine(
@@ -68,7 +80,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(
const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL, bool is64bit)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, is64bit), DL(Subtarget.getDataLayout()),
+ Subtarget(TT, CPU, FS, is64bit), DL(computeDataLayout(Subtarget)),
InstrInfo(*this), TLInfo(*this), TSInfo(*this),
FrameLowering(
*this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ {
@@ -106,7 +118,7 @@ public:
virtual bool addPreRegAlloc();
virtual bool addPostRegAlloc();
- virtual FunctionPass *createTargetRegisterAllocator(bool) LLVM_OVERRIDE;
+ virtual FunctionPass *createTargetRegisterAllocator(bool) override;
virtual void addFastRegAlloc(FunctionPass *RegAllocPass);
virtual void addOptimizedRegAlloc(FunctionPass *RegAllocPass);
};
@@ -129,12 +141,12 @@ void NVPTXPassConfig::addIRPasses() {
disablePass(&TailDuplicateID);
TargetPassConfig::addIRPasses();
+ addPass(createNVPTXAssignValidGlobalNamesPass());
addPass(createGenericToNVVMPass());
}
bool NVPTXPassConfig::addInstSelector() {
addPass(createLowerAggrCopies());
- addPass(createSplitBBatBarPass());
addPass(createAllocaHoisting());
addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
return false;
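For reference, the strings produced by the new computeDataLayout() above can be checked with a standalone copy of the function. The sketch below is not part of the patch; it only replays the string construction for both pointer widths:

// Mirrors computeDataLayout() added to NVPTXTargetMachine.cpp:
// little-endian, an explicit 32-bit pointer spec only for 32-bit
// subtargets, then the shared i64/vector alignments and native widths.
#include <iostream>
#include <string>

static std::string computeDataLayout(bool Is64Bit) {
  std::string Ret = "e";
  if (!Is64Bit)
    Ret += "-p:32:32";
  Ret += "-i64:64-v16:16-v32:32-n16:32:64";
  return Ret;
}

int main() {
  std::cout << computeDataLayout(false) << '\n'; // e-p:32:32-i64:64-v16:16-v32:32-n16:32:64
  std::cout << computeDataLayout(true) << '\n';  // e-i64:64-v16:16-v32:32-n16:32:64
  return 0;
}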
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index 2a7394b..2a7281e 100644
--- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -46,7 +46,7 @@ public:
virtual ~NVPTXTargetObjectFile();
- virtual void Initialize(MCContext &ctx, const TargetMachine &TM) {
+ void Initialize(MCContext &ctx, const TargetMachine &TM) override {
TargetLoweringObjectFile::Initialize(ctx, TM);
TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText());
DataSection =
@@ -87,13 +87,13 @@ public:
new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
}
- virtual const MCSection *getSectionForConstant(SectionKind Kind) const {
+ const MCSection *getSectionForConstant(SectionKind Kind) const override {
return ReadOnlySection;
}
- virtual const MCSection *
- getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
- Mangler *Mang, const TargetMachine &TM) const {
+ const MCSection *getExplicitSectionGlobal(const GlobalValue *GV,
+ SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const override {
return DataSection;
}
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index 6786eb0..60a5173 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -24,7 +24,7 @@
#include <vector>
//#include <iostream>
#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/InstIterator.h"
+#include "llvm/IR/InstIterator.h"
using namespace llvm;
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 7406207..8b5444a 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass replaces occurences of __nvvm_reflect("string") with an
+// This pass replaces occurrences of __nvvm_reflect("string") with an
// integer based on -nvvm-reflect-list string=<int> option given to this pass.
// If an undefined string value is seen in a call to __nvvm_reflect("string"),
// a default value of 0 will be used.
@@ -18,13 +18,13 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/Pass.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Constants.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_os_ostream.h"
@@ -84,7 +84,7 @@ NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
char NVVMReflect::ID = 0;
INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
- "Replace occurences of __nvvm_reflect() calls with 0/1", false,
+ "Replace occurrences of __nvvm_reflect() calls with 0/1", false,
false)
static cl::list<std::string>
@@ -143,11 +143,9 @@ bool NVVMReflect::runOnModule(Module &M) {
// ConstantArray can be found successfully, see if it can be
// found in VarMap. If so, replace the uses of CallInst with the
// value found in VarMap. If not, replace the use with value 0.
- for (Value::use_iterator I = ReflectFunction->use_begin(),
- E = ReflectFunction->use_end();
- I != E; ++I) {
- assert(isa<CallInst>(*I) && "Only a call instruction can use _reflect");
- CallInst *Reflect = cast<CallInst>(*I);
+ for (User *U : ReflectFunction->users()) {
+ assert(isa<CallInst>(U) && "Only a call instruction can use _reflect");
+ CallInst *Reflect = cast<CallInst>(U);
assert((Reflect->getNumOperands() == 2) &&
"Only one operand expect for _reflect function");
diff --git a/lib/Target/NVPTX/TargetInfo/CMakeLists.txt b/lib/Target/NVPTX/TargetInfo/CMakeLists.txt
index 0bf1334..1beb40e 100644
--- a/lib/Target/NVPTX/TargetInfo/CMakeLists.txt
+++ b/lib/Target/NVPTX/TargetInfo/CMakeLists.txt
@@ -1,7 +1,3 @@
-#include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMNVPTXInfo
NVPTXTargetInfo.cpp
)
-
-add_dependencies(LLVMNVPTXInfo NVPTXCommonTableGen)
diff --git a/lib/Target/NVPTX/TargetInfo/LLVMBuild.txt b/lib/Target/NVPTX/TargetInfo/LLVMBuild.txt
index ef12b0e..af7ec27 100644
--- a/lib/Target/NVPTX/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/NVPTX/TargetInfo/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = NVPTXInfo
parent = NVPTX
-required_libraries = MC Support Target
+required_libraries = Support
add_to_library_groups = NVPTX
diff --git a/lib/Target/PowerPC/AsmParser/CMakeLists.txt b/lib/Target/PowerPC/AsmParser/CMakeLists.txt
index 3aa59c0..408858e 100644
--- a/lib/Target/PowerPC/AsmParser/CMakeLists.txt
+++ b/lib/Target/PowerPC/AsmParser/CMakeLists.txt
@@ -1,8 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/..
- ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMPowerPCAsmParser
PPCAsmParser.cpp
)
-
-add_dependencies(LLVMPowerPCAsmParser PowerPCCommonTableGen)
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index fe83fe1..8bb91cf 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -9,21 +9,22 @@
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCMCExpr.h"
-#include "llvm/MC/MCTargetAsmParser.h"
-#include "llvm/MC/MCStreamer.h"
+#include "PPCTargetStreamer.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -94,6 +95,44 @@ static unsigned VRegs[32] = {
PPC::V24, PPC::V25, PPC::V26, PPC::V27,
PPC::V28, PPC::V29, PPC::V30, PPC::V31
};
+static unsigned VSRegs[64] = {
+ PPC::VSL0, PPC::VSL1, PPC::VSL2, PPC::VSL3,
+ PPC::VSL4, PPC::VSL5, PPC::VSL6, PPC::VSL7,
+ PPC::VSL8, PPC::VSL9, PPC::VSL10, PPC::VSL11,
+ PPC::VSL12, PPC::VSL13, PPC::VSL14, PPC::VSL15,
+ PPC::VSL16, PPC::VSL17, PPC::VSL18, PPC::VSL19,
+ PPC::VSL20, PPC::VSL21, PPC::VSL22, PPC::VSL23,
+ PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27,
+ PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31,
+
+ PPC::VSH0, PPC::VSH1, PPC::VSH2, PPC::VSH3,
+ PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7,
+ PPC::VSH8, PPC::VSH9, PPC::VSH10, PPC::VSH11,
+ PPC::VSH12, PPC::VSH13, PPC::VSH14, PPC::VSH15,
+ PPC::VSH16, PPC::VSH17, PPC::VSH18, PPC::VSH19,
+ PPC::VSH20, PPC::VSH21, PPC::VSH22, PPC::VSH23,
+ PPC::VSH24, PPC::VSH25, PPC::VSH26, PPC::VSH27,
+ PPC::VSH28, PPC::VSH29, PPC::VSH30, PPC::VSH31
+};
+static unsigned VSFRegs[64] = {
+ PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+ PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+ PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+ PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+ PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+ PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+ PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+ PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+ PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+ PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+ PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+ PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+ PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+ PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+ PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
static unsigned CRBITRegs[32] = {
PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN,
PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN,
@@ -177,6 +216,7 @@ class PPCAsmParser : public MCTargetAsmParser {
MCAsmParser &Parser;
const MCInstrInfo &MII;
bool IsPPC64;
+ bool IsDarwin;
MCAsmParser &getParser() const { return Parser; }
MCAsmLexer &getLexer() const { return Parser.getLexer(); }
@@ -185,6 +225,7 @@ class PPCAsmParser : public MCTargetAsmParser {
bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
bool isPPC64() const { return IsPPC64; }
+ bool isDarwin() const { return IsDarwin; }
bool MatchRegisterName(const AsmToken &Tok,
unsigned &RegNo, int64_t &IntVal);
@@ -195,12 +236,14 @@ class PPCAsmParser : public MCTargetAsmParser {
PPCMCExpr::VariantKind &Variant);
const MCExpr *FixupVariantKind(const MCExpr *E);
bool ParseExpression(const MCExpr *&EVal);
+ bool ParseDarwinExpression(const MCExpr *&EVal);
bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
bool ParseDirectiveWord(unsigned Size, SMLoc L);
bool ParseDirectiveTC(unsigned Size, SMLoc L);
bool ParseDirectiveMachine(SMLoc L);
+ bool ParseDarwinDirectiveMachine(SMLoc L);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
@@ -227,6 +270,7 @@ public:
Triple TheTriple(STI.getTargetTriple());
IsPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
TheTriple.getArch() == Triple::ppc64le);
+ IsDarwin = TheTriple.isMacOSX();
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
}
@@ -339,6 +383,11 @@ public:
return (unsigned) Imm.Val;
}
+ unsigned getVSReg() const {
+ assert(isVSRegNumber() && "Invalid access!");
+ return (unsigned) Imm.Val;
+ }
+
unsigned getCCReg() const {
assert(isCCRegNumber() && "Invalid access!");
return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
@@ -356,6 +405,7 @@ public:
bool isToken() const { return Kind == Token; }
bool isImm() const { return Kind == Immediate || Kind == Expression; }
+ bool isU2Imm() const { return Kind == Immediate && isUInt<2>(getImm()); }
bool isU5Imm() const { return Kind == Immediate && isUInt<5>(getImm()); }
bool isS5Imm() const { return Kind == Immediate && isInt<5>(getImm()); }
bool isU6Imm() const { return Kind == Immediate && isUInt<6>(getImm()); }
@@ -376,6 +426,7 @@ public:
(Kind == Immediate && isInt<16>(getImm()) &&
(getImm() & 3) == 0); }
bool isRegNumber() const { return Kind == Immediate && isUInt<5>(getImm()); }
+ bool isVSRegNumber() const { return Kind == Immediate && isUInt<6>(getImm()); }
bool isCCRegNumber() const { return (Kind == Expression
&& isUInt<3>(getExprCRVal())) ||
(Kind == Immediate
@@ -442,6 +493,16 @@ public:
Inst.addOperand(MCOperand::CreateReg(VRegs[getReg()]));
}
+ void addRegVSRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(VSRegs[getVSReg()]));
+ }
+
+ void addRegVSFRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(VSFRegs[getVSReg()]));
+ }
+
void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::CreateReg(CRBITRegs[getCRBit()]));
@@ -867,7 +928,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// Post-process instructions (typically extended mnemonics)
ProcessInstruction(Inst, Operands);
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst);
+ Out.EmitInstruction(Inst, STI);
return false;
case Match_MissingFeature:
return Error(IDLoc, "instruction use requires an option to be enabled");
@@ -1081,10 +1142,16 @@ FixupVariantKind(const MCExpr *E) {
llvm_unreachable("Invalid expression kind!");
}
-/// Parse an expression. This differs from the default "parseExpression"
-/// in that it handles complex \code @l/@ha \endcode modifiers.
+/// ParseExpression. This differs from the default "parseExpression" in that
+/// it handles modifiers.
bool PPCAsmParser::
ParseExpression(const MCExpr *&EVal) {
+
+ if (isDarwin())
+ return ParseDarwinExpression(EVal);
+
+ // (ELF Platforms)
+ // Handle \code @l/@ha \endcode
if (getParser().parseExpression(EVal))
return true;
@@ -1098,6 +1165,55 @@ ParseExpression(const MCExpr *&EVal) {
return false;
}
+/// ParseDarwinExpression. (MachO Platforms)
+/// This differs from the default "parseExpression" in that it handles detection
+/// of the \code hi16(), ha16() and lo16() \endcode modifiers. At present,
+/// parseExpression() doesn't recognise the modifiers when in the Darwin/MachO
+/// syntax form so it is done here. TODO: Determine if there is merit in arranging
+/// for this to be done at a higher level.
+bool PPCAsmParser::
+ParseDarwinExpression(const MCExpr *&EVal) {
+ PPCMCExpr::VariantKind Variant = PPCMCExpr::VK_PPC_None;
+ switch (getLexer().getKind()) {
+ default:
+ break;
+ case AsmToken::Identifier:
+ // Compiler-generated Darwin identifiers begin with L,l,_ or "; thus
+ // something starting with any other char should be part of the
+ // asm syntax. If handwritten asm includes an identifier like lo16,
+ // then all bets are off - but no-one would do that, right?
+ StringRef poss = Parser.getTok().getString();
+ if (poss.equals_lower("lo16")) {
+ Variant = PPCMCExpr::VK_PPC_LO;
+ } else if (poss.equals_lower("hi16")) {
+ Variant = PPCMCExpr::VK_PPC_HI;
+ } else if (poss.equals_lower("ha16")) {
+ Variant = PPCMCExpr::VK_PPC_HA;
+ }
+ if (Variant != PPCMCExpr::VK_PPC_None) {
+ Parser.Lex(); // Eat the xx16
+ if (getLexer().isNot(AsmToken::LParen))
+ return Error(Parser.getTok().getLoc(), "expected '('");
+ Parser.Lex(); // Eat the '('
+ }
+ break;
+ }
+
+ if (getParser().parseExpression(EVal))
+ return true;
+
+ if (Variant != PPCMCExpr::VK_PPC_None) {
+ if (getLexer().isNot(AsmToken::RParen))
+ return Error(Parser.getTok().getLoc(), "expected ')'");
+ Parser.Lex(); // Eat the ')'
+ EVal = PPCMCExpr::Create(Variant, EVal, false, getParser().getContext());
+ }
+ return false;
+}
+
+/// ParseOperand
+/// This handles registers in the form 'NN', '%rNN' for ELF platforms and
+/// rNN for MachO.
bool PPCAsmParser::
ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
@@ -1121,14 +1237,31 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
}
return Error(S, "invalid register name");
+ case AsmToken::Identifier:
+ // Note that non-register-name identifiers from the compiler will begin
+ // with '_', 'L'/'l' or '"'. Of course, handwritten asm could include
+ // identifiers like r31foo - so we fall through in the event that parsing
+ // a register name fails.
+ if (isDarwin()) {
+ unsigned RegNo;
+ int64_t IntVal;
+ if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) {
+ Parser.Lex(); // Eat the identifier token.
+ Op = PPCOperand::CreateImm(IntVal, S, E, isPPC64());
+ Operands.push_back(Op);
+ return false;
+ }
+ }
+ // Fall-through to process non-register-name identifiers as expression.
// All other expressions
case AsmToken::LParen:
case AsmToken::Plus:
case AsmToken::Minus:
case AsmToken::Integer:
- case AsmToken::Identifier:
case AsmToken::Dot:
case AsmToken::Dollar:
+ case AsmToken::Exclaim:
+ case AsmToken::Tilde:
if (!ParseExpression(EVal))
break;
/* fall through */
@@ -1177,11 +1310,25 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
break;
case AsmToken::Integer:
- if (getParser().parseAbsoluteExpression(IntVal) ||
+ if (!isDarwin()) {
+ if (getParser().parseAbsoluteExpression(IntVal) ||
IntVal < 0 || IntVal > 31)
return Error(S, "invalid register number");
+ } else {
+ return Error(S, "unexpected integer value");
+ }
break;
+ case AsmToken::Identifier:
+ if (isDarwin()) {
+ unsigned RegNo;
+ if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) {
+ Parser.Lex(); // Eat the identifier token.
+ break;
+ }
+ }
+ // Fall-through..
+
default:
return Error(S, "invalid memory operand");
}
@@ -1261,14 +1408,19 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
/// ParseDirective parses the PPC specific directives
bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getIdentifier();
- if (IDVal == ".word")
- return ParseDirectiveWord(2, DirectiveID.getLoc());
- if (IDVal == ".llong")
- return ParseDirectiveWord(8, DirectiveID.getLoc());
- if (IDVal == ".tc")
- return ParseDirectiveTC(isPPC64()? 8 : 4, DirectiveID.getLoc());
- if (IDVal == ".machine")
- return ParseDirectiveMachine(DirectiveID.getLoc());
+ if (!isDarwin()) {
+ if (IDVal == ".word")
+ return ParseDirectiveWord(2, DirectiveID.getLoc());
+ if (IDVal == ".llong")
+ return ParseDirectiveWord(8, DirectiveID.getLoc());
+ if (IDVal == ".tc")
+ return ParseDirectiveTC(isPPC64()? 8 : 4, DirectiveID.getLoc());
+ if (IDVal == ".machine")
+ return ParseDirectiveMachine(DirectiveID.getLoc());
+ } else {
+ if (IDVal == ".machine")
+ return ParseDarwinDirectiveMachine(DirectiveID.getLoc());
+ }
return true;
}
@@ -1279,7 +1431,7 @@ bool PPCAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
for (;;) {
const MCExpr *Value;
if (getParser().parseExpression(Value))
- return true;
+ return false;
getParser().getStreamer().EmitValue(Value, Size);
@@ -1303,8 +1455,10 @@ bool PPCAsmParser::ParseDirectiveTC(unsigned Size, SMLoc L) {
while (getLexer().isNot(AsmToken::EndOfStatement)
&& getLexer().isNot(AsmToken::Comma))
Parser.Lex();
- if (getLexer().isNot(AsmToken::Comma))
- return Error(L, "unexpected token in directive");
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(L, "unexpected token in directive");
+ return false;
+ }
Parser.Lex();
// Align to word size.
@@ -1314,12 +1468,14 @@ bool PPCAsmParser::ParseDirectiveTC(unsigned Size, SMLoc L) {
return ParseDirectiveWord(Size, L);
}
-/// ParseDirectiveMachine
+/// ParseDirectiveMachine (ELF platforms)
/// ::= .machine [ cpu | "push" | "pop" ]
bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) {
if (getLexer().isNot(AsmToken::Identifier) &&
- getLexer().isNot(AsmToken::String))
- return Error(L, "unexpected token in directive");
+ getLexer().isNot(AsmToken::String)) {
+ Error(L, "unexpected token in directive");
+ return false;
+ }
StringRef CPU = Parser.getTok().getIdentifier();
Parser.Lex();
@@ -1329,11 +1485,56 @@ bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) {
// Implement ".machine any" (by doing nothing) for the benefit
// of existing assembler code. Likewise, we can then implement
// ".machine push" and ".machine pop" as no-op.
- if (CPU != "any" && CPU != "push" && CPU != "pop")
- return Error(L, "unrecognized machine type");
+ if (CPU != "any" && CPU != "push" && CPU != "pop") {
+ Error(L, "unrecognized machine type");
+ return false;
+ }
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return Error(L, "unexpected token in directive");
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ Error(L, "unexpected token in directive");
+ return false;
+ }
+ PPCTargetStreamer &TStreamer =
+ *static_cast<PPCTargetStreamer *>(
+ getParser().getStreamer().getTargetStreamer());
+ TStreamer.emitMachine(CPU);
+
+ return false;
+}
+
+/// ParseDarwinDirectiveMachine (Mach-O platforms)
+/// ::= .machine cpu-identifier
+bool PPCAsmParser::ParseDarwinDirectiveMachine(SMLoc L) {
+ if (getLexer().isNot(AsmToken::Identifier) &&
+ getLexer().isNot(AsmToken::String)) {
+ Error(L, "unexpected token in directive");
+ return false;
+ }
+
+ StringRef CPU = Parser.getTok().getIdentifier();
+ Parser.Lex();
+
+  // FIXME: this is only the 'default' set of cpu variants.
+  // However, we don't act on this information at present; this simply
+  // allows parsing to proceed with minimal sanity checking.
+ if (CPU != "ppc7400" && CPU != "ppc" && CPU != "ppc64") {
+ Error(L, "unrecognized cpu type");
+ return false;
+ }
+
+ if (isPPC64() && (CPU == "ppc7400" || CPU == "ppc")) {
+ Error(L, "wrong cpu type specified for 64bit");
+ return false;
+ }
+ if (!isPPC64() && CPU == "ppc64") {
+ Error(L, "wrong cpu type specified for 32bit");
+ return false;
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ Error(L, "unexpected token in directive");
+ return false;
+ }
return false;
}
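
For reference, the lo16()/hi16()/ha16() modifiers accepted by ParseDarwinExpression above denote the same half-word splits as the ELF @l/@ha forms handled on the other path. A minimal sketch of the arithmetic they stand for (hypothetical helpers, shown only for illustration, not part of this patch):

    #include <cstdint>

    // lo16: low half-word; hi16: high half-word; ha16: high half adjusted so
    // that (ha16(V) << 16) + int16_t(lo16(V)) reconstructs V (mod 2^32).
    static inline uint16_t lo16(uint32_t V) { return uint16_t(V & 0xFFFF); }
    static inline uint16_t hi16(uint32_t V) { return uint16_t(V >> 16); }
    static inline uint16_t ha16(uint32_t V) { return uint16_t((V + 0x8000) >> 16); }
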
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index 9a763f5..ea4de63 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_TARGET_DEFINITIONS PPC.td)
tablegen(LLVM PPCGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM PPCGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM PPCGenCodeEmitter.inc -gen-emitter)
+tablegen(LLVM PPCGenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM PPCGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
tablegen(LLVM PPCGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM PPCGenInstrInfo.inc -gen-instr-info)
@@ -34,9 +35,8 @@ add_llvm_target(PowerPCCodeGen
PPCSelectionDAGInfo.cpp
)
-add_dependencies(LLVMPowerPCCodeGen PowerPCCommonTableGen intrinsics_gen)
-
add_subdirectory(AsmParser)
+add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/PowerPC/Disassembler/CMakeLists.txt b/lib/Target/PowerPC/Disassembler/CMakeLists.txt
new file mode 100644
index 0000000..ca457df
--- /dev/null
+++ b/lib/Target/PowerPC/Disassembler/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMPowerPCDisassembler
+ PPCDisassembler.cpp
+ )
diff --git a/lib/Target/PowerPC/Disassembler/LLVMBuild.txt b/lib/Target/PowerPC/Disassembler/LLVMBuild.txt
new file mode 100644
index 0000000..7f29040
--- /dev/null
+++ b/lib/Target/PowerPC/Disassembler/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===-- ./lib/Target/PowerPC/Disassembler/LLVMBuild.txt ---------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = PowerPCDisassembler
+parent = PowerPC
+required_libraries = MC Support PowerPCDesc PowerPCInfo
+add_to_library_groups = PowerPC
diff --git a/lib/Target/PowerPC/Disassembler/Makefile b/lib/Target/PowerPC/Disassembler/Makefile
new file mode 100644
index 0000000..86e3b47
--- /dev/null
+++ b/lib/Target/PowerPC/Disassembler/Makefile
@@ -0,0 +1,16 @@
+##===-- lib/Target/PowerPC/Disassembler/Makefile -----------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPowerPCDisassembler
+
+# Hack: we need to include 'main' PPC target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
new file mode 100644
index 0000000..c4a7544
--- /dev/null
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -0,0 +1,345 @@
+//===------ PPCDisassembler.cpp - Disassembler for PowerPC ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+class PPCDisassembler : public MCDisassembler {
+public:
+ PPCDisassembler(const MCSubtargetInfo &STI)
+ : MCDisassembler(STI) {}
+ virtual ~PPCDisassembler() {}
+
+ // Override MCDisassembler.
+ virtual DecodeStatus getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const override;
+};
+} // end anonymous namespace
+
+static MCDisassembler *createPPCDisassembler(const Target &T,
+ const MCSubtargetInfo &STI) {
+ return new PPCDisassembler(STI);
+}
+
+extern "C" void LLVMInitializePowerPCDisassembler() {
+ // Register the disassembler for each target.
+ TargetRegistry::RegisterMCDisassembler(ThePPC32Target,
+ createPPCDisassembler);
+ TargetRegistry::RegisterMCDisassembler(ThePPC64Target,
+ createPPCDisassembler);
+ TargetRegistry::RegisterMCDisassembler(ThePPC64LETarget,
+ createPPCDisassembler);
+}
+
+// FIXME: These can be generated by TableGen from the existing register
+// encoding values!
+
+static const unsigned CRRegs[] = {
+ PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3,
+ PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7
+};
+
+static const unsigned CRBITRegs[] = {
+ PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN,
+ PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN,
+ PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+ PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+ PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+ PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN,
+ PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN,
+ PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN
+};
+
+static const unsigned FRegs[] = {
+ PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+ PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+ PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+ PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+ PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+ PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+ PPC::F28, PPC::F29, PPC::F30, PPC::F31
+};
+
+static const unsigned VRegs[] = {
+ PPC::V0, PPC::V1, PPC::V2, PPC::V3,
+ PPC::V4, PPC::V5, PPC::V6, PPC::V7,
+ PPC::V8, PPC::V9, PPC::V10, PPC::V11,
+ PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+ PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+ PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+ PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+
+static const unsigned VSRegs[] = {
+ PPC::VSL0, PPC::VSL1, PPC::VSL2, PPC::VSL3,
+ PPC::VSL4, PPC::VSL5, PPC::VSL6, PPC::VSL7,
+ PPC::VSL8, PPC::VSL9, PPC::VSL10, PPC::VSL11,
+ PPC::VSL12, PPC::VSL13, PPC::VSL14, PPC::VSL15,
+ PPC::VSL16, PPC::VSL17, PPC::VSL18, PPC::VSL19,
+ PPC::VSL20, PPC::VSL21, PPC::VSL22, PPC::VSL23,
+ PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27,
+ PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31,
+
+ PPC::VSH0, PPC::VSH1, PPC::VSH2, PPC::VSH3,
+ PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7,
+ PPC::VSH8, PPC::VSH9, PPC::VSH10, PPC::VSH11,
+ PPC::VSH12, PPC::VSH13, PPC::VSH14, PPC::VSH15,
+ PPC::VSH16, PPC::VSH17, PPC::VSH18, PPC::VSH19,
+ PPC::VSH20, PPC::VSH21, PPC::VSH22, PPC::VSH23,
+ PPC::VSH24, PPC::VSH25, PPC::VSH26, PPC::VSH27,
+ PPC::VSH28, PPC::VSH29, PPC::VSH30, PPC::VSH31
+};
+
+static const unsigned VSFRegs[] = {
+ PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+ PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+ PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+ PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+ PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+ PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+ PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+ PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+ PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+ PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+ PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+ PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+ PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+ PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+ PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+
+static const unsigned GPRegs[] = {
+ PPC::R0, PPC::R1, PPC::R2, PPC::R3,
+ PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+ PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+ PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+ PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+ PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+ PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+ PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+
+static const unsigned GP0Regs[] = {
+ PPC::ZERO, PPC::R1, PPC::R2, PPC::R3,
+ PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+ PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+ PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+ PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+ PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+ PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+ PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+
+static const unsigned G8Regs[] = {
+ PPC::X0, PPC::X1, PPC::X2, PPC::X3,
+ PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+ PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+ PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+ PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+ PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+ PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+ PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+
+template <std::size_t N>
+static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
+ const unsigned (&Regs)[N]) {
+ assert(RegNo < N && "Invalid register number");
+ Inst.addOperand(MCOperand::CreateReg(Regs[RegNo]));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCRRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, CRRegs);
+}
+
+static DecodeStatus DecodeCRBITRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, CRBITRegs);
+}
+
+static DecodeStatus DecodeF4RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, FRegs);
+}
+
+static DecodeStatus DecodeF8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, FRegs);
+}
+
+static DecodeStatus DecodeVRRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, VRegs);
+}
+
+static DecodeStatus DecodeVSRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, VSRegs);
+}
+
+static DecodeStatus DecodeVSFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, VSFRegs);
+}
+
+static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, GPRegs);
+}
+
+static DecodeStatus DecodeGPRC_NOR0RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, GP0Regs);
+}
+
+static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, G8Regs);
+}
+
+#define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass
+#define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass
+
+template<unsigned N>
+static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ assert(isUInt<N>(Imm) && "Invalid immediate");
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+ return MCDisassembler::Success;
+}
+
+template<unsigned N>
+static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ assert(isUInt<N>(Imm) && "Invalid immediate");
+ Inst.addOperand(MCOperand::CreateImm(SignExtend64<N>(Imm)));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeMemRIOperands(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ // Decode the memri field (imm, reg), which has the low 16-bits as the
+ // displacement and the next 5 bits as the register #.
+
+ uint64_t Base = Imm >> 16;
+ uint64_t Disp = Imm & 0xFFFF;
+
+ assert(Base < 32 && "Invalid base register");
+
+ switch (Inst.getOpcode()) {
+ default: break;
+ case PPC::LBZU:
+ case PPC::LHAU:
+ case PPC::LHZU:
+ case PPC::LWZU:
+ case PPC::LFSU:
+ case PPC::LFDU:
+ // Add the tied output operand.
+ Inst.addOperand(MCOperand::CreateReg(GP0Regs[Base]));
+ break;
+ case PPC::STBU:
+ case PPC::STHU:
+ case PPC::STWU:
+ case PPC::STFSU:
+ case PPC::STFDU:
+ Inst.insert(Inst.begin(), MCOperand::CreateReg(GP0Regs[Base]));
+ break;
+ }
+
+ Inst.addOperand(MCOperand::CreateImm(SignExtend64<16>(Disp)));
+ Inst.addOperand(MCOperand::CreateReg(GP0Regs[Base]));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeMemRIXOperands(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ // Decode the memrix field (imm, reg), which has the low 14-bits as the
+ // displacement and the next 5 bits as the register #.
+
+ uint64_t Base = Imm >> 14;
+ uint64_t Disp = Imm & 0x3FFF;
+
+ assert(Base < 32 && "Invalid base register");
+
+ if (Inst.getOpcode() == PPC::LDU)
+ // Add the tied output operand.
+ Inst.addOperand(MCOperand::CreateReg(GP0Regs[Base]));
+ else if (Inst.getOpcode() == PPC::STDU)
+ Inst.insert(Inst.begin(), MCOperand::CreateReg(GP0Regs[Base]));
+
+ Inst.addOperand(MCOperand::CreateImm(SignExtend64<16>(Disp << 2)));
+ Inst.addOperand(MCOperand::CreateReg(GP0Regs[Base]));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeCRBitMOperand(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ // The cr bit encoding is 0x80 >> cr_reg_num.
+
+ unsigned Zeros = countTrailingZeros(Imm);
+ assert(Zeros < 8 && "Invalid CR bit value");
+
+ Inst.addOperand(MCOperand::CreateReg(CRRegs[7 - Zeros]));
+ return MCDisassembler::Success;
+}
+
+#include "PPCGenDisassemblerTables.inc"
+
+DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ const MemoryObject &Region,
+ uint64_t Address,
+ raw_ostream &os,
+ raw_ostream &cs) const {
+ // Get the four bytes of the instruction.
+ uint8_t Bytes[4];
+ Size = 4;
+ if (Region.readBytes(Address, Size, Bytes) == -1) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ // The instruction is big-endian encoded.
+ uint32_t Inst = (Bytes[0] << 24) |
+ (Bytes[1] << 16) |
+ (Bytes[2] << 8) |
+ (Bytes[3] << 0);
+
+ return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI);
+}
+
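
The decoder helpers above operate on raw field values that the TableGen-generated tables extract from the 32-bit instruction word; getInstruction merely assembles that word big-endian before dispatching. A small stand-alone sketch of the memri split performed by decodeMemRIOperands (hypothetical names, for illustration only):

    #include <cstdint>

    // Low 16 bits of the raw field are the displacement (sign-extended);
    // the next 5 bits are the base register number.
    struct MemRI { unsigned Base; int32_t Disp; };

    static MemRI splitMemRI(uint64_t Imm) {
      MemRI M;
      M.Base = unsigned(Imm >> 16) & 0x1F;
      M.Disp = int16_t(Imm & 0xFFFF);
      return M;
    }
    // Example: Imm = 0x0003FFF8 -> Base = 3, Disp = -8, i.e. "-8(r3)".
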
diff --git a/lib/Target/PowerPC/InstPrinter/CMakeLists.txt b/lib/Target/PowerPC/InstPrinter/CMakeLists.txt
index a605cc4..ab30a11 100644
--- a/lib/Target/PowerPC/InstPrinter/CMakeLists.txt
+++ b/lib/Target/PowerPC/InstPrinter/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMPowerPCAsmPrinter
PPCInstPrinter.cpp
)
-
-add_dependencies(LLVMPowerPCAsmPrinter PowerPCCommonTableGen)
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 8281b5c..dc54b52 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -149,6 +149,9 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
case PPC::PRED_NU:
O << "nu";
return;
+ case PPC::PRED_BIT_SET:
+ case PPC::PRED_BIT_UNSET:
+ llvm_unreachable("Invalid use of bit predicate code");
}
llvm_unreachable("Invalid predicate code");
}
@@ -184,6 +187,9 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
case PPC::PRED_NU_PLUS:
O << "+";
return;
+ case PPC::PRED_BIT_SET:
+ case PPC::PRED_BIT_UNSET:
+ llvm_unreachable("Invalid use of bit predicate code");
}
llvm_unreachable("Invalid predicate code");
}
@@ -193,6 +199,13 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
printOperand(MI, OpNo+1, O);
}
+void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned int Value = MI->getOperand(OpNo).getImm();
+ assert(Value <= 3 && "Invalid u2imm argument!");
+ O << (unsigned int)Value;
+}
+
void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
int Value = MI->getOperand(OpNo).getImm();
@@ -310,7 +323,10 @@ static const char *stripRegisterPrefix(const char *RegName) {
switch (RegName[0]) {
case 'r':
case 'f':
- case 'v': return RegName + 1;
+ case 'v':
+ if (RegName[1] == 's')
+ return RegName + 2;
+ return RegName + 1;
case 'c': if (RegName[1] == 'r') return RegName + 2;
}
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index 8a4c03d..4d1df78 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -43,7 +43,7 @@ public:
void printPredicateOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O, const char *Modifier = 0);
-
+ void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/PowerPC/LLVMBuild.txt b/lib/Target/PowerPC/LLVMBuild.txt
index 7b3e843..9d173d6 100644
--- a/lib/Target/PowerPC/LLVMBuild.txt
+++ b/lib/Target/PowerPC/LLVMBuild.txt
@@ -16,18 +16,20 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo
+subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo
[component_0]
type = TargetGroup
name = PowerPC
parent = Target
+has_asmparser = 1
has_asmprinter = 1
+has_disassembler = 1
has_jit = 1
[component_1]
type = Library
name = PowerPCCodeGen
parent = PowerPC
-required_libraries = Analysis AsmPrinter CodeGen Core MC PowerPCAsmPrinter PowerPCDesc PowerPCInfo SelectionDAG Support Target
+required_libraries = Analysis AsmPrinter CodeGen Core MC PowerPCAsmPrinter PowerPCDesc PowerPCInfo SelectionDAG Support Target TransformUtils
add_to_library_groups = PowerPC
diff --git a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
index 3efa5ec..3cea65e 100644
--- a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
@@ -8,5 +8,3 @@ add_llvm_library(LLVMPowerPCDesc
PPCMachObjectWriter.cpp
PPCELFObjectWriter.cpp
)
-
-add_dependencies(LLVMPowerPCDesc PowerPCCommonTableGen)
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 0d42081..f7309bb 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -71,14 +71,16 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
namespace {
class PPCAsmBackend : public MCAsmBackend {
-const Target &TheTarget;
+ const Target &TheTarget;
+ bool IsLittleEndian;
public:
- PPCAsmBackend(const Target &T) : MCAsmBackend(), TheTarget(T) {}
+ PPCAsmBackend(const Target &T, bool isLittle) : MCAsmBackend(), TheTarget(T),
+ IsLittleEndian(isLittle) {}
unsigned getNumFixupKinds() const { return PPC::NumTargetFixupKinds; }
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
- const static MCFixupKindInfo Infos[PPC::NumTargetFixupKinds] = {
+ const static MCFixupKindInfo InfosBE[PPC::NumTargetFixupKinds] = {
// name offset bits flags
{ "fixup_ppc_br24", 6, 24, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_ppc_brcond14", 16, 14, MCFixupKindInfo::FKF_IsPCRel },
@@ -88,17 +90,27 @@ public:
{ "fixup_ppc_half16ds", 0, 14, 0 },
{ "fixup_ppc_nofixup", 0, 0, 0 }
};
+ const static MCFixupKindInfo InfosLE[PPC::NumTargetFixupKinds] = {
+ // name offset bits flags
+ { "fixup_ppc_br24", 2, 24, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_ppc_brcond14", 2, 14, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_ppc_br24abs", 2, 24, 0 },
+ { "fixup_ppc_brcond14abs", 2, 14, 0 },
+ { "fixup_ppc_half16", 0, 16, 0 },
+ { "fixup_ppc_half16ds", 2, 14, 0 },
+ { "fixup_ppc_nofixup", 0, 0, 0 }
+ };
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
- return Infos[Kind - FirstTargetFixupKind];
+ return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
}
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value) const {
+ uint64_t Value, bool IsPCRel) const {
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
@@ -108,8 +120,10 @@ public:
// For each byte of the fragment that the fixup touches, mask in the bits
// from the fixup value. The Value has been "split up" into the appropriate
// bitfields above.
- for (unsigned i = 0; i != NumBytes; ++i)
- Data[Offset + i] |= uint8_t((Value >> ((NumBytes - i - 1)*8)) & 0xff);
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ unsigned Idx = IsLittleEndian ? i : (NumBytes - 1 - i);
+ Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
+ }
}
bool mayNeedRelaxation(const MCInst &Inst) const {
@@ -152,6 +166,10 @@ public:
assert(Name == "ppc32" && "Unknown target name!");
return 4;
}
+
+ bool isLittleEndian() const {
+ return IsLittleEndian;
+ }
};
} // end anonymous namespace
@@ -160,7 +178,7 @@ public:
namespace {
class DarwinPPCAsmBackend : public PPCAsmBackend {
public:
- DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T) { }
+ DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { }
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
bool is64 = getPointerSize() == 8;
@@ -170,26 +188,18 @@ namespace {
(is64 ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC),
MachO::CPU_SUBTYPE_POWERPC_ALL);
}
-
- virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
- return false;
- }
};
class ELFPPCAsmBackend : public PPCAsmBackend {
uint8_t OSABI;
public:
- ELFPPCAsmBackend(const Target &T, uint8_t OSABI) :
- PPCAsmBackend(T), OSABI(OSABI) { }
+ ELFPPCAsmBackend(const Target &T, bool IsLittleEndian, uint8_t OSABI) :
+ PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { }
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
bool is64 = getPointerSize() == 8;
- return createPPCELFObjectWriter(OS, is64, OSABI);
- }
-
- virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
- return false;
+ return createPPCELFObjectWriter(OS, is64, isLittleEndian(), OSABI);
}
};
@@ -202,5 +212,6 @@ MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
return new DarwinPPCAsmBackend(T);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
- return new ELFPPCAsmBackend(T, OSABI);
+ bool IsLittleEndian = Triple(TT).getArch() == Triple::ppc64le;
+ return new ELFPPCAsmBackend(T, IsLittleEndian, OSABI);
}
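
With IsLittleEndian threaded through the backend, the same fixup value is OR'd into the instruction bytes in opposite orders depending on the target. A rough stand-alone sketch mirroring the applyFixup loop above (illustrative only, not part of this patch):

    #include <cstdint>

    // OR a fixup value into NumBytes of an instruction, honouring byte order.
    static void orFixup(uint8_t *Data, unsigned NumBytes, uint64_t Value,
                        bool IsLittleEndian) {
      for (unsigned i = 0; i != NumBytes; ++i) {
        unsigned Idx = IsLittleEndian ? i : (NumBytes - 1 - i);
        Data[i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
      }
    }
    // orFixup(Buf, 4, 0x1234, true)  ORs bytes 34 12 00 00 into Buf[0..3];
    // orFixup(Buf, 4, 0x1234, false) ORs bytes 00 00 12 34.
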
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 54de70e..d19f6a0 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -9,6 +9,7 @@
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCFixupKinds.h"
+#include "MCTargetDesc/PPCMCExpr.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
@@ -27,17 +28,8 @@ namespace {
virtual unsigned getRelocTypeInner(const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const;
- virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel, bool IsRelocWithSymbol,
- int64_t Addend) const;
- virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
- const MCValue &Target,
- const MCFragment &F,
- const MCFixup &Fixup,
- bool IsPCRel) const;
- virtual const MCSymbol *undefinedExplicitRelSym(const MCValue &Target,
- const MCFixup &Fixup,
- bool IsPCRel) const;
+ unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel) const override;
};
}
@@ -49,12 +41,38 @@ PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI)
PPCELFObjectWriter::~PPCELFObjectWriter() {
}
+static MCSymbolRefExpr::VariantKind getAccessVariant(const MCFixup &Fixup) {
+ const MCExpr *Expr = Fixup.getValue();
+
+ if (Expr->getKind() != MCExpr::Target)
+ return Fixup.getAccessVariant();
+
+ switch (cast<PPCMCExpr>(Expr)->getKind()) {
+ case PPCMCExpr::VK_PPC_None:
+ return MCSymbolRefExpr::VK_None;
+ case PPCMCExpr::VK_PPC_LO:
+ return MCSymbolRefExpr::VK_PPC_LO;
+ case PPCMCExpr::VK_PPC_HI:
+ return MCSymbolRefExpr::VK_PPC_HI;
+ case PPCMCExpr::VK_PPC_HA:
+ return MCSymbolRefExpr::VK_PPC_HA;
+ case PPCMCExpr::VK_PPC_HIGHERA:
+ return MCSymbolRefExpr::VK_PPC_HIGHERA;
+ case PPCMCExpr::VK_PPC_HIGHER:
+ return MCSymbolRefExpr::VK_PPC_HIGHER;
+ case PPCMCExpr::VK_PPC_HIGHEST:
+ return MCSymbolRefExpr::VK_PPC_HIGHEST;
+ case PPCMCExpr::VK_PPC_HIGHESTA:
+ return MCSymbolRefExpr::VK_PPC_HIGHESTA;
+ }
+ llvm_unreachable("unknown PPCMCExpr kind");
+}
+
unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const
{
- MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
- MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+ MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Fixup);
// determine the type of the relocation
unsigned Type;
@@ -356,64 +374,14 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
const MCFixup &Fixup,
- bool IsPCRel,
- bool IsRelocWithSymbol,
- int64_t Addend) const {
+ bool IsPCRel) const {
return getRelocTypeInner(Target, Fixup, IsPCRel);
}
-const MCSymbol *PPCELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm,
- const MCValue &Target,
- const MCFragment &F,
- const MCFixup &Fixup,
- bool IsPCRel) const {
- assert(Target.getSymA() && "SymA cannot be 0");
- MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
- MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
-
- bool EmitThisSym;
- switch (Modifier) {
- // GOT references always need a relocation, even if the
- // target symbol is local.
- case MCSymbolRefExpr::VK_GOT:
- case MCSymbolRefExpr::VK_PPC_GOT_LO:
- case MCSymbolRefExpr::VK_PPC_GOT_HI:
- case MCSymbolRefExpr::VK_PPC_GOT_HA:
- EmitThisSym = true;
- break;
- default:
- EmitThisSym = false;
- break;
- }
-
- if (EmitThisSym)
- return &Target.getSymA()->getSymbol().AliasedSymbol();
- return NULL;
-}
-
-const MCSymbol *PPCELFObjectWriter::undefinedExplicitRelSym(const MCValue &Target,
- const MCFixup &Fixup,
- bool IsPCRel) const {
- assert(Target.getSymA() && "SymA cannot be 0");
- const MCSymbol &Symbol = Target.getSymA()->getSymbol().AliasedSymbol();
-
- unsigned RelocType = getRelocTypeInner(Target, Fixup, IsPCRel);
-
- // The .odp creation emits a relocation against the symbol ".TOC." which
- // create a R_PPC64_TOC relocation. However the relocation symbol name
- // in final object creation should be NULL, since the symbol does not
- // really exist, it is just the reference to TOC base for the current
- // object file.
- bool EmitThisSym = RelocType != ELF::R_PPC64_TOC;
-
- if (EmitThisSym && !Symbol.isTemporary())
- return &Symbol;
- return NULL;
-}
-
MCObjectWriter *llvm::createPPCELFObjectWriter(raw_ostream &OS,
bool Is64Bit,
+ bool IsLittleEndian,
uint8_t OSABI) {
MCELFObjectTargetWriter *MOTW = new PPCELFObjectWriter(Is64Bit, OSABI);
- return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/false);
+ return createELFObjectWriter(MOTW, OS, IsLittleEndian);
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index f3dddce..18609e1 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -12,11 +12,13 @@
//===----------------------------------------------------------------------===//
#include "PPCMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
using namespace llvm;
void PPCMCAsmInfoDarwin::anchor() { }
-PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
+PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) {
if (is64Bit) {
PointerSize = CalleeSaveStackSlotSize = 8;
}
@@ -30,22 +32,28 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
AssemblerDialect = 1; // New-Style mnemonics.
SupportsDebugInformation= true; // Debug information.
+
+ // The installed assembler for OSX < 10.6 lacks some directives.
+ // FIXME: this should really be a check on the assembler characteristics
+ // rather than OS version
+ if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6))
+ HasWeakDefCanBeHiddenDirective = false;
+
+ UseIntegratedAssembler = true;
}
void PPCLinuxMCAsmInfo::anchor() { }
-PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
+PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit, const Triple& T) {
if (is64Bit) {
PointerSize = CalleeSaveStackSlotSize = 8;
}
- IsLittleEndian = false;
+ IsLittleEndian = T.getArch() == Triple::ppc64le;
// ".comm align is in bytes but .align is pow-2."
AlignmentIsInBytes = false;
CommentString = "#";
- GlobalPrefix = "";
- PrivateGlobalPrefix = ".L";
// Uses '.section' before '.bss' directive
UsesELFSectionDirectiveForBSS = true;
@@ -65,5 +73,10 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
ZeroDirective = "\t.space\t";
Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
AssemblerDialect = 1; // New-Style mnemonics.
+
+ if (T.getOS() == llvm::Triple::FreeBSD ||
+ (T.getOS() == llvm::Triple::NetBSD && !is64Bit) ||
+ (T.getOS() == llvm::Triple::OpenBSD && !is64Bit))
+ UseIntegratedAssembler = true;
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 1530e77..cee2cb7 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -18,17 +18,18 @@
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
+class Triple;
class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin {
virtual void anchor();
public:
- explicit PPCMCAsmInfoDarwin(bool is64Bit);
+ explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple&);
};
class PPCLinuxMCAsmInfo : public MCAsmInfoELF {
virtual void anchor();
public:
- explicit PPCLinuxMCAsmInfo(bool is64Bit);
+ explicit PPCLinuxMCAsmInfo(bool is64Bit, const Triple&);
};
} // namespace llvm
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 346a9be..b259c5d 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -33,70 +33,85 @@ class PPCMCCodeEmitter : public MCCodeEmitter {
PPCMCCodeEmitter(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION;
void operator=(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION;
- const MCSubtargetInfo &STI;
+ const MCInstrInfo &MCII;
const MCContext &CTX;
- Triple TT;
+ bool IsLittleEndian;
public:
- PPCMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
- MCContext &ctx)
- : STI(sti), CTX(ctx), TT(STI.getTargetTriple()) {
+ PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx, bool isLittle)
+ : MCII(mcii), CTX(ctx), IsLittleEndian(isLittle) {
}
~PPCMCCodeEmitter() {}
unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getTLSCallEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// getMachineOpValue - Return binary encoding of operand. If the machine
/// operand requires relocation, record the relocation and return zero.
unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
// getBinaryCodeForInstr - TableGen'erated function for getting the
// binary encoding for an instruction.
uint64_t getBinaryCodeForInstr(const MCInst &MI,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// For fast-isel, a float COPY_TO_REGCLASS can survive this long.
// It's just a nop to keep the register classes happy, so don't
// generate anything.
unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
if (Opcode == TargetOpcode::COPY_TO_REGCLASS)
return;
- uint64_t Bits = getBinaryCodeForInstr(MI, Fixups);
+ uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
- // BL8_NOP etc. all have a size of 8 because of the following 'nop'.
- unsigned Size = 4; // FIXME: Have Desc.getSize() return the correct value!
- if (Opcode == PPC::BL8_NOP || Opcode == PPC::BLA8_NOP ||
- Opcode == PPC::BL8_NOP_TLS)
- Size = 8;
-
- // Output the constant in big endian byte order.
- int ShiftValue = (Size * 8) - 8;
- for (unsigned i = 0; i != Size; ++i) {
- OS << (char)(Bits >> ShiftValue);
- Bits <<= 8;
+ // Output the constant in big/little endian byte order.
+ unsigned Size = Desc.getSize();
+ if (IsLittleEndian) {
+ for (unsigned i = 0; i != Size; ++i) {
+ OS << (char)Bits;
+ Bits >>= 8;
+ }
+ } else {
+ int ShiftValue = (Size * 8) - 8;
+ for (unsigned i = 0; i != Size; ++i) {
+ OS << (char)(Bits >> ShiftValue);
+ Bits <<= 8;
+ }
}
++MCNumEmitted; // Keep track of the # of mi's emitted.
@@ -110,14 +125,17 @@ MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
- return new PPCMCCodeEmitter(MCII, STI, Ctx);
+ Triple TT(STI.getTargetTriple());
+ bool IsLittleEndian = TT.getArch() == Triple::ppc64le;
+ return new PPCMCCodeEmitter(MCII, Ctx, IsLittleEndian);
}
unsigned PPCMCCodeEmitter::
getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
- if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+ if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
// Add a fixup for the branch target.
Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
@@ -126,9 +144,10 @@ getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
}
unsigned PPCMCCodeEmitter::getCondBrEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
- if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+ if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
// Add a fixup for the branch target.
Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
@@ -138,9 +157,10 @@ unsigned PPCMCCodeEmitter::getCondBrEncoding(const MCInst &MI, unsigned OpNo,
unsigned PPCMCCodeEmitter::
getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
- if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+ if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
// Add a fixup for the branch target.
Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
@@ -150,9 +170,10 @@ getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
unsigned PPCMCCodeEmitter::
getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
- if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+ if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
// Add a fixup for the branch target.
Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
@@ -161,79 +182,87 @@ getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
}
unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
- if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+ if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
// Add a fixup for the immediate field.
- Fixups.push_back(MCFixup::Create(2, MO.getExpr(),
+ Fixups.push_back(MCFixup::Create(IsLittleEndian? 0 : 2, MO.getExpr(),
(MCFixupKind)PPC::fixup_ppc_half16));
return 0;
}
unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// Encode (imm, reg) as a memri, which has the low 16-bits as the
// displacement and the next 5 bits as the register #.
assert(MI.getOperand(OpNo+1).isReg());
- unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups) << 16;
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 16;
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm())
- return (getMachineOpValue(MI, MO, Fixups) & 0xFFFF) | RegBits;
+ return (getMachineOpValue(MI, MO, Fixups, STI) & 0xFFFF) | RegBits;
// Add a fixup for the displacement field.
- Fixups.push_back(MCFixup::Create(2, MO.getExpr(),
+ Fixups.push_back(MCFixup::Create(IsLittleEndian? 0 : 2, MO.getExpr(),
(MCFixupKind)PPC::fixup_ppc_half16));
return RegBits;
}
unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// Encode (imm, reg) as a memrix, which has the low 14-bits as the
// displacement and the next 5 bits as the register #.
assert(MI.getOperand(OpNo+1).isReg());
- unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups) << 14;
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 14;
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm())
- return ((getMachineOpValue(MI, MO, Fixups) >> 2) & 0x3FFF) | RegBits;
+ return ((getMachineOpValue(MI, MO, Fixups, STI) >> 2) & 0x3FFF) | RegBits;
// Add a fixup for the displacement field.
- Fixups.push_back(MCFixup::Create(2, MO.getExpr(),
+ Fixups.push_back(MCFixup::Create(IsLittleEndian? 0 : 2, MO.getExpr(),
(MCFixupKind)PPC::fixup_ppc_half16ds));
return RegBits;
}
unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
- if (MO.isReg()) return getMachineOpValue(MI, MO, Fixups);
+ if (MO.isReg()) return getMachineOpValue(MI, MO, Fixups, STI);
// Add a fixup for the TLS register, which simply provides a relocation
// hint to the linker that this statement is part of a relocation sequence.
// Return the thread-pointer register's encoding.
Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
(MCFixupKind)PPC::fixup_ppc_nofixup));
- return CTX.getRegisterInfo()->getEncodingValue(PPC::X13);
+ Triple TT(STI.getTargetTriple());
+ bool isPPC64 = TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le;
+ return CTX.getRegisterInfo()->getEncodingValue(isPPC64 ? PPC::X13 : PPC::R2);
}
unsigned PPCMCCodeEmitter::getTLSCallEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
// For special TLS calls, we need two fixups; one for the branch target
// (__tls_get_addr), which we create via getDirectBrEncoding as usual,
// and one for the TLSGD or TLSLD symbol, which is emitted here.
const MCOperand &MO = MI.getOperand(OpNo+1);
Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
(MCFixupKind)PPC::fixup_ppc_nofixup));
- return getDirectBrEncoding(MI, OpNo, Fixups);
+ return getDirectBrEncoding(MI, OpNo, Fixups, STI);
}
unsigned PPCMCCodeEmitter::
get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpNo);
assert((MI.getOpcode() == PPC::MTOCRF || MI.getOpcode() == PPC::MTOCRF8 ||
MI.getOpcode() == PPC::MFOCRF || MI.getOpcode() == PPC::MFOCRF8) &&
@@ -244,7 +273,8 @@ get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
unsigned PPCMCCodeEmitter::
getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
if (MO.isReg()) {
// MTOCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
// The GPR operand should come through here though.
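
EncodeInstruction above now emits Desc.getSize() bytes in whichever byte order the target uses, rather than assuming big-endian. A compact sketch of the effect (illustrative only; mflr r0 encodes as 0x7C0802A6):

    #include <cstdint>
    #include <vector>

    // Serialise one 4-byte PPC instruction word in the requested byte order.
    static std::vector<uint8_t> emitWord(uint32_t Bits, bool IsLittleEndian) {
      std::vector<uint8_t> Out;
      for (unsigned i = 0; i != 4; ++i) {
        unsigned Shift = IsLittleEndian ? 8 * i : 8 * (3 - i);
        Out.push_back(uint8_t(Bits >> Shift));
      }
      return Out;
    }
    // emitWord(0x7C0802A6, false) -> 7C 08 02 A6   (big-endian)
    // emitWord(0x7C0802A6, true)  -> A6 02 08 7C   (little-endian)
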
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index d7e8402..c181e03 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -9,9 +9,9 @@
#define DEBUG_TYPE "ppcmcexpr"
#include "PPCMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCAsmInfo.h"
using namespace llvm;
@@ -54,7 +54,7 @@ PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout) const {
MCValue Value;
- if (!Layout || !getSubExpr()->EvaluateAsRelocatable(Value, *Layout))
+ if (!getSubExpr()->EvaluateAsRelocatable(Value, Layout))
return false;
if (Value.isAbsolute()) {
@@ -86,6 +86,9 @@ PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
}
Res = MCValue::get(Result);
} else {
+ if (!Layout)
+ return false;
+
MCContext &Context = Layout->getAssembler().getContext();
const MCSymbolRefExpr *Sym = Value.getSymA();
MCSymbolRefExpr::VariantKind Modifier = Sym->getKind();
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index e44c7c1..5fc7918 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -10,9 +10,9 @@
#ifndef PPCMCEXPR_H
#define PPCMCEXPR_H
+#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/MC/MCAsmLayout.h"
namespace llvm {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index f18d095..105c511 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -39,6 +39,7 @@ using namespace llvm;
// Pin the vtable to this file.
PPCTargetStreamer::~PPCTargetStreamer() {}
+PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
static MCInstrInfo *createPPCMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
@@ -72,9 +73,9 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
MCAsmInfo *MAI;
if (TheTriple.isOSDarwin())
- MAI = new PPCMCAsmInfoDarwin(isPPC64);
+ MAI = new PPCMCAsmInfoDarwin(isPPC64, TheTriple);
else
- MAI = new PPCLinuxMCAsmInfo(isPPC64);
+ MAI = new PPCLinuxMCAsmInfo(isPPC64, TheTriple);
// Initial state of the frame pointer is R1.
unsigned Reg = isPPC64 ? PPC::X1 : PPC::R1;
@@ -112,7 +113,8 @@ class PPCTargetAsmStreamer : public PPCTargetStreamer {
formatted_raw_ostream &OS;
public:
- PPCTargetAsmStreamer(formatted_raw_ostream &OS) : OS(OS) {}
+ PPCTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS)
+ : PPCTargetStreamer(S), OS(OS) {}
virtual void emitTCEntry(const MCSymbol &S) {
OS << "\t.tc ";
OS << S.getName();
@@ -120,12 +122,33 @@ public:
OS << S.getName();
OS << '\n';
}
+ virtual void emitMachine(StringRef CPU) {
+ OS << "\t.machine " << CPU << '\n';
+ }
};
class PPCTargetELFStreamer : public PPCTargetStreamer {
+public:
+ PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
virtual void emitTCEntry(const MCSymbol &S) {
// Creates a R_PPC64_TOC relocation
- Streamer->EmitSymbolValue(&S, 8);
+ Streamer.EmitSymbolValue(&S, 8);
+ }
+ virtual void emitMachine(StringRef CPU) {
+ // FIXME: Is there anything to do in here or does this directive only
+ // limit the parser?
+ }
+};
+
+class PPCTargetMachOStreamer : public PPCTargetStreamer {
+public:
+ PPCTargetMachOStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+ virtual void emitTCEntry(const MCSymbol &S) {
+ llvm_unreachable("Unknown pseudo-op: .tc");
+ }
+ virtual void emitMachine(StringRef CPU) {
+ // FIXME: We should update the CPUType, CPUSubType in the Object file if
+ // the new values are different from the defaults.
}
};
}
@@ -135,25 +158,32 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
MCContext &Ctx, MCAsmBackend &MAB,
raw_ostream &OS,
MCCodeEmitter *Emitter,
+ const MCSubtargetInfo &STI,
bool RelaxAll,
bool NoExecStack) {
- if (Triple(TT).isOSDarwin())
- return createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
+ if (Triple(TT).isOSDarwin()) {
+ MCStreamer *S = createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
+ new PPCTargetMachOStreamer(*S);
+ return S;
+ }
- PPCTargetStreamer *S = new PPCTargetELFStreamer();
- return createELFStreamer(Ctx, S, MAB, OS, Emitter, RelaxAll, NoExecStack);
+ MCStreamer *S =
+ createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+ new PPCTargetELFStreamer(*S);
+ return S;
}
static MCStreamer *
createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
- bool isVerboseAsm, bool useLoc, bool useCFI,
- bool useDwarfDirectory, MCInstPrinter *InstPrint,
- MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) {
- PPCTargetStreamer *S = new PPCTargetAsmStreamer(OS);
-
- return llvm::createAsmStreamer(Ctx, S, OS, isVerboseAsm, useLoc, useCFI,
- useDwarfDirectory, InstPrint, CE, TAB,
- ShowInst);
+ bool isVerboseAsm, bool useCFI, bool useDwarfDirectory,
+ MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+ MCAsmBackend *TAB, bool ShowInst) {
+
+ MCStreamer *S =
+ llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory,
+ InstPrint, CE, TAB, ShowInst);
+ new PPCTargetAsmStreamer(*S, OS);
+ return S;
}
static MCInstPrinter *createPPCMCInstPrinter(const Target &T,
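Note on the ownership pattern above (a sketch, not part of the patch): the bare "new PPCTargetELFStreamer(*S)" / "new PPCTargetMachOStreamer(*S)" calls only make sense under the assumption that the MCTargetStreamer base class registers itself with, and is subsequently owned by, the MCStreamer it is handed. Callers never keep the pointer; they recover the target streamer from the generic streamer when they need it, as PPCAsmPrinter does later in this patch (TOCEntrySym below is a made-up MCSymbol for illustration):

    // Recover the PPC-specific streamer hung off the generic MCStreamer and
    // ask it to emit a TOC entry.
    PPCTargetStreamer &TS =
        static_cast<PPCTargetStreamer &>(*OutStreamer.getTargetStreamer());
    TS.emitTCEntry(*TOCEntrySym);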
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 0b0ca24..474395b 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -46,6 +46,7 @@ MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI,
/// createPPCELFObjectWriter - Construct an PPC ELF object writer.
MCObjectWriter *createPPCELFObjectWriter(raw_ostream &OS,
bool Is64Bit,
+ bool IsLittleEndian,
uint8_t OSABI);
/// createPPCELFObjectWriter - Construct a PPC Mach-O object writer.
MCObjectWriter *createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit,
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
index 63facc5..c2987b6 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
@@ -42,6 +42,10 @@ PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) {
case PPC::PRED_LE_PLUS: return PPC::PRED_GT_MINUS;
case PPC::PRED_NU_PLUS: return PPC::PRED_UN_MINUS;
case PPC::PRED_UN_PLUS: return PPC::PRED_NU_MINUS;
+
+ // Simple predicates for single condition-register bits.
+ case PPC::PRED_BIT_SET: return PPC::PRED_BIT_UNSET;
+ case PPC::PRED_BIT_UNSET: return PPC::PRED_BIT_SET;
}
llvm_unreachable("Unknown PPC branch opcode!");
}
@@ -72,6 +76,10 @@ PPC::Predicate PPC::getSwappedPredicate(PPC::Predicate Opcode) {
case PPC::PRED_LE_PLUS: return PPC::PRED_GE_PLUS;
case PPC::PRED_NU_PLUS: return PPC::PRED_NU_PLUS;
case PPC::PRED_UN_PLUS: return PPC::PRED_UN_PLUS;
+
+ case PPC::PRED_BIT_SET:
+ case PPC::PRED_BIT_UNSET:
+ llvm_unreachable("Invalid use of bit predicate code");
}
llvm_unreachable("Unknown PPC branch opcode!");
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
index d498c2f..10e328a 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -48,7 +48,12 @@ namespace PPC {
PRED_GT_PLUS = (1 << 5) | 15,
PRED_NE_PLUS = (2 << 5) | 7,
PRED_UN_PLUS = (3 << 5) | 15,
- PRED_NU_PLUS = (3 << 5) | 7
+ PRED_NU_PLUS = (3 << 5) | 7,
+
+ // When dealing with individual condition-register bits, we have simple set
+ // and unset predicates.
+ PRED_BIT_SET = 1024,
+ PRED_BIT_UNSET = 1025
};
/// Invert the specified predicate. != -> ==, < -> >=.
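A quick illustration of the two new codes (a sketch, not code from the patch): InvertPredicate pairs them with each other, while getSwappedPredicate deliberately rejects them, matching the llvm_unreachable added in PPCPredicates.cpp above.

    // PRED_BIT_SET and PRED_BIT_UNSET invert into each other; swapping the
    // comparison operands is meaningless for a single CR bit, so that path traps.
    assert(PPC::InvertPredicate(PPC::PRED_BIT_SET) == PPC::PRED_BIT_UNSET);
    assert(PPC::InvertPredicate(PPC::PRED_BIT_UNSET) == PPC::PRED_BIT_SET);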
diff --git a/lib/Target/PowerPC/Makefile b/lib/Target/PowerPC/Makefile
index 21fdcd9..c966748 100644
--- a/lib/Target/PowerPC/Makefile
+++ b/lib/Target/PowerPC/Makefile
@@ -16,8 +16,9 @@ BUILT_SOURCES = PPCGenRegisterInfo.inc PPCGenAsmMatcher.inc \
PPCGenAsmWriter.inc PPCGenCodeEmitter.inc \
PPCGenInstrInfo.inc PPCGenDAGISel.inc \
PPCGenSubtargetInfo.inc PPCGenCallingConv.inc \
- PPCGenMCCodeEmitter.inc PPCGenFastISel.inc
+ PPCGenMCCodeEmitter.inc PPCGenFastISel.inc \
+ PPCGenDisassemblerTables.inc
-DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc
+DIRS = AsmParser Disassembler InstPrinter TargetInfo MCTargetDesc
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index f0d5af2..c42c5be 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -23,6 +23,7 @@
namespace llvm {
class PPCTargetMachine;
+ class PassRegistry;
class FunctionPass;
class ImmutablePass;
class JITCodeEmitter;
@@ -35,6 +36,9 @@ namespace llvm {
FunctionPass *createPPCCTRLoopsVerify();
#endif
FunctionPass *createPPCEarlyReturnPass();
+ FunctionPass *createPPCVSXCopyPass();
+ FunctionPass *createPPCVSXCopyCleanupPass();
+ FunctionPass *createPPCVSXFMAMutatePass();
FunctionPass *createPPCBranchSelectionPass();
FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
@@ -45,6 +49,9 @@ namespace llvm {
/// \brief Creates an PPC-specific Target Transformation Info pass.
ImmutablePass *createPPCTargetTransformInfoPass(const PPCTargetMachine *TM);
+ void initializePPCVSXFMAMutatePass(PassRegistry&);
+ extern char &PPCVSXFMAMutateID;
+
namespace PPCII {
/// Target Operand Flag enum.
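The three new createPPCVSX* entry points are ordinary pass factories. A minimal sketch of how a TargetPassConfig subclass might schedule them (the hook chosen and the ordering are assumptions, not something this header fixes):

    // Hypothetical PPCPassConfig member; addPass is inherited from TargetPassConfig.
    bool PPCPassConfig::addPreRegAlloc() {
      addPass(createPPCVSXCopyPass());        // canonicalize copies into VSX classes
      addPass(createPPCVSXFMAMutatePass());   // swap addend-killing FMAs for the alt forms
      addPass(createPPCVSXCopyCleanupPass()); // remove copies the mutation made dead
      return true;
    }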
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index 54e3d40..bd58539 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -51,6 +51,8 @@ def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true",
"Enable 64-bit instructions">;
def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
"Enable 64-bit registers usage for ppc32 [beta]">;
+def FeatureCRBits : SubtargetFeature<"crbits", "UseCRBits", "true",
+ "Use condition-register bits individually">;
def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true",
"Enable Altivec instructions">;
def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
@@ -88,7 +90,8 @@ def FeatureBookE : SubtargetFeature<"booke", "IsBookE", "true",
def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true",
"Enable QPX instructions">;
def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true",
- "Enable VSX instructions">;
+ "Enable VSX instructions",
+ [FeatureAltivec]>;
def DeprecatedMFTB : SubtargetFeature<"", "DeprecatedMFTB", "true",
"Treat mftb as deprecated">;
@@ -110,6 +113,12 @@ def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true",
// their record-form variants.
class RecFormRel;
+// AltVSXFMARel - Filter class used to relate the primary addend-killing VSX
+// FMA instruction forms with their corresponding factor-killing forms.
+class AltVSXFMARel {
+ bit IsVSXFMAAlt = 0;
+}
+
//===----------------------------------------------------------------------===//
// Relation Map Definitions.
//===----------------------------------------------------------------------===//
@@ -140,6 +149,19 @@ def getNonRecordFormOpcode : InstrMapping {
let ValueCols = [["0"]];
}
+def getAltVSXFMAOpcode : InstrMapping {
+ let FilterClass = "AltVSXFMARel";
+ // Instructions with the same BaseName and Interpretation64Bit values
+ // form a row.
+ let RowFields = ["BaseName"];
+ // Instructions with the same RC value form a column.
+ let ColFields = ["IsVSXFMAAlt"];
+  // The key column is the (default) addend-killing instructions.
+ let KeyCol = ["0"];
+  // The value column is the IsVSXFMAAlt=1 (factor-killing) form.
+ let ValueCols = [["1"]];
+}
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
@@ -153,12 +175,12 @@ include "PPCInstrInfo.td"
//
def : Processor<"generic", G3Itineraries, [Directive32]>;
-def : Processor<"440", PPC440Itineraries, [Directive440, FeatureISEL,
- FeatureFRES, FeatureFRSQRTE,
- FeatureBookE, DeprecatedMFTB]>;
-def : Processor<"450", PPC440Itineraries, [Directive440, FeatureISEL,
- FeatureFRES, FeatureFRSQRTE,
- FeatureBookE, DeprecatedMFTB]>;
+def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureBookE, DeprecatedMFTB]>;
+def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL,
+ FeatureFRES, FeatureFRSQRTE,
+ FeatureBookE, DeprecatedMFTB]>;
def : Processor<"601", G3Itineraries, [Directive601]>;
def : Processor<"602", G3Itineraries, [Directive602]>;
def : Processor<"603", G3Itineraries, [Directive603,
@@ -254,7 +276,7 @@ def : ProcessorModel<"pwr6x", G5Model,
FeatureSTFIWX, FeatureLFIWAX,
FeatureFPRND, Feature64Bit,
DeprecatedMFTB, DeprecatedDST]>;
-def : ProcessorModel<"pwr7", G5Model,
+def : ProcessorModel<"pwr7", P7Model,
[DirectivePwr7, FeatureAltivec,
FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
@@ -283,11 +305,11 @@ include "PPCCallingConv.td"
def PPCInstrInfo : InstrInfo {
let isLittleEndianEncoding = 1;
-}
-def PPCAsmWriter : AsmWriter {
- string AsmWriterClassName = "InstPrinter";
- bit isMCAsmWriter = 1;
+ // FIXME: Unset this when no longer needed!
+ let decodePositionallyEncodedOperands = 1;
+
+ let noNamedPositionallyEncodedOperands = 1;
}
def PPCAsmParser : AsmParser {
@@ -306,8 +328,7 @@ def PPCAsmParserVariant : AsmParserVariant {
def PPC : Target {
// Information about the instructions.
let InstructionSet = PPCInstrInfo;
-
- let AssemblyWriters = [PPCAsmWriter];
+
let AssemblyParsers = [PPCAsmParser];
let AssemblyParserVariants = [PPCAsmParserVariant];
}
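For context on getAltVSXFMAOpcode: TableGen's InstrMapping machinery generates a lookup function named after the def (the existing record-form maps work the same way), so the FMA mutation pass can be expected to query it roughly as sketched below. MI and TII stand for the usual MachineInstr and TargetInstrInfo handles, and -1 is the generated table's miss value:

    // Map an addend-killing VSX FMA opcode to its factor-killing alternative
    // and rewrite the instruction in place when one exists.
    int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
    if (AltOpc != -1)
      MI->setDesc(TII->get(AltOpc));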
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index ada34ed..9ce8ea9 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -19,24 +19,24 @@
#define DEBUG_TYPE "asmprinter"
#include "PPC.h"
#include "InstPrinter/PPCInstPrinter.h"
-#include "MCTargetDesc/PPCPredicates.h"
#include "MCTargetDesc/PPCMCExpr.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "PPCTargetStreamer.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/DebugInfo.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -54,7 +54,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -130,7 +129,10 @@ static const char *stripRegisterPrefix(const char *RegName) {
switch (RegName[0]) {
case 'r':
case 'f':
- case 'v': return RegName + 1;
+ case 'v':
+ if (RegName[1] == 's')
+ return RegName + 2;
+ return RegName + 1;
case 'c': if (RegName[1] == 'r') return RegName + 2;
}
@@ -139,6 +141,7 @@ static const char *stripRegisterPrefix(const char *RegName) {
void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
raw_ostream &O) {
+ const DataLayout *DL = TM.getDataLayout();
const MachineOperand &MO = MI->getOperand(OpNo);
switch (MO.getType()) {
@@ -157,37 +160,13 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
case MachineOperand::MO_MachineBasicBlock:
O << *MO.getMBB()->getSymbol();
return;
- case MachineOperand::MO_JumpTableIndex:
- O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
- << '_' << MO.getIndex();
- // FIXME: PIC relocation model
- return;
case MachineOperand::MO_ConstantPoolIndex:
- O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+ O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
<< '_' << MO.getIndex();
return;
case MachineOperand::MO_BlockAddress:
O << *GetBlockAddressSymbol(MO.getBlockAddress());
return;
- case MachineOperand::MO_ExternalSymbol: {
- // Computing the address of an external symbol, not calling it.
- if (TM.getRelocationModel() == Reloc::Static) {
- O << *GetExternalSymbolSymbol(MO.getSymbolName());
- return;
- }
-
- MCSymbol *NLPSym =
- OutContext.GetOrCreateSymbol(StringRef(MAI->getGlobalPrefix())+
- MO.getSymbolName()+"$non_lazy_ptr");
- MachineModuleInfoImpl::StubValueTy &StubSym =
- MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(NLPSym);
- if (StubSym.getPointer() == 0)
- StubSym = MachineModuleInfoImpl::
- StubValueTy(GetExternalSymbolSymbol(MO.getSymbolName()), true);
-
- O << *NLPSym;
- return;
- }
case MachineOperand::MO_GlobalAddress: {
// Computing the address of a global symbol, not calling it.
const GlobalValue *GV = MO.getGlobal();
@@ -197,7 +176,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
if (TM.getRelocationModel() != Reloc::Static &&
(GV->isDeclaration() || GV->isWeakForLinker())) {
if (!GV->hasHiddenVisibility()) {
- SymToPrint = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
MachineModuleInfoImpl::StubValueTy &StubSym =
MMI->getObjFileInfo<MachineModuleInfoMachO>()
.getGVStubEntry(SymToPrint);
@@ -206,7 +185,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
} else if (GV->isDeclaration() || GV->hasCommonLinkage() ||
GV->hasAvailableExternallyLinkage()) {
- SymToPrint = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
MachineModuleInfoImpl::StubValueTy &StubSym =
MMI->getObjFileInfo<MachineModuleInfoMachO>().
@@ -305,12 +284,12 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
/// exists for it. If not, create one. Then return a symbol that references
/// the TOC entry.
MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
-
+ const DataLayout *DL = TM.getDataLayout();
MCSymbol *&TOCEntry = TOC[Sym];
// To avoid name clash check if the name already exists.
while (TOCEntry == 0) {
- if (OutContext.LookupSymbol(Twine(MAI->getPrivateGlobalPrefix()) +
+ if (OutContext.LookupSymbol(Twine(DL->getPrivateGlobalPrefix()) +
"C" + Twine(TOCLabelID++)) == 0) {
TOCEntry = GetTempSymbol("C", TOCLabelID);
}
@@ -325,6 +304,7 @@ MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
///
void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
+ bool isPPC64 = Subtarget.isPPC64();
// Lower multi-instruction pseudo operations.
switch (MI->getOpcode()) {
@@ -340,7 +320,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbol *PICBase = MF->getPICBaseSymbol();
// Emit the 'bl'.
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::BL)
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL)
// FIXME: We would like an efficient form for this, so we don't have to do
// a lot of extra uniquing.
.addExpr(MCSymbolRefExpr::Create(PICBase, OutContext)));
@@ -376,7 +356,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
OutContext);
TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
return;
}
@@ -401,8 +381,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (MO.isGlobal()) {
const GlobalValue *GValue = MO.getGlobal();
const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
- const GlobalValue *RealGValue = GAlias ?
- GAlias->resolveAliasedGlobal(false) : GValue;
+ const GlobalValue *RealGValue =
+ GAlias ? GAlias->getAliasedGlobal() : GValue;
MOSymbol = getSymbol(RealGValue);
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
IsExternal = GVar && !GVar->hasInitializer();
@@ -422,7 +402,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA,
OutContext);
TmpInst.getOperand(2) = MCOperand::CreateExpr(Exp);
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
return;
}
case PPC::LDtocL: {
@@ -448,8 +428,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
else if (MO.isGlobal()) {
const GlobalValue *GValue = MO.getGlobal();
const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
- const GlobalValue *RealGValue = GAlias ?
- GAlias->resolveAliasedGlobal(false) : GValue;
+ const GlobalValue *RealGValue =
+ GAlias ? GAlias->getAliasedGlobal() : GValue;
MOSymbol = getSymbol(RealGValue);
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
@@ -463,7 +443,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
OutContext);
TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
return;
}
case PPC::ADDItocL: {
@@ -483,8 +463,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (MO.isGlobal()) {
const GlobalValue *GValue = MO.getGlobal();
const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
- const GlobalValue *RealGValue = GAlias ?
- GAlias->resolveAliasedGlobal(false) : GValue;
+ const GlobalValue *RealGValue =
+ GAlias ? GAlias->getAliasedGlobal() : GValue;
MOSymbol = getSymbol(RealGValue);
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
IsExternal = GVar && !GVar->hasInitializer();
@@ -499,7 +479,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
OutContext);
TmpInst.getOperand(2) = MCOperand::CreateExpr(Exp);
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
return;
}
case PPC::ADDISgotTprelHA: {
@@ -512,18 +492,19 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *SymGotTprel =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8)
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
.addReg(MI->getOperand(0).getReg())
.addReg(PPC::X2)
.addExpr(SymGotTprel));
return;
}
- case PPC::LDgotTprelL: {
+ case PPC::LDgotTprelL:
+ case PPC::LDgotTprelL32: {
// Transform %Xd = LDgotTprelL <ga:@sym>, %Xs
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
// Change the opcode to LD.
- TmpInst.setOpcode(PPC::LD);
+ TmpInst.setOpcode(isPPC64 ? PPC::LD : PPC::LWZ);
const MachineOperand &MO = MI->getOperand(1);
const GlobalValue *GValue = MO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
@@ -531,7 +512,25 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO,
OutContext);
TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
+ return;
+ }
+
+ case PPC::PPC32GOT: {
+ MCSymbol *GOTSymbol = OutContext.GetOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
+ const MCExpr *SymGotTlsL =
+ MCSymbolRefExpr::Create(GOTSymbol, MCSymbolRefExpr::VK_PPC_LO,
+ OutContext);
+ const MCExpr *SymGotTlsHA =
+ MCSymbolRefExpr::Create(GOTSymbol, MCSymbolRefExpr::VK_PPC_HA,
+ OutContext);
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI)
+ .addReg(MI->getOperand(0).getReg())
+ .addExpr(SymGotTlsL));
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(0).getReg())
+ .addExpr(SymGotTlsHA));
return;
}
case PPC::ADDIStlsgdHA: {
@@ -544,7 +543,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *SymGotTlsGD =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA,
OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8)
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
.addReg(MI->getOperand(0).getReg())
.addReg(PPC::X2)
.addExpr(SymGotTlsGD));
@@ -560,7 +559,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *SymGotTlsGD =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO,
OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDI8)
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDI8)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addExpr(SymGotTlsGD));
@@ -581,7 +580,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *SymVar =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSGD,
OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::BL8_NOP_TLS)
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL8_NOP_TLS)
.addExpr(TlsRef)
.addExpr(SymVar));
return;
@@ -596,7 +595,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *SymGotTlsLD =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA,
OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8)
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
.addReg(MI->getOperand(0).getReg())
.addReg(PPC::X2)
.addExpr(SymGotTlsLD));
@@ -612,7 +611,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *SymGotTlsLD =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO,
OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDI8)
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDI8)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addExpr(SymGotTlsLD));
@@ -633,7 +632,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *SymVar =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSLD,
OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::BL8_NOP_TLS)
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL8_NOP_TLS)
.addExpr(TlsRef)
.addExpr(SymVar));
return;
@@ -648,7 +647,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *SymDtprel =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8)
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
.addReg(MI->getOperand(0).getReg())
.addReg(PPC::X3)
.addExpr(SymDtprel));
@@ -664,7 +663,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *SymDtprel =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDI8)
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDI8)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addExpr(SymDtprel));
@@ -679,7 +678,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MI->getOpcode() == PPC::MFOCRF ? PPC::MFCR : PPC::MFCR8;
OutStreamer.AddComment(PPCInstPrinter::
getRegisterName(MI->getOperand(1).getReg()));
- OutStreamer.EmitInstruction(MCInstBuilder(NewOpcode)
+ EmitToStreamer(OutStreamer, MCInstBuilder(NewOpcode)
.addReg(MI->getOperand(0).getReg()));
return;
}
@@ -695,19 +694,12 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
->getEncodingValue(MI->getOperand(0).getReg());
OutStreamer.AddComment(PPCInstPrinter::
getRegisterName(MI->getOperand(0).getReg()));
- OutStreamer.EmitInstruction(MCInstBuilder(NewOpcode)
+ EmitToStreamer(OutStreamer, MCInstBuilder(NewOpcode)
.addImm(Mask)
.addReg(MI->getOperand(1).getReg()));
return;
}
break;
- case PPC::SYNC:
- // In Book E sync is called msync, handle this special case here...
- if (Subtarget.isBookE()) {
- OutStreamer.EmitRawText(StringRef("\tmsync"));
- return;
- }
- break;
case PPC::LD:
case PPC::STD:
case PPC::LWA_32:
@@ -730,7 +722,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
}
void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
@@ -773,7 +765,7 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
bool isPPC64 = TD->getPointerSizeInBits() == 64;
PPCTargetStreamer &TS =
- static_cast<PPCTargetStreamer &>(OutStreamer.getTargetStreamer());
+ static_cast<PPCTargetStreamer &>(*OutStreamer.getTargetStreamer());
if (isPPC64 && !TOC.empty()) {
const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".toc",
@@ -861,13 +853,12 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
if (Subtarget.isPPC64() && Directive < PPC::DIR_64)
Directive = PPC::DIR_64;
assert(Directive <= PPC::DIR_64 && "Directive out of range.");
-
- // FIXME: This is a total hack, finish mc'izing the PPC backend.
- if (OutStreamer.hasRawTextSupport()) {
- assert(Directive < array_lengthof(CPUDirectives) &&
- "CPUDirectives[] might not be up-to-date!");
- OutStreamer.EmitRawText("\t.machine " + Twine(CPUDirectives[Directive]));
- }
+
+ assert(Directive < array_lengthof(CPUDirectives) &&
+ "CPUDirectives[] might not be up-to-date!");
+ PPCTargetStreamer &TStreamer =
+ *static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer());
+ TStreamer.emitMachine(CPUDirectives[Directive]);
// Prime text sections so they are adjacent. This reduces the likelihood a
// large data or debug section causes a branch to exceed 16M limit.
@@ -877,14 +868,14 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
if (TM.getRelocationModel() == Reloc::PIC_) {
OutStreamer.SwitchSection(
OutContext.getMachOSection("__TEXT", "__picsymbolstub1",
- MCSectionMachO::S_SYMBOL_STUBS |
- MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ MachO::S_SYMBOL_STUBS |
+ MachO::S_ATTR_PURE_INSTRUCTIONS,
32, SectionKind::getText()));
} else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) {
OutStreamer.SwitchSection(
OutContext.getMachOSection("__TEXT","__symbol_stub1",
- MCSectionMachO::S_SYMBOL_STUBS |
- MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ MachO::S_SYMBOL_STUBS |
+ MachO::S_ATTR_PURE_INSTRUCTIONS,
16, SectionKind::getText()));
}
OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
@@ -916,8 +907,8 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
if (TM.getRelocationModel() == Reloc::PIC_) {
const MCSection *StubSection =
OutContext.getMachOSection("__TEXT", "__picsymbolstub1",
- MCSectionMachO::S_SYMBOL_STUBS |
- MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ MachO::S_SYMBOL_STUBS |
+ MachO::S_ATTR_PURE_INSTRUCTIONS,
32, SectionKind::getText());
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
OutStreamer.SwitchSection(StubSection);
@@ -937,32 +928,32 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
MCBinaryExpr::CreateSub(LazyPtrExpr, Anon, OutContext);
// mflr r0
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::MFLR).addReg(PPC::R0));
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R0));
// bcl 20, 31, AnonSymbol
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::BCLalways).addExpr(Anon));
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCLalways).addExpr(Anon));
OutStreamer.EmitLabel(AnonSymbol);
// mflr r11
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::MFLR).addReg(PPC::R11));
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R11));
// addis r11, r11, ha16(LazyPtr - AnonSymbol)
const MCExpr *SubHa16 = PPCMCExpr::CreateHa(Sub, isDarwin, OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS)
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS)
.addReg(PPC::R11)
.addReg(PPC::R11)
.addExpr(SubHa16));
// mtlr r0
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::MTLR).addReg(PPC::R0));
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTLR).addReg(PPC::R0));
// ldu r12, lo16(LazyPtr - AnonSymbol)(r11)
// lwzu r12, lo16(LazyPtr - AnonSymbol)(r11)
const MCExpr *SubLo16 = PPCMCExpr::CreateLo(Sub, isDarwin, OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
+ EmitToStreamer(OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
.addReg(PPC::R12)
.addExpr(SubLo16).addExpr(SubLo16)
.addReg(PPC::R11));
// mtctr r12
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
// bctr
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::BCTR));
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTR));
OutStreamer.SwitchSection(LSPSection);
OutStreamer.EmitLabel(LazyPtr);
@@ -984,8 +975,8 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
const MCSection *StubSection =
OutContext.getMachOSection("__TEXT","__symbol_stub1",
- MCSectionMachO::S_SYMBOL_STUBS |
- MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ MachO::S_SYMBOL_STUBS |
+ MachO::S_ATTR_PURE_INSTRUCTIONS,
16, SectionKind::getText());
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
MCSymbol *Stub = Stubs[i].first;
@@ -1001,7 +992,7 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
// lis r11, ha16(LazyPtr)
const MCExpr *LazyPtrHa16 =
PPCMCExpr::CreateHa(LazyPtrExpr, isDarwin, OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::LIS)
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LIS)
.addReg(PPC::R11)
.addExpr(LazyPtrHa16));
@@ -1009,15 +1000,15 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
// lwzu r12, lo16(LazyPtr)(r11)
const MCExpr *LazyPtrLo16 =
PPCMCExpr::CreateLo(LazyPtrExpr, isDarwin, OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
+ EmitToStreamer(OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
.addReg(PPC::R12)
.addExpr(LazyPtrLo16).addExpr(LazyPtrLo16)
.addReg(PPC::R11));
// mtctr r12
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
// bctr
- OutStreamer.EmitInstruction(MCInstBuilder(PPC::BCTR));
+ EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTR));
OutStreamer.SwitchSection(LSPSection);
OutStreamer.EmitLabel(LazyPtr);
@@ -1058,7 +1049,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
for (std::vector<const Function*>::const_iterator I = Personalities.begin(),
E = Personalities.end(); I != E; ++I) {
if (*I) {
- MCSymbol *NLPSym = GetSymbolWithGlobalValueBase(*I, "$non_lazy_ptr");
+ MCSymbol *NLPSym = getSymbolWithGlobalValueBase(*I, "$non_lazy_ptr");
MachineModuleInfoImpl::StubValueTy &StubSym =
MMIMacho.getGVStubEntry(NLPSym);
StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(*I), true);
@@ -1147,4 +1138,5 @@ static AsmPrinter *createPPCAsmPrinterPass(TargetMachine &tm,
extern "C" void LLVMInitializePowerPCAsmPrinter() {
TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass);
TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass);
+ TargetRegistry::RegisterAsmPrinter(ThePPC64LETarget, createPPCAsmPrinterPass);
}
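On the pervasive EmitInstruction-to-EmitToStreamer change in this file: EmitToStreamer is assumed to be a thin AsmPrinter helper that forwards to the streamer while supplying the extra state (such as subtarget info) the MC layer now threads through, so the per-instruction call sites no longer need to know about it; the exact wrapper signature is an assumption here, not something this patch shows.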
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index 3e608ca..9276211 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -115,6 +115,9 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
MachineBasicBlock *Dest = 0;
if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm())
Dest = I->getOperand(2).getMBB();
+ else if ((I->getOpcode() == PPC::BC || I->getOpcode() == PPC::BCn) &&
+ !I->getOperand(1).isImm())
+ Dest = I->getOperand(1).getMBB();
else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ ||
I->getOpcode() == PPC::BDZ8 || I->getOpcode() == PPC::BDZ) &&
!I->getOperand(0).isImm())
@@ -166,6 +169,12 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
// Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
BuildMI(MBB, I, dl, TII->get(PPC::BCC))
.addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
+ } else if (I->getOpcode() == PPC::BC) {
+ unsigned CRBit = I->getOperand(0).getReg();
+ BuildMI(MBB, I, dl, TII->get(PPC::BCn)).addReg(CRBit).addImm(2);
+ } else if (I->getOpcode() == PPC::BCn) {
+ unsigned CRBit = I->getOperand(0).getReg();
+ BuildMI(MBB, I, dl, TII->get(PPC::BC)).addReg(CRBit).addImm(2);
} else if (I->getOpcode() == PPC::BDNZ) {
BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2);
} else if (I->getOpcode() == PPC::BDNZ8) {
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 4224ae2..9c5db50 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -26,28 +26,28 @@
#define DEBUG_TYPE "ctrloops"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/Statistic.h"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/Dominators.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/PassSupport.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "PPCTargetMachine.h"
-#include "PPC.h"
#ifndef NDEBUG
#include "llvm/CodeGen/MachineDominators.h"
@@ -96,8 +96,8 @@ namespace {
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<LoopInfo>();
AU.addPreserved<LoopInfo>();
- AU.addRequired<DominatorTree>();
- AU.addPreserved<DominatorTree>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<ScalarEvolution>();
}
@@ -109,7 +109,7 @@ namespace {
PPCTargetMachine *TM;
LoopInfo *LI;
ScalarEvolution *SE;
- DataLayout *TD;
+ const DataLayout *DL;
DominatorTree *DT;
const TargetLibraryInfo *LibInfo;
};
@@ -145,7 +145,7 @@ namespace {
INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
@@ -170,8 +170,9 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() {
bool PPCCTRLoops::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfo>();
SE = &getAnalysis<ScalarEvolution>();
- DT = &getAnalysis<DominatorTree>();
- TD = getAnalysisIfAvailable<DataLayout>();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ DL = DLP ? &DLP->getDataLayout() : 0;
LibInfo = getAnalysisIfAvailable<TargetLibraryInfo>();
bool MadeChange = false;
@@ -186,6 +187,13 @@ bool PPCCTRLoops::runOnFunction(Function &F) {
return MadeChange;
}
+static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
+ if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+ return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
+
+ return false;
+}
+
bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
J != JE; ++J) {
@@ -352,13 +360,11 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
CastInst *CI = cast<CastInst>(J);
if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
- (TT.isArch32Bit() &&
- (CI->getSrcTy()->getScalarType()->isIntegerTy(64) ||
- CI->getDestTy()->getScalarType()->isIntegerTy(64))
- ))
+ isLargeIntegerTy(TT.isArch32Bit(), CI->getSrcTy()->getScalarType()) ||
+ isLargeIntegerTy(TT.isArch32Bit(), CI->getDestTy()->getScalarType()))
return true;
- } else if (TT.isArch32Bit() &&
- J->getType()->getScalarType()->isIntegerTy(64) &&
+ } else if (isLargeIntegerTy(TT.isArch32Bit(),
+ J->getType()->getScalarType()) &&
(J->getOpcode() == Instruction::UDiv ||
J->getOpcode() == Instruction::SDiv ||
J->getOpcode() == Instruction::URem ||
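To pin down what the new isLargeIntegerTy helper treats as "large" (a sketch; Ctx stands for an LLVMContext and the calls are illustrative only): anything wider than the native GPR counts, so i64 is large on a 32-bit target but not on a 64-bit one, and non-integer types are never large.

    isLargeIntegerTy(/*Is32Bit=*/true,  Type::getInt64Ty(Ctx));   // true
    isLargeIntegerTy(/*Is32Bit=*/false, Type::getInt64Ty(Ctx));   // false
    isLargeIntegerTy(/*Is32Bit=*/false, Type::getInt128Ty(Ctx));  // true
    isLargeIntegerTy(/*Is32Bit=*/true,  Type::getDoubleTy(Ctx));  // false (not an integer)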
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index e8e7f4c..d48164d 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -15,6 +15,8 @@
/// CCIfSubtarget - Match if the current subtarget has a feature F.
class CCIfSubtarget<string F, CCAction A>
: CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>;
+class CCIfNotSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("!State.getTarget().getSubtarget<PPCSubtarget>().", F), A>;
//===----------------------------------------------------------------------===//
// Return Value Calling Convention
@@ -23,7 +25,8 @@ class CCIfSubtarget<string F, CCAction A>
// Return-value convention for PowerPC
def RetCC_PPC : CallingConv<[
// On PPC64, integer return values are always promoted to i64
- CCIfType<[i32], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
+ CCIfType<[i32, i1], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
+ CCIfType<[i1], CCIfNotSubtarget<"isPPC64()", CCPromoteToType<i32>>>,
CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
@@ -33,7 +36,8 @@ def RetCC_PPC : CallingConv<[
CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4]>>,
// Vector types are always returned in V2.
- CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>
+ CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>,
+ CCIfType<[v2f64, v2i64], CCAssignToReg<[VSH2]>>
]>;
@@ -46,6 +50,7 @@ def RetCC_PPC : CallingConv<[
// Only handle ints and floats. All ints are promoted to i64.
// Vector types and quadword ints are not handled.
def CC_PPC64_ELF_FIS : CallingConv<[
+ CCIfType<[i1], CCPromoteToType<i64>>,
CCIfType<[i8], CCPromoteToType<i64>>,
CCIfType<[i16], CCPromoteToType<i64>>,
CCIfType<[i32], CCPromoteToType<i64>>,
@@ -58,6 +63,7 @@ def CC_PPC64_ELF_FIS : CallingConv<[
// and multiple register returns are "supported" to avoid compile
// errors, but none are handled by the fast selector.
def RetCC_PPC64_ELF_FIS : CallingConv<[
+ CCIfType<[i1], CCPromoteToType<i64>>,
CCIfType<[i8], CCPromoteToType<i64>>,
CCIfType<[i16], CCPromoteToType<i64>>,
CCIfType<[i32], CCPromoteToType<i64>>,
@@ -65,7 +71,8 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[
CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
CCIfType<[f32], CCAssignToReg<[F1, F2]>>,
CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4]>>,
- CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>
+ CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>,
+ CCIfType<[v2f64, v2i64], CCAssignToReg<[VSH2]>>
]>;
//===----------------------------------------------------------------------===//
@@ -73,6 +80,8 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_PPC32_SVR4_Common : CallingConv<[
+ CCIfType<[i1], CCPromoteToType<i32>>,
+
// The ABI requires i64 to be passed in two adjacent registers with the first
// register having an odd register number.
CCIfType<[i32], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>,
@@ -97,7 +106,7 @@ def CC_PPC32_SVR4_Common : CallingConv<[
CCIfType<[f32,f64], CCAssignToStack<8, 8>>,
// Vectors get 16-byte stack slots that are 16-byte aligned.
- CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToStack<16, 16>>
+ CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>
]>;
// This calling convention puts vector arguments always on the stack. It is used
@@ -113,6 +122,9 @@ def CC_PPC32_SVR4 : CallingConv<[
// The first 12 Vector arguments are passed in AltiVec registers.
CCIfType<[v16i8, v8i16, v4i32, v4f32],
CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13]>>,
+ CCIfType<[v2f64, v2i64],
+ CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9,
+ VSH10, VSH11, VSH12, VSH13]>>,
CCDelegateTo<CC_PPC32_SVR4_Common>
]>;
diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp
index 418736e..84fc888 100644
--- a/lib/Target/PowerPC/PPCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -121,7 +121,8 @@ void PPCCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) {
default:
MCE.emitWordBE(getBinaryCodeForInstr(MI));
break;
- case TargetOpcode::PROLOG_LABEL:
+ case TargetOpcode::CFI_INSTRUCTION:
+ break;
case TargetOpcode::EH_LABEL:
MCE.emitLabel(MI.getOperand(0).getMCSymbol());
break;
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 09117e7..dd45683 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -15,10 +15,10 @@
#define DEBUG_TYPE "ppcfastisel"
#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "PPCISelLowering.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
-#include "MCTargetDesc/PPCPredicates.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FastISel.h"
@@ -28,12 +28,12 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -80,7 +80,7 @@ typedef struct Address {
}
} Address;
-class PPCFastISel : public FastISel {
+class PPCFastISel final : public FastISel {
const TargetMachine &TM;
const TargetInstrInfo &TII;
@@ -127,7 +127,6 @@ class PPCFastISel : public FastISel {
bool SelectStore(const Instruction *I);
bool SelectBranch(const Instruction *I);
bool SelectIndirectBr(const Instruction *I);
- bool SelectCmp(const Instruction *I);
bool SelectFPExt(const Instruction *I);
bool SelectFPTrunc(const Instruction *I);
bool SelectIToFP(const Instruction *I, bool IsSigned);
@@ -325,11 +324,11 @@ bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
II != IE; ++II, ++GTI) {
const Value *Op = *II;
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
- const StructLayout *SL = TD.getStructLayout(STy);
+ const StructLayout *SL = DL.getStructLayout(STy);
unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
TmpOffset += SL->getElementOffset(Idx);
} else {
- uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
for (;;) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
@@ -407,7 +406,7 @@ void PPCFastISel::PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
// register and continue. This should almost never happen.
if (!UseOffset && Addr.BaseType == Address::FrameIndexBase) {
unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDI8),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8),
ResultReg).addFrameIndex(Addr.Base.FI).addImm(0);
Addr.Base.Reg = ResultReg;
Addr.BaseType = Address::RegBase;
@@ -499,13 +498,13 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI),
MFI.getObjectAlignment(Addr.Base.FI));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
// Base reg with offset in range.
} else if (UseOffset) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addImm(Addr.Offset).addReg(Addr.Base.Reg);
// Indexed form.
@@ -529,7 +528,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
case PPC::LFS: Opc = PPC::LFSX; break;
case PPC::LFD: Opc = PPC::LFDX; break;
}
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(Addr.Base.Reg).addReg(IndexReg);
}
@@ -615,12 +614,15 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI),
MFI.getObjectAlignment(Addr.Base.FI));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc)).addReg(SrcReg)
- .addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addReg(SrcReg)
+ .addImm(Addr.Offset)
+ .addFrameIndex(Addr.Base.FI)
+ .addMemOperand(MMO);
// Base reg with offset in range.
} else if (UseOffset)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
.addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
// Indexed form.
@@ -640,7 +642,7 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
case PPC::STFS: Opc = PPC::STFSX; break;
case PPC::STFD: Opc = PPC::STFDX; break;
}
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
.addReg(SrcReg).addReg(Addr.Base.Reg).addReg(IndexReg);
}
@@ -704,9 +706,9 @@ bool PPCFastISel::SelectBranch(const Instruction *I) {
CondReg))
return false;
- BuildMI(*BrBB, FuncInfo.InsertPt, DL, TII.get(PPC::BCC))
+ BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC))
.addImm(PPCPred).addReg(CondReg).addMBB(TBB);
- FastEmitBranch(FBB, DL);
+ FastEmitBranch(FBB, DbgLoc);
FuncInfo.MBB->addSuccessor(TBB);
return true;
@@ -714,7 +716,7 @@ bool PPCFastISel::SelectBranch(const Instruction *I) {
dyn_cast<ConstantInt>(BI->getCondition())) {
uint64_t Imm = CI->getZExtValue();
MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
- FastEmitBranch(Target, DL);
+ FastEmitBranch(Target, DbgLoc);
return true;
}
@@ -737,6 +739,9 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
return false;
MVT SrcVT = SrcEVT.getSimpleVT();
+ if (SrcVT == MVT::i1 && PPCSubTarget.useCRBits())
+ return false;
+
// See if operand 2 is an immediate encodeable in the compare.
// FIXME: Operands are not in canonical order at -O0, so an immediate
// operand in position 1 is a lost opportunity for now. We are
@@ -811,10 +816,10 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
}
if (!UseImm)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc), DestReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg)
.addReg(SrcReg1).addReg(SrcReg2);
else
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc), DestReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg)
.addReg(SrcReg1).addImm(Imm);
return true;
@@ -853,7 +858,7 @@ bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
// Round the result to single precision.
unsigned DestReg = createResultReg(&PPC::F4RCRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::FRSP), DestReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP), DestReg)
.addReg(SrcReg);
UpdateValueMap(I, DestReg);
@@ -892,11 +897,13 @@ unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
unsigned LoadOpc = PPC::LFD;
if (SrcVT == MVT::i32) {
- Addr.Offset = 4;
- if (!IsSigned)
+ if (!IsSigned) {
LoadOpc = PPC::LFIWZX;
- else if (PPCSubTarget.hasLFIWAX())
+ Addr.Offset = 4;
+ } else if (PPCSubTarget.hasLFIWAX()) {
LoadOpc = PPC::LFIWAX;
+ Addr.Offset = 4;
+ }
}
const TargetRegisterClass *RC = &PPC::F8RCRegClass;
@@ -970,7 +977,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
Opc = IsSigned ? PPC::FCFID : PPC::FCFIDU;
// Generate the convert.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addReg(FPReg);
UpdateValueMap(I, DestReg);
@@ -1042,7 +1049,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
const TargetRegisterClass *InRC = MRI.getRegClass(SrcReg);
if (InRC == &PPC::F4RCRegClass) {
unsigned TmpReg = createResultReg(&PPC::F8RCRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY_TO_REGCLASS), TmpReg)
.addReg(SrcReg).addImm(PPC::F8RCRegClassID);
SrcReg = TmpReg;
@@ -1062,7 +1069,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
// Generate the convert.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addReg(SrcReg);
// Now move the integer value from a float register to an integer register.
@@ -1155,8 +1162,10 @@ bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
}
if (UseImm) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
- .addReg(SrcReg1).addImm(Imm);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg)
+ .addReg(SrcReg1)
+ .addImm(Imm);
UpdateValueMap(I, ResultReg);
return true;
}
@@ -1171,7 +1180,7 @@ bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
if (ISDOpcode == ISD::SUB)
std::swap(SrcReg1, SrcReg2);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(SrcReg1).addReg(SrcReg2);
UpdateValueMap(I, ResultReg);
return true;
@@ -1198,7 +1207,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
// Skip vector arguments for now, as well as long double and
// uint128_t, and anything that isn't passed in a register.
- if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 ||
+ if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 || ArgVT == MVT::i1 ||
!VA.isRegLoc() || VA.needsCustom())
return false;
@@ -1211,7 +1220,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
NumBytes = CCInfo.getNextStackOffset();
// Issue CALLSEQ_START.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TII.getCallFrameSetupOpcode()))
.addImm(NumBytes);
@@ -1270,9 +1279,9 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
++NextGPR;
} else
ArgReg = NextGPR++;
-
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
- ArgReg).addReg(Arg);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ArgReg).addReg(Arg);
RegArgs.push_back(ArgReg);
}
@@ -1285,7 +1294,7 @@ void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
const Instruction *I, CallingConv::ID CC,
unsigned &NumBytes, bool IsVarArg) {
// Issue CallSEQ_END.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TII.getCallFrameDestroyOpcode()))
.addImm(NumBytes).addImm(0);
@@ -1315,14 +1324,14 @@ void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT);
ResultReg = createResultReg(CpyRC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(SourcePhysReg);
// If necessary, round the floating result to single precision.
} else if (CopyVT == MVT::f64) {
ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::FRSP),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP),
ResultReg).addReg(SourcePhysReg);
// If only the low half of a general register is needed, generate
@@ -1333,7 +1342,7 @@ void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
ResultReg = createResultReg(&PPC::GPRCRegClass);
// Convert physical register from G8RC to GPRC.
SourcePhysReg -= PPC::X0 - PPC::R0;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(SourcePhysReg);
}
@@ -1440,7 +1449,7 @@ bool PPCFastISel::SelectCall(const Instruction *I) {
if (Arg == 0)
return false;
- unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+ unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
Flags.setOrigAlign(OriginalAlignment);
Args.push_back(*II);
@@ -1465,7 +1474,7 @@ bool PPCFastISel::SelectCall(const Instruction *I) {
// Build direct call with NOP for TOC restore.
// FIXME: We can and should optimize away the NOP for local calls.
- MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(PPC::BL8_NOP));
// Add callee.
MIB.addGlobalAddress(GV);
@@ -1522,8 +1531,8 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
const Constant *C = cast<Constant>(RV);
unsigned SrcReg = PPCMaterializeInt(C, MVT::i64);
unsigned RetReg = ValLocs[0].getLocReg();
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
- RetReg).addReg(SrcReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg);
RetRegs.push_back(RetReg);
} else {
@@ -1578,14 +1587,14 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
}
}
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), RetRegs[i])
.addReg(SrcReg);
}
}
}
- MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(PPC::BLR));
for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
@@ -1615,7 +1624,7 @@ bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
assert(DestVT == MVT::i64 && "Signed extend from i32 to i32??");
Opc = PPC::EXTSW_32_64;
}
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addReg(SrcReg);
// Unsigned 32-bit extensions use RLWINM.
@@ -1627,7 +1636,7 @@ bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
assert(SrcVT == MVT::i16 && "Unsigned extend from i32 to i32??");
MB = 16;
}
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::RLWINM),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLWINM),
DestReg)
.addReg(SrcReg).addImm(/*SH=*/0).addImm(MB).addImm(/*ME=*/31);
@@ -1640,7 +1649,7 @@ bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
MB = 48;
else
MB = 32;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(PPC::RLDICL_32_64), DestReg)
.addReg(SrcReg).addImm(/*SH=*/0).addImm(MB);
}
@@ -1654,9 +1663,9 @@ bool PPCFastISel::SelectIndirectBr(const Instruction *I) {
if (AddrReg == 0)
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::MTCTR8))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::MTCTR8))
.addReg(AddrReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::BCTR8));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCTR8));
const IndirectBrInst *IB = cast<IndirectBrInst>(I);
for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i)
@@ -1684,7 +1693,8 @@ bool PPCFastISel::SelectTrunc(const Instruction *I) {
// The only interesting case is when we need to switch register classes.
if (SrcVT == MVT::i64) {
unsigned ResultReg = createResultReg(&PPC::GPRCRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY),
ResultReg).addReg(SrcReg, 0, PPC::sub_32);
SrcReg = ResultReg;
}
@@ -1791,7 +1801,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
return 0;
// All FP constants are loaded from the constant pool.
- unsigned Align = TD.getPrefTypeAlignment(CFP->getType());
+ unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
assert(Align > 0 && "Unexpectedly missing alignment information!");
unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
@@ -1807,25 +1817,25 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
// For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)).
if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocCPT),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocCPT),
TmpReg)
.addConstantPoolIndex(Idx).addReg(PPC::X2);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addImm(0).addReg(TmpReg).addMemOperand(MMO);
} else {
// Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)).
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDIStocHA),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
TmpReg).addReg(PPC::X2).addConstantPoolIndex(Idx);
// But for large code model, we must generate a LDtocL followed
// by the LF[SD].
if (CModel == CodeModel::Large) {
unsigned TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocL),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
TmpReg2).addConstantPoolIndex(Idx).addReg(TmpReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addImm(0).addReg(TmpReg2);
} else
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addConstantPoolIndex(Idx, 0, PPCII::MO_TOC_LO)
.addReg(TmpReg)
.addMemOperand(MMO);
@@ -1853,7 +1863,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
if (!GVar) {
// If GV is an alias, use the aliasee for determining thread-locality.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false));
+ GVar = dyn_cast_or_null<GlobalVariable>(GA->getAliasedGlobal());
}
// FIXME: We don't yet handle the complexity of TLS.
@@ -1863,8 +1873,10 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
// For small code model, generate a simple TOC load.
if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtoc), DestReg)
- .addGlobalAddress(GV).addReg(PPC::X2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtoc),
+ DestReg)
+ .addGlobalAddress(GV)
+ .addReg(PPC::X2);
else {
// If the address is an externally defined symbol, a symbol with
// common or externally available linkage, a function address, or a
@@ -1875,7 +1887,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
// ADDItocL(ADDIStocHA(%X2, GV), GV)
// Either way, start with the ADDIStocHA:
unsigned HighPartReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDIStocHA),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
// !GVar implies a function address. An external variable is one
@@ -1884,11 +1896,11 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
// on the "if" path here.
if (CModel == CodeModel::Large || !GVar || !GVar->hasInitializer() ||
GVar->hasCommonLinkage() || GVar->hasAvailableExternallyLinkage())
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocL),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
DestReg).addGlobalAddress(GV).addReg(HighPartReg);
else
// Otherwise generate the ADDItocL.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDItocL),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDItocL),
DestReg).addReg(HighPartReg).addGlobalAddress(GV);
}
@@ -1906,21 +1918,21 @@ unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm,
bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
if (isInt<16>(Imm))
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(IsGPRC ? PPC::LI : PPC::LI8), ResultReg)
.addImm(Imm);
else if (Lo) {
// Both Lo and Hi have nonzero bits.
unsigned TmpReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), TmpReg)
.addImm(Hi);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(IsGPRC ? PPC::ORI : PPC::ORI8), ResultReg)
.addReg(TmpReg).addImm(Lo);
} else
// Just Hi bits.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), ResultReg)
.addImm(Hi);
@@ -1960,7 +1972,7 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm,
unsigned TmpReg2;
if (Imm) {
TmpReg2 = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::RLDICR),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLDICR),
TmpReg2).addReg(TmpReg1).addImm(Shift).addImm(63 - Shift);
} else
TmpReg2 = TmpReg1;
@@ -1968,14 +1980,14 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm,
unsigned TmpReg3, Hi, Lo;
if ((Hi = (Remainder >> 16) & 0xFFFF)) {
TmpReg3 = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ORIS8),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORIS8),
TmpReg3).addReg(TmpReg2).addImm(Hi);
} else
TmpReg3 = TmpReg2;
if ((Lo = Remainder & 0xFFFF)) {
unsigned ResultReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ORI8),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORI8),
ResultReg).addReg(TmpReg3).addImm(Lo);
return ResultReg;
}
@@ -1987,6 +1999,15 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm,
// Materialize an integer constant into a register, and return
// the register number (or zero if we failed to handle it).
unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT) {
+ // If we're using CR bit registers for i1 values, handle that as a special
+ // case first.
+ if (VT == MVT::i1 && PPCSubTarget.useCRBits()) {
+ const ConstantInt *CI = cast<ConstantInt>(C);
+ unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg);
+ return ImmReg;
+ }
if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 &&
VT != MVT::i8 && VT != MVT::i1)
@@ -2000,7 +2021,7 @@ unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT) {
if (isInt<16>(CI->getSExtValue())) {
unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
unsigned ImmReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ImmReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
.addImm(CI->getSExtValue());
return ImmReg;
}
@@ -2049,7 +2070,7 @@ unsigned PPCFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
if (SI != FuncInfo.StaticAllocaMap.end()) {
unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDI8),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8),
ResultReg).addFrameIndex(SI->second).addImm(0);
return ResultReg;
}
@@ -2152,6 +2173,15 @@ unsigned PPCFastISel::FastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
if (Opc != ISD::Constant)
return 0;
+ // If we're using CR bit registers for i1 values, handle that as a special
+ // case first.
+ if (VT == MVT::i1 && PPCSubTarget.useCRBits()) {
+ unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Imm == 0 ? PPC::CRUNSET : PPC::CRSET), ImmReg);
+ return ImmReg;
+ }
+
if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 &&
VT != MVT::i8 && VT != MVT::i1)
return 0;
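
The two i1 special cases added above share one rule: when the subtarget models i1 values in condition-register bits, a boolean constant is materialized with CRSET/CRUNSET instead of a load-immediate of 0 or 1. A minimal standalone sketch of that decision, assuming illustrative opcode strings rather than the LLVM opcode enums:

    // Sketch of the i1-materialization rule: with CR bits in use, a boolean
    // constant becomes CRSET/CRUNSET of a CR bit rather than an LI of 0 or 1.
    // Opcode names are illustrative strings, not LLVM opcode enums.
    #include <cstdint>
    #include <iostream>
    #include <string>

    std::string materializeIntOpcode(unsigned BitWidth, int64_t Imm, bool UseCRBits) {
      // i1 constants are handled as a special case before the integer paths.
      if (BitWidth == 1 && UseCRBits)
        return Imm == 0 ? "CRUNSET" : "CRSET";
      // Small constants fit a single load-immediate; wider ones need a sequence.
      if (Imm >= -32768 && Imm <= 32767)
        return BitWidth > 32 ? "LI8" : "LI";
      return "<multi-instruction sequence>";
    }

    int main() {
      std::cout << materializeIntOpcode(1, 1, true) << "\n";    // CRSET
      std::cout << materializeIntOpcode(1, 0, true) << "\n";    // CRUNSET
      std::cout << materializeIntOpcode(32, 42, false) << "\n"; // LI
      return 0;
    }
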
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 0ac2ced..d8f491f 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -353,9 +353,6 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
assert((isDarwinABI || isSVR4ABI) &&
"Currently only Darwin and SVR4 ABIs are supported for PowerPC.");
- // Prepare for frame info.
- MCSymbol *FrameLabel = 0;
-
// Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it,
// process it.
if (!isSVR4ABI)
@@ -561,36 +558,37 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
// Add the "machine moves" for the instructions we generated above, but in
// reverse order.
if (needsFrameMoves) {
- // Mark effective beginning of when frame pointer becomes valid.
- FrameLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl, TII.get(PPC::PROLOG_LABEL)).addSym(FrameLabel);
-
// Show update of SP.
assert(NegFrameSize);
- MMI.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(FrameLabel, NegFrameSize));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, NegFrameSize));
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
if (HasFP) {
unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
- MMI.addFrameInst(
- MCCFIInstruction::createOffset(FrameLabel, Reg, FPOffset));
+ CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, FPOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
if (HasBP) {
unsigned Reg = MRI->getDwarfRegNum(BPReg, true);
- MMI.addFrameInst(
- MCCFIInstruction::createOffset(FrameLabel, Reg, BPOffset));
+ CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, BPOffset));
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
if (MustSaveLR) {
unsigned Reg = MRI->getDwarfRegNum(LRReg, true);
- MMI.addFrameInst(
- MCCFIInstruction::createOffset(FrameLabel, Reg, LROffset));
+ CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, LROffset));
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
}
- MCSymbol *ReadyLabel = 0;
-
// If there is a frame pointer, copy R1 into R31
if (HasFP) {
BuildMI(MBB, MBBI, dl, OrInst, FPReg)
@@ -598,19 +596,17 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
.addReg(SPReg);
if (needsFrameMoves) {
- ReadyLabel = MMI.getContext().CreateTempSymbol();
-
// Mark effective beginning of when frame pointer is ready.
- BuildMI(MBB, MBBI, dl, TII.get(PPC::PROLOG_LABEL)).addSym(ReadyLabel);
-
unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
- MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(ReadyLabel, Reg));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
}
if (needsFrameMoves) {
- MCSymbol *Label = HasFP ? ReadyLabel : FrameLabel;
-
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
@@ -631,14 +627,18 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
// For 64-bit SVR4 when we have spilled CRs, the spill location
// is SP+8, not a frame-relative slot.
if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
- MMI.addFrameInst(MCCFIInstruction::createOffset(
- Label, MRI->getDwarfRegNum(PPC::CR2, true), 8));
+ unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(PPC::CR2, true), 8));
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
continue;
}
int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
- MMI.addFrameInst(MCCFIInstruction::createOffset(
- Label, MRI->getDwarfRegNum(Reg, true), Offset));
+ unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
}
}
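
The prologue changes above replace the old PROLOG_LABEL/MCSymbol scheme with CFI_INSTRUCTION pseudo-instructions that refer to entries in a per-function frame-instruction table. A small standalone model of that flow, assuming stand-in types for the frame-info table and the instruction stream rather than the LLVM MachineModuleInfo/BuildMI API:

    // Model of the new CFI-emission pattern: each directive is recorded in a
    // per-function table, and a CFI_INSTRUCTION pseudo carrying the table index
    // is inserted into the instruction stream at the point it takes effect.
    #include <cstdio>
    #include <string>
    #include <vector>

    struct CFIDirective { std::string Kind; int Reg; int Offset; };

    struct FrameInfo {
      std::vector<CFIDirective> Directives;
      unsigned add(const CFIDirective &D) {   // plays the role of MMI.addFrameInst
        Directives.push_back(D);
        return Directives.size() - 1;
      }
    };

    struct InstrStream {
      std::vector<std::string> Instrs;
      void emitCFI(unsigned Index) {          // plays the role of the CFI_INSTRUCTION pseudo
        Instrs.push_back("CFI_INSTRUCTION #" + std::to_string(Index));
      }
    };

    int main() {
      FrameInfo MMI;
      InstrStream MBB;
      // "Show update of SP": def_cfa_offset, then an offset record for the FP save.
      MBB.emitCFI(MMI.add({"def_cfa_offset", -1, -128}));
      MBB.emitCFI(MMI.add({"offset", /*FPReg=*/31, -8}));
      for (const auto &I : MBB.Instrs)
        std::printf("%s\n", I.c_str());
      return 0;
    }

The label bookkeeping disappears because the pseudo-instruction's position in the stream now records where each directive becomes valid.
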
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index 0df50e1..37c85b3 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -15,34 +15,220 @@
#include "PPCHazardRecognizers.h"
#include "PPC.h"
#include "PPCInstrInfo.h"
+#include "PPCTargetMachine.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-//===----------------------------------------------------------------------===//
-// PowerPC Scoreboard Hazard Recognizer
-void PPCScoreboardHazardRecognizer::EmitInstruction(SUnit *SU) {
+bool PPCDispatchGroupSBHazardRecognizer::isLoadAfterStore(SUnit *SU) {
+ // FIXME: Move this.
+ if (isBCTRAfterSet(SU))
+ return true;
+
const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
if (!MCID)
- // This is a PPC pseudo-instruction.
- return;
+ return false;
+
+ if (!MCID->mayLoad())
+ return false;
+
+  // SU is a load; if any predecessor in this dispatch group is a store with
+  // which we have an ordering dependency, return true.
+ for (unsigned i = 0, ie = (unsigned) SU->Preds.size(); i != ie; ++i) {
+ const MCInstrDesc *PredMCID = DAG->getInstrDesc(SU->Preds[i].getSUnit());
+ if (!PredMCID || !PredMCID->mayStore())
+ continue;
+
+ if (!SU->Preds[i].isNormalMemory() && !SU->Preds[i].isBarrier())
+ continue;
+
+ for (unsigned j = 0, je = CurGroup.size(); j != je; ++j)
+ if (SU->Preds[i].getSUnit() == CurGroup[j])
+ return true;
+ }
+
+ return false;
+}
+
+bool PPCDispatchGroupSBHazardRecognizer::isBCTRAfterSet(SUnit *SU) {
+ const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+ if (!MCID)
+ return false;
+
+ if (!MCID->isBranch())
+ return false;
+
+  // SU is a branch; if any predecessor in this dispatch group sets the counter
+  // register and feeds SU through a data dependence, return true.
+ for (unsigned i = 0, ie = (unsigned) SU->Preds.size(); i != ie; ++i) {
+ const MCInstrDesc *PredMCID = DAG->getInstrDesc(SU->Preds[i].getSUnit());
+ if (!PredMCID || PredMCID->getSchedClass() != PPC::Sched::IIC_SprMTSPR)
+ continue;
+
+ if (SU->Preds[i].isCtrl())
+ continue;
+
+ for (unsigned j = 0, je = CurGroup.size(); j != je; ++j)
+ if (SU->Preds[i].getSUnit() == CurGroup[j])
+ return true;
+ }
+
+ return false;
+}
+
+// FIXME: Remove this when we don't need this:
+namespace llvm { namespace PPC { extern int getNonRecordFormOpcode(uint16_t); } }
+
+// FIXME: A lot of code in PPCDispatchGroupSBHazardRecognizer is P7 specific.
+
+bool PPCDispatchGroupSBHazardRecognizer::mustComeFirst(const MCInstrDesc *MCID,
+ unsigned &NSlots) {
+ // FIXME: Indirectly, this information is contained in the itinerary, and
+ // we should derive it from there instead of separately specifying it
+ // here.
+ unsigned IIC = MCID->getSchedClass();
+ switch (IIC) {
+ default:
+ NSlots = 1;
+ break;
+ case PPC::Sched::IIC_IntDivW:
+ case PPC::Sched::IIC_IntDivD:
+ case PPC::Sched::IIC_LdStLoadUpd:
+ case PPC::Sched::IIC_LdStLDU:
+ case PPC::Sched::IIC_LdStLFDU:
+ case PPC::Sched::IIC_LdStLFDUX:
+ case PPC::Sched::IIC_LdStLHA:
+ case PPC::Sched::IIC_LdStLHAU:
+ case PPC::Sched::IIC_LdStLWA:
+ case PPC::Sched::IIC_LdStSTDU:
+ case PPC::Sched::IIC_LdStSTFDU:
+ NSlots = 2;
+ break;
+ case PPC::Sched::IIC_LdStLoadUpdX:
+ case PPC::Sched::IIC_LdStLDUX:
+ case PPC::Sched::IIC_LdStLHAUX:
+ case PPC::Sched::IIC_LdStLWARX:
+ case PPC::Sched::IIC_LdStLDARX:
+ case PPC::Sched::IIC_LdStSTDUX:
+ case PPC::Sched::IIC_LdStSTDCX:
+ case PPC::Sched::IIC_LdStSTWCX:
+ case PPC::Sched::IIC_BrMCRX: // mtcr
+ // FIXME: Add sync/isync (here and in the itinerary).
+ NSlots = 4;
+ break;
+ }
- ScoreboardHazardRecognizer::EmitInstruction(SU);
+ // FIXME: record-form instructions need a different itinerary class.
+ if (NSlots == 1 && PPC::getNonRecordFormOpcode(MCID->getOpcode()) != -1)
+ NSlots = 2;
+
+ switch (IIC) {
+ default:
+ // All multi-slot instructions must come first.
+ return NSlots > 1;
+ case PPC::Sched::IIC_BrCR: // cr logicals
+ case PPC::Sched::IIC_SprMFCR:
+ case PPC::Sched::IIC_SprMFCRF:
+ case PPC::Sched::IIC_SprMTSPR:
+ return true;
+ }
}
ScheduleHazardRecognizer::HazardType
-PPCScoreboardHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+PPCDispatchGroupSBHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+ if (Stalls == 0 && isLoadAfterStore(SU))
+ return NoopHazard;
+
return ScoreboardHazardRecognizer::getHazardType(SU, Stalls);
}
-void PPCScoreboardHazardRecognizer::AdvanceCycle() {
- ScoreboardHazardRecognizer::AdvanceCycle();
+bool PPCDispatchGroupSBHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
+ const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+ unsigned NSlots;
+ if (MCID && mustComeFirst(MCID, NSlots) && CurSlots)
+ return true;
+
+ return ScoreboardHazardRecognizer::ShouldPreferAnother(SU);
+}
+
+unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) {
+ // We only need to fill out a maximum of 5 slots here: The 6th slot could
+ // only be a second branch, and otherwise the next instruction will start a
+ // new group.
+ if (isLoadAfterStore(SU) && CurSlots < 6) {
+ unsigned Directive =
+ DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+ // If we're using a special group-terminating nop, then we need only one.
+ if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7)
+ return 1;
+
+ return 5 - CurSlots;
+ }
+
+ return ScoreboardHazardRecognizer::PreEmitNoops(SU);
+}
+
+void PPCDispatchGroupSBHazardRecognizer::EmitInstruction(SUnit *SU) {
+ const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+ if (MCID) {
+ if (CurSlots == 5 || (MCID->isBranch() && CurBranches == 1)) {
+ CurGroup.clear();
+ CurSlots = CurBranches = 0;
+ } else {
+ DEBUG(dbgs() << "**** Adding to dispatch group: SU(" <<
+ SU->NodeNum << "): ");
+ DEBUG(DAG->dumpNode(SU));
+
+ unsigned NSlots;
+ bool MustBeFirst = mustComeFirst(MCID, NSlots);
+
+ // If this instruction must come first, but does not, then it starts a
+ // new group.
+ if (MustBeFirst && CurSlots) {
+ CurSlots = CurBranches = 0;
+ CurGroup.clear();
+ }
+
+ CurSlots += NSlots;
+ CurGroup.push_back(SU);
+
+ if (MCID->isBranch())
+ ++CurBranches;
+ }
+ }
+
+ return ScoreboardHazardRecognizer::EmitInstruction(SU);
+}
+
+void PPCDispatchGroupSBHazardRecognizer::AdvanceCycle() {
+ return ScoreboardHazardRecognizer::AdvanceCycle();
+}
+
+void PPCDispatchGroupSBHazardRecognizer::RecedeCycle() {
+ llvm_unreachable("Bottom-up scheduling not supported");
}
-void PPCScoreboardHazardRecognizer::Reset() {
- ScoreboardHazardRecognizer::Reset();
+void PPCDispatchGroupSBHazardRecognizer::Reset() {
+ CurGroup.clear();
+ CurSlots = CurBranches = 0;
+ return ScoreboardHazardRecognizer::Reset();
+}
+
+void PPCDispatchGroupSBHazardRecognizer::EmitNoop() {
+ unsigned Directive =
+ DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+ // If the group has now filled all of its slots, or if we're using a special
+ // group-terminating nop, the group is complete.
+ if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
+ CurSlots == 6) {
+ CurGroup.clear();
+ CurSlots = CurBranches = 0;
+ } else {
+ CurGroup.push_back(0);
+ ++CurSlots;
+ }
}
//===----------------------------------------------------------------------===//
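
EmitInstruction and EmitNoop above reduce to a small piece of bookkeeping: a dispatch group holds up to five slots plus at most one extra branch, a multi-slot instruction that must lead a group forces a new one, and filling the group (or hitting a second branch) resets the counters. A standalone model of just that accounting, assuming plain integer IDs instead of SUnits and ignoring the ScoreboardHazardRecognizer base class:

    // Dispatch-group accounting as written in EmitInstruction above.
    #include <cstdio>
    #include <vector>

    struct GroupTracker {
      unsigned CurSlots = 0, CurBranches = 0;
      std::vector<int> CurGroup;

      void emit(int Id, unsigned NSlots, bool MustBeFirst, bool IsBranch) {
        if (CurSlots == 5 || (IsBranch && CurBranches == 1)) {
          // Group is full, or already carries a branch: reset the counters.
          CurGroup.clear();
          CurSlots = CurBranches = 0;
        } else {
          if (MustBeFirst && CurSlots) {   // must-be-first op not in slot 0
            CurGroup.clear();
            CurSlots = CurBranches = 0;
          }
          CurSlots += NSlots;
          CurGroup.push_back(Id);
          if (IsBranch)
            ++CurBranches;
        }
        std::printf("after SU(%d): slots=%u branches=%u\n", Id, CurSlots, CurBranches);
      }
    };

    int main() {
      GroupTracker T;
      T.emit(0, 1, false, false);  // simple one-slot op
      T.emit(1, 2, true,  false);  // two-slot op that must lead a group
      T.emit(2, 1, false, true);   // branch joins the current group
      return 0;
    }

PreEmitNoops then follows directly from the same state: on a load-after-store hazard it asks for 5 - CurSlots nops, or a single group-terminating nop on POWER6/POWER7.
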
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h
index 84b8e6d..6b7fe41 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.h
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -21,19 +21,30 @@
namespace llvm {
-/// PPCScoreboardHazardRecognizer - This class implements a scoreboard-based
-/// hazard recognizer for generic PPC processors.
-class PPCScoreboardHazardRecognizer : public ScoreboardHazardRecognizer {
+/// PPCDispatchGroupSBHazardRecognizer - This class implements a scoreboard-based
+/// hazard recognizer for PPC ooo processors with dispatch-group hazards.
+class PPCDispatchGroupSBHazardRecognizer : public ScoreboardHazardRecognizer {
const ScheduleDAG *DAG;
+ SmallVector<SUnit *, 7> CurGroup;
+ unsigned CurSlots, CurBranches;
+
+ bool isLoadAfterStore(SUnit *SU);
+ bool isBCTRAfterSet(SUnit *SU);
+ bool mustComeFirst(const MCInstrDesc *MCID, unsigned &NSlots);
public:
- PPCScoreboardHazardRecognizer(const InstrItineraryData *ItinData,
+ PPCDispatchGroupSBHazardRecognizer(const InstrItineraryData *ItinData,
const ScheduleDAG *DAG_) :
- ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_) {}
+ ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_),
+ CurSlots(0), CurBranches(0) {}
virtual HazardType getHazardType(SUnit *SU, int Stalls);
+ virtual bool ShouldPreferAnother(SUnit* SU);
+ virtual unsigned PreEmitNoops(SUnit *SU);
virtual void EmitInstruction(SUnit *SU);
virtual void AdvanceCycle();
+ virtual void RecedeCycle();
virtual void Reset();
+ virtual void EmitNoop();
};
/// PPCHazardRecognizer970 - This class defines a finite state automata that
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 6ba6af6..3bbc839 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -27,6 +27,7 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
@@ -34,6 +35,10 @@
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
+// FIXME: Remove this once the bug has been fixed!
+cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
+cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
+
namespace llvm {
void initializePPCDAGToDAGISelPass(PassRegistry&);
}
@@ -181,6 +186,12 @@ namespace {
private:
SDNode *SelectSETCC(SDNode *N);
+
+ void PeepholePPC64();
+ void PeepholdCROps();
+
+ bool AllUsersSelectZero(SDNode *N);
+ void SwapAllSelectUsers(SDNode *N);
};
}
@@ -261,11 +272,11 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
DebugLoc dl;
if (PPCLowering.getPointerTy() == MVT::i32) {
- GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+ GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass);
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
} else {
- GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RCRegClass);
+ GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RC_NOX0RegClass);
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
}
@@ -561,7 +572,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
Opc = PPC::FCMPUS;
} else {
assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
- Opc = PPC::FCMPUD;
+ Opc = PPCSubTarget.hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
}
return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
}
@@ -629,7 +640,8 @@ static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) {
// getVCmpInst: return the vector compare instruction for the specified
// vector type and condition code. Since this is for altivec specific code,
// only support the altivec types (v16i8, v8i16, v4i32, and v4f32).
-static unsigned int getVCmpInst(MVT::SimpleValueType VecVT, ISD::CondCode CC) {
+static unsigned int getVCmpInst(MVT::SimpleValueType VecVT, ISD::CondCode CC,
+ bool HasVSX) {
switch (CC) {
case ISD::SETEQ:
case ISD::SETUEQ:
@@ -643,7 +655,9 @@ static unsigned int getVCmpInst(MVT::SimpleValueType VecVT, ISD::CondCode CC) {
return PPC::VCMPEQUW;
// v4f32 != v4f32 could be translate to unordered not equal
else if (VecVT == MVT::v4f32)
- return PPC::VCMPEQFP;
+ return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
+ else if (VecVT == MVT::v2f64)
+ return PPC::XVCMPEQDP;
break;
case ISD::SETLT:
case ISD::SETGT:
@@ -656,7 +670,9 @@ static unsigned int getVCmpInst(MVT::SimpleValueType VecVT, ISD::CondCode CC) {
else if (VecVT == MVT::v4i32)
return PPC::VCMPGTSW;
else if (VecVT == MVT::v4f32)
- return PPC::VCMPGTFP;
+ return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP;
+ else if (VecVT == MVT::v2f64)
+ return PPC::XVCMPGTDP;
break;
case ISD::SETULT:
case ISD::SETUGT:
@@ -671,17 +687,23 @@ static unsigned int getVCmpInst(MVT::SimpleValueType VecVT, ISD::CondCode CC) {
break;
case ISD::SETOEQ:
if (VecVT == MVT::v4f32)
- return PPC::VCMPEQFP;
+ return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
+ else if (VecVT == MVT::v2f64)
+ return PPC::XVCMPEQDP;
break;
case ISD::SETOLT:
case ISD::SETOGT:
case ISD::SETOLE:
if (VecVT == MVT::v4f32)
- return PPC::VCMPGTFP;
+ return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP;
+ else if (VecVT == MVT::v2f64)
+ return PPC::XVCMPGTDP;
break;
case ISD::SETOGE:
if (VecVT == MVT::v4f32)
- return PPC::VCMPGEFP;
+ return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP;
+ else if (VecVT == MVT::v2f64)
+ return PPC::XVCMPGEDP;
break;
default:
break;
@@ -692,7 +714,7 @@ static unsigned int getVCmpInst(MVT::SimpleValueType VecVT, ISD::CondCode CC) {
// getVCmpEQInst: return the equal compare instruction for the specified vector
// type. Since this is for altivec specific code, only support the altivec
// types (v16i8, v8i16, v4i32, and v4f32).
-static unsigned int getVCmpEQInst(MVT::SimpleValueType VecVT) {
+static unsigned int getVCmpEQInst(MVT::SimpleValueType VecVT, bool HasVSX) {
switch (VecVT) {
case MVT::v16i8:
return PPC::VCMPEQUB;
@@ -701,13 +723,14 @@ static unsigned int getVCmpEQInst(MVT::SimpleValueType VecVT) {
case MVT::v4i32:
return PPC::VCMPEQUW;
case MVT::v4f32:
- return PPC::VCMPEQFP;
+ return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
+ case MVT::v2f64:
+ return PPC::XVCMPEQDP;
default:
llvm_unreachable("Invalid integer vector compare condition");
}
}
-
SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
SDLoc dl(N);
unsigned Imm;
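
The getVCmpInst/getVCmpEQInst changes above thread a HasVSX flag through the opcode tables so single-precision compares move to their XVCMP*SP forms and v2f64 gains the XVCMP*DP entries. A standalone sketch of the equal-compare table, assuming an illustrative enum and opcode strings rather than the LLVM definitions:

    // Equal-compare opcode table with the VSX extension sketched above.
    #include <cstdio>
    #include <string>

    enum class VecTy { v16i8, v8i16, v4i32, v4f32, v2f64 };

    std::string vcmpEqOpcode(VecTy VT, bool HasVSX) {
      switch (VT) {
      case VecTy::v16i8: return "VCMPEQUB";
      case VecTy::v8i16: return "VCMPEQUH";
      case VecTy::v4i32: return "VCMPEQUW";
      case VecTy::v4f32: return HasVSX ? "XVCMPEQSP" : "VCMPEQFP";
      case VecTy::v2f64: return "XVCMPEQDP";   // only reachable with VSX
      }
      return "<invalid>";
    }

    int main() {
      std::printf("%s\n", vcmpEqOpcode(VecTy::v4f32, /*HasVSX=*/false).c_str()); // VCMPEQFP
      std::printf("%s\n", vcmpEqOpcode(VecTy::v4f32, /*HasVSX=*/true).c_str());  // XVCMPEQSP
      std::printf("%s\n", vcmpEqOpcode(VecTy::v2f64, /*HasVSX=*/true).c_str());  // XVCMPEQDP
      return 0;
    }
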
@@ -715,7 +738,8 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
bool isPPC64 = (PtrVT == MVT::i64);
- if (isInt32Immediate(N->getOperand(1), Imm)) {
+ if (!PPCSubTarget.useCRBits() &&
+ isInt32Immediate(N->getOperand(1), Imm)) {
// We can codegen setcc op, imm very efficiently compared to a brcond.
// Check for those cases here.
// setcc op, 0
@@ -796,7 +820,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
if (LHS.getValueType().isVector()) {
EVT VecVT = LHS.getValueType();
MVT::SimpleValueType VT = VecVT.getSimpleVT().SimpleTy;
- unsigned int VCmpInst = getVCmpInst(VT, CC);
+ unsigned int VCmpInst = getVCmpInst(VT, CC, PPCSubTarget.hasVSX());
switch (CC) {
case ISD::SETEQ:
@@ -807,7 +831,9 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
case ISD::SETONE:
case ISD::SETUNE: {
SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
- return CurDAG->SelectNodeTo(N, PPC::VNOR, VecVT, VCmp, VCmp);
+ return CurDAG->SelectNodeTo(N, PPCSubTarget.hasVSX() ? PPC::XXLNOR :
+ PPC::VNOR,
+ VecVT, VCmp, VCmp);
}
case ISD::SETLT:
case ISD::SETOLT:
@@ -827,24 +853,31 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
} else {
SDValue VCmpGT(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
- unsigned int VCmpEQInst = getVCmpEQInst(VT);
+ unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget.hasVSX());
SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0);
- return CurDAG->SelectNodeTo(N, PPC::VOR, VecVT, VCmpGT, VCmpEQ);
+ return CurDAG->SelectNodeTo(N, PPCSubTarget.hasVSX() ? PPC::XXLOR :
+ PPC::VOR,
+ VecVT, VCmpGT, VCmpEQ);
}
}
case ISD::SETLE:
case ISD::SETOLE:
case ISD::SETULE: {
SDValue VCmpLE(CurDAG->getMachineNode(VCmpInst, dl, VecVT, RHS, LHS), 0);
- unsigned int VCmpEQInst = getVCmpEQInst(VT);
+ unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget.hasVSX());
SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0);
- return CurDAG->SelectNodeTo(N, PPC::VOR, VecVT, VCmpLE, VCmpEQ);
+ return CurDAG->SelectNodeTo(N, PPCSubTarget.hasVSX() ? PPC::XXLOR :
+ PPC::VOR,
+ VecVT, VCmpLE, VCmpEQ);
}
default:
llvm_unreachable("Invalid vector compare type: should be expanded by legalize");
}
}
+ if (PPCSubTarget.useCRBits())
+ return 0;
+
bool Inv;
unsigned Idx = getCRIdxForSetCC(CC, Inv);
SDValue CCReg = SelectCC(LHS, RHS, CC, dl);
@@ -959,8 +992,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
break;
}
- case ISD::SETCC:
- return SelectSETCC(N);
+ case ISD::SETCC: {
+ SDNode *SN = SelectSETCC(N);
+ if (SN)
+ return SN;
+ break;
+ }
case PPCISD::GlobalBaseReg:
return getGlobalBaseReg();
@@ -1122,7 +1159,21 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
isMask_64(Imm64)) {
SDValue Val = N->getOperand(0);
MB = 64 - CountTrailingOnes_64(Imm64);
- SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB) };
+ SH = 0;
+
+ // If the operand is a logical right shift, we can fold it into this
+ // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb)
+ // for n <= mb. The right shift is really a left rotate followed by a
+ // mask, and this mask is a more-restrictive sub-mask of the mask implied
+ // by the shift.
+ if (Val.getOpcode() == ISD::SRL &&
+ isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) {
+ assert(Imm < 64 && "Illegal shift amount");
+ Val = Val.getOperand(0);
+ SH = 64 - Imm;
+ }
+
+ SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB) };
return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops, 3);
}
// AND X, 0 -> 0, not "rlwinm 32".
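
The new SH computation above relies on the identity rldicl(rldicl(x, 64-n, n), 0, mb) = rldicl(x, 64-n, mb) for n <= mb: a logical right shift feeding a clear-left mask collapses into one rotate-and-mask. A self-contained numeric check of that identity, with rldicl re-implemented from its rotate-then-clear-high-MB-bits semantics (plain C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    static uint64_t rotl64(uint64_t X, unsigned R) {
      R &= 63;
      return R ? (X << R) | (X >> (64 - R)) : X;
    }

    // rldicl RA, RS, SH, MB: rotate left by SH, then clear the MB highest bits.
    static uint64_t rldicl(uint64_t X, unsigned SH, unsigned MB) {
      uint64_t Mask = ~0ULL >> MB;   // MB in [0, 63]
      return rotl64(X, SH) & Mask;
    }

    int main() {
      uint64_t X = 0x123456789ABCDEF0ULL;
      for (unsigned N = 1; N < 64; ++N)
        for (unsigned MB = N; MB < 64; ++MB) {
          uint64_t Shifted = X >> N;                  // ISD::SRL by N
          uint64_t TwoStep = rldicl(Shifted, 0, MB);  // original two-instruction form
          uint64_t OneStep = rldicl(X, 64 - N, MB);   // folded form, SH = 64 - N
          assert(TwoStep == OneStep);
        }
      return 0;
    }
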
@@ -1179,11 +1230,39 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
// Other cases are autogenerated.
break;
}
+ // FIXME: Remove this once the ANDI glue bug is fixed:
+ case PPCISD::ANDIo_1_EQ_BIT:
+ case PPCISD::ANDIo_1_GT_BIT: {
+ if (!ANDIGlueBug)
+ break;
+
+ EVT InVT = N->getOperand(0).getValueType();
+ assert((InVT == MVT::i64 || InVT == MVT::i32) &&
+ "Invalid input type for ANDIo_1_EQ_BIT");
+
+ unsigned Opcode = (InVT == MVT::i64) ? PPC::ANDIo8 : PPC::ANDIo;
+ SDValue AndI(CurDAG->getMachineNode(Opcode, dl, InVT, MVT::Glue,
+ N->getOperand(0),
+ CurDAG->getTargetConstant(1, InVT)), 0);
+ SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32);
+ SDValue SRIdxVal =
+ CurDAG->getTargetConstant(N->getOpcode() == PPCISD::ANDIo_1_EQ_BIT ?
+ PPC::sub_eq : PPC::sub_gt, MVT::i32);
+
+ return CurDAG->SelectNodeTo(N, TargetOpcode::EXTRACT_SUBREG, MVT::i1,
+ CR0Reg, SRIdxVal,
+ SDValue(AndI.getNode(), 1) /* glue */);
+ }
case ISD::SELECT_CC: {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
bool isPPC64 = (PtrVT == MVT::i64);
+ // If this is a select of i1 operands, we'll pattern match it.
+ if (PPCSubTarget.useCRBits() &&
+ N->getOperand(0).getValueType() == MVT::i1)
+ break;
+
// Handle the setcc cases here. select_cc lhs, 0, 1, 0, cc
if (!isPPC64)
if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
@@ -1202,6 +1281,36 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
}
SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
+
+ if (N->getValueType(0) == MVT::i1) {
+ // An i1 select is: (c & t) | (!c & f).
+ bool Inv;
+ unsigned Idx = getCRIdxForSetCC(CC, Inv);
+
+ unsigned SRI;
+ switch (Idx) {
+ default: llvm_unreachable("Invalid CC index");
+ case 0: SRI = PPC::sub_lt; break;
+ case 1: SRI = PPC::sub_gt; break;
+ case 2: SRI = PPC::sub_eq; break;
+ case 3: SRI = PPC::sub_un; break;
+ }
+
+ SDValue CCBit = CurDAG->getTargetExtractSubreg(SRI, dl, MVT::i1, CCReg);
+
+ SDValue NotCCBit(CurDAG->getMachineNode(PPC::CRNOR, dl, MVT::i1,
+ CCBit, CCBit), 0);
+ SDValue C = Inv ? NotCCBit : CCBit,
+ NotC = Inv ? CCBit : NotCCBit;
+
+ SDValue CAndT(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
+ C, N->getOperand(2)), 0);
+ SDValue NotCAndF(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
+ NotC, N->getOperand(3)), 0);
+
+ return CurDAG->SelectNodeTo(N, PPC::CROR, MVT::i1, CAndT, NotCAndF);
+ }
+
unsigned BROpc = getPredicateForSetCC(CC);
unsigned SelectCCOp;
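
The i1 SELECT_CC expansion above is the usual mux identity select(c, t, f) = (c & t) | (~c & f), realized with CRAND/CRNOR/CROR and an optional inversion of the condition bit. A one-bit truth-table check of the identity:

    #include <cassert>

    int main() {
      for (int c = 0; c <= 1; ++c)
        for (int t = 0; t <= 1; ++t)
          for (int f = 0; f <= 1; ++f) {
            int Select = c ? t : f;
            int Expanded = (c & t) | ((c ^ 1) & f);  // CRAND, CRNOR(c,c), CRAND, CROR
            assert(Select == Expanded);
          }
      return 0;
    }
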
@@ -1220,6 +1329,50 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
getI32Imm(BROpc) };
return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops, 4);
}
+ case ISD::VSELECT:
+ if (PPCSubTarget.hasVSX()) {
+ SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) };
+ return CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops, 3);
+ }
+
+ break;
+ case ISD::VECTOR_SHUFFLE:
+ if (PPCSubTarget.hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
+ N->getValueType(0) == MVT::v2i64)) {
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+
+ SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1),
+ Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1);
+ unsigned DM[2];
+
+ for (int i = 0; i < 2; ++i)
+ if (SVN->getMaskElt(i) <= 0 || SVN->getMaskElt(i) == 2)
+ DM[i] = 0;
+ else
+ DM[i] = 1;
+
+ SDValue DMV = CurDAG->getTargetConstant(DM[1] | (DM[0] << 1), MVT::i32);
+
+ if (Op1 == Op2 && DM[0] == 0 && DM[1] == 0 &&
+ Op1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ isa<LoadSDNode>(Op1.getOperand(0))) {
+ LoadSDNode *LD = cast<LoadSDNode>(Op1.getOperand(0));
+ SDValue Base, Offset;
+
+ if (LD->isUnindexed() &&
+ SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) {
+ SDValue Chain = LD->getChain();
+ SDValue Ops[] = { Base, Offset, Chain };
+ return CurDAG->SelectNodeTo(N, PPC::LXVDSX,
+ N->getValueType(0), Ops, 3);
+ }
+ }
+
+ SDValue Ops[] = { Op1, Op2, DMV };
+ return CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops, 3);
+ }
+
+ break;
case PPCISD::BDNZ:
case PPCISD::BDZ: {
bool IsPPC64 = PPCSubTarget.isPPC64();
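
For the two-element VSX shuffles above, each mask element chooses a source operand (element < 2 picks operand 0) and a doubleword within it, and the two doubleword selectors are packed into the XXPERMDI immediate as DM[1] | (DM[0] << 1). A small sketch of just the immediate computation, mirroring the mask handling in the hunk (negative, i.e. undef, elements fall into the doubleword-0 case):

    #include <cassert>

    unsigned permdiImmediate(int MaskElt0, int MaskElt1) {
      // Undef (< 0) and elements 0/2 pick doubleword 0 of the chosen source;
      // elements 1/3 pick doubleword 1.
      unsigned DM0 = (MaskElt0 <= 0 || MaskElt0 == 2) ? 0 : 1;
      unsigned DM1 = (MaskElt1 <= 0 || MaskElt1 == 2) ? 0 : 1;
      return DM1 | (DM0 << 1);
    }

    int main() {
      assert(permdiImmediate(0, 1) == 1);   // mask {0,1}: dword 0 of Op1, dword 1 of Op2
      assert(permdiImmediate(2, 3) == 1);   // mask {2,3}: same selectors, different sources
      assert(permdiImmediate(1, 2) == 2);   // mask {1,2}: dword 1 of Op1, dword 0 of Op2
      assert(permdiImmediate(-1, 3) == 1);  // undef element treated as doubleword 0
      return 0;
    }
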
@@ -1244,8 +1397,30 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
}
case ISD::BR_CC: {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ unsigned PCC = getPredicateForSetCC(CC);
+
+ if (N->getOperand(2).getValueType() == MVT::i1) {
+ unsigned Opc;
+ bool Swap;
+ switch (PCC) {
+ default: llvm_unreachable("Unexpected Boolean-operand predicate");
+ case PPC::PRED_LT: Opc = PPC::CRANDC; Swap = true; break;
+ case PPC::PRED_LE: Opc = PPC::CRORC; Swap = true; break;
+ case PPC::PRED_EQ: Opc = PPC::CREQV; Swap = false; break;
+ case PPC::PRED_GE: Opc = PPC::CRORC; Swap = false; break;
+ case PPC::PRED_GT: Opc = PPC::CRANDC; Swap = false; break;
+ case PPC::PRED_NE: Opc = PPC::CRXOR; Swap = false; break;
+ }
+
+ SDValue BitComp(CurDAG->getMachineNode(Opc, dl, MVT::i1,
+ N->getOperand(Swap ? 3 : 2),
+ N->getOperand(Swap ? 2 : 3)), 0);
+ return CurDAG->SelectNodeTo(N, PPC::BC, MVT::Other,
+ BitComp, N->getOperand(4), N->getOperand(0));
+ }
+
SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl);
- SDValue Ops[] = { getI32Imm(getPredicateForSetCC(CC)), CondCode,
+ SDValue Ops[] = { getI32Imm(PCC), CondCode,
N->getOperand(4), N->getOperand(0) };
return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 4);
}
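
The BR_CC table above collapses a comparison of two i1 operands into a single CR logical operation feeding BC, with some predicates swapping the operands. Reading the i1 operands as unsigned 0/1 values, which is the convention the mapping appears to assume, the identities behind the table check out on a one-bit truth table:

    // "Swapped" means the CR op is applied to (b, a) instead of (a, b).
    #include <cassert>

    int main() {
      for (int a = 0; a <= 1; ++a)
        for (int b = 0; b <= 1; ++b) {
          assert((a <  b) == ((b & ~a) & 1));     // PRED_LT: CRANDC, swapped
          assert((a <= b) == ((b | ~a) & 1));     // PRED_LE: CRORC, swapped
          assert((a == b) == ((~(a ^ b)) & 1));   // PRED_EQ: CREQV
          assert((a >= b) == ((a | ~b) & 1));     // PRED_GE: CRORC
          assert((a >  b) == ((a & ~b) & 1));     // PRED_GT: CRANDC
          assert((a != b) == ((a ^ b) & 1));      // PRED_NE: CRXOR
        }
      return 0;
    }
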
@@ -1288,8 +1463,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
const GlobalValue *GValue = G->getGlobal();
const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
- const GlobalValue *RealGValue = GAlias ?
- GAlias->resolveAliasedGlobal(false) : GValue;
+ const GlobalValue *RealGValue =
+ GAlias ? GAlias->getAliasedGlobal() : GValue;
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
assert((GVar || isa<Function>(RealGValue)) &&
"Unexpected global value subclass!");
@@ -1382,7 +1557,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
return SelectCode(N);
}
-/// PostProcessISelDAG - Perform some late peephole optimizations
+/// PostprocessISelDAG - Perform some late peephole optimizations
/// on the DAG representation.
void PPCDAGToDAGISel::PostprocessISelDAG() {
@@ -1390,6 +1565,478 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
if (TM.getOptLevel() == CodeGenOpt::None)
return;
+ PeepholePPC64();
+ PeepholdCROps();
+}
+
+// Check if all users of this node will become isel where the second operand
+// is the constant zero. If this is so, and if we can negate the condition,
+// then we can flip the true and false operands. This will allow the zero to
+// be folded with the isel so that we don't need to materialize a register
+// containing zero.
+bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
+ // If we're not using isel, then this does not matter.
+ if (!PPCSubTarget.hasISEL())
+ return false;
+
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (!User->isMachineOpcode())
+ return false;
+ if (User->getMachineOpcode() != PPC::SELECT_I4 &&
+ User->getMachineOpcode() != PPC::SELECT_I8)
+ return false;
+
+ SDNode *Op2 = User->getOperand(2).getNode();
+ if (!Op2->isMachineOpcode())
+ return false;
+
+ if (Op2->getMachineOpcode() != PPC::LI &&
+ Op2->getMachineOpcode() != PPC::LI8)
+ return false;
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2->getOperand(0));
+ if (!C)
+ return false;
+
+ if (!C->isNullValue())
+ return false;
+ }
+
+ return true;
+}
+
+void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) {
+ SmallVector<SDNode *, 4> ToReplace;
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ assert((User->getMachineOpcode() == PPC::SELECT_I4 ||
+ User->getMachineOpcode() == PPC::SELECT_I8) &&
+ "Must have all select users");
+ ToReplace.push_back(User);
+ }
+
+ for (SmallVector<SDNode *, 4>::iterator UI = ToReplace.begin(),
+ UE = ToReplace.end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ SDNode *ResNode =
+ CurDAG->getMachineNode(User->getMachineOpcode(), SDLoc(User),
+ User->getValueType(0), User->getOperand(0),
+ User->getOperand(2),
+ User->getOperand(1));
+
+ DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
+ DEBUG(User->dump(CurDAG));
+ DEBUG(dbgs() << "\nNew: ");
+ DEBUG(ResNode->dump(CurDAG));
+ DEBUG(dbgs() << "\n");
+
+ ReplaceUses(User, ResNode);
+ }
+}
+
+void PPCDAGToDAGISel::PeepholdCROps() {
+ bool IsModified;
+ do {
+ IsModified = false;
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ++I) {
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+ if (!MachineNode || MachineNode->use_empty())
+ continue;
+ SDNode *ResNode = MachineNode;
+
+ bool Op1Set = false, Op1Unset = false,
+ Op1Not = false,
+ Op2Set = false, Op2Unset = false,
+ Op2Not = false;
+
+ unsigned Opcode = MachineNode->getMachineOpcode();
+ switch (Opcode) {
+ default: break;
+ case PPC::CRAND:
+ case PPC::CRNAND:
+ case PPC::CROR:
+ case PPC::CRXOR:
+ case PPC::CRNOR:
+ case PPC::CREQV:
+ case PPC::CRANDC:
+ case PPC::CRORC: {
+ SDValue Op = MachineNode->getOperand(1);
+ if (Op.isMachineOpcode()) {
+ if (Op.getMachineOpcode() == PPC::CRSET)
+ Op2Set = true;
+ else if (Op.getMachineOpcode() == PPC::CRUNSET)
+ Op2Unset = true;
+ else if (Op.getMachineOpcode() == PPC::CRNOR &&
+ Op.getOperand(0) == Op.getOperand(1))
+ Op2Not = true;
+ }
+ } // fallthrough
+ case PPC::BC:
+ case PPC::BCn:
+ case PPC::SELECT_I4:
+ case PPC::SELECT_I8:
+ case PPC::SELECT_F4:
+ case PPC::SELECT_F8:
+ case PPC::SELECT_VRRC: {
+ SDValue Op = MachineNode->getOperand(0);
+ if (Op.isMachineOpcode()) {
+ if (Op.getMachineOpcode() == PPC::CRSET)
+ Op1Set = true;
+ else if (Op.getMachineOpcode() == PPC::CRUNSET)
+ Op1Unset = true;
+ else if (Op.getMachineOpcode() == PPC::CRNOR &&
+ Op.getOperand(0) == Op.getOperand(1))
+ Op1Not = true;
+ }
+ }
+ break;
+ }
+
+ bool SelectSwap = false;
+ switch (Opcode) {
+ default: break;
+ case PPC::CRAND:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // x & x = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Set)
+ // 1 & y = y
+ ResNode = MachineNode->getOperand(1).getNode();
+ else if (Op2Set)
+ // x & 1 = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Unset || Op2Unset)
+ // x & 0 = 0 & y = 0
+ ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Not)
+ // ~x & y = andc(y, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(0).
+ getOperand(0));
+ else if (Op2Not)
+ // x & ~y = andc(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1).
+ getOperand(0));
+ else if (AllUsersSelectZero(MachineNode))
+ ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1)),
+ SelectSwap = true;
+ break;
+ case PPC::CRNAND:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // nand(x, x) -> nor(x, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(0));
+ else if (Op1Set)
+ // nand(1, y) -> nor(y, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(1));
+ else if (Op2Set)
+ // nand(x, 1) -> nor(x, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(0));
+ else if (Op1Unset || Op2Unset)
+ // nand(x, 0) = nand(0, y) = 1
+ ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Not)
+ // nand(~x, y) = ~(~x & y) = x | ~y = orc(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1));
+ else if (Op2Not)
+ // nand(x, ~y) = ~x | y = orc(y, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1).
+ getOperand(0),
+ MachineNode->getOperand(0));
+ else if (AllUsersSelectZero(MachineNode))
+ ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1)),
+ SelectSwap = true;
+ break;
+ case PPC::CROR:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // x | x = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Set || Op2Set)
+ // x | 1 = 1 | y = 1
+ ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Unset)
+ // 0 | y = y
+ ResNode = MachineNode->getOperand(1).getNode();
+ else if (Op2Unset)
+ // x | 0 = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Not)
+ // ~x | y = orc(y, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(0).
+ getOperand(0));
+ else if (Op2Not)
+ // x | ~y = orc(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1).
+ getOperand(0));
+ else if (AllUsersSelectZero(MachineNode))
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1)),
+ SelectSwap = true;
+ break;
+ case PPC::CRXOR:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // xor(x, x) = 0
+ ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Set)
+ // xor(1, y) -> nor(y, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(1));
+ else if (Op2Set)
+ // xor(x, 1) -> nor(x, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(0));
+ else if (Op1Unset)
+ // xor(0, y) = y
+ ResNode = MachineNode->getOperand(1).getNode();
+ else if (Op2Unset)
+ // xor(x, 0) = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Not)
+ // xor(~x, y) = eqv(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1));
+ else if (Op2Not)
+ // xor(x, ~y) = eqv(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1).
+ getOperand(0));
+ else if (AllUsersSelectZero(MachineNode))
+ ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1)),
+ SelectSwap = true;
+ break;
+ case PPC::CRNOR:
+ if (Op1Set || Op2Set)
+ // nor(1, y) -> 0
+ ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Unset)
+ // nor(0, y) = ~y -> nor(y, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(1));
+ else if (Op2Unset)
+ // nor(x, 0) = ~x
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(0));
+ else if (Op1Not)
+ // nor(~x, y) = andc(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1));
+ else if (Op2Not)
+ // nor(x, ~y) = andc(y, x)
+ ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1).
+ getOperand(0),
+ MachineNode->getOperand(0));
+ else if (AllUsersSelectZero(MachineNode))
+ ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1)),
+ SelectSwap = true;
+ break;
+ case PPC::CREQV:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // eqv(x, x) = 1
+ ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Set)
+ // eqv(1, y) = y
+ ResNode = MachineNode->getOperand(1).getNode();
+ else if (Op2Set)
+ // eqv(x, 1) = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Unset)
+ // eqv(0, y) = ~y -> nor(y, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(1));
+ else if (Op2Unset)
+ // eqv(x, 0) = ~x
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(0));
+ else if (Op1Not)
+ // eqv(~x, y) = xor(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1));
+ else if (Op2Not)
+ // eqv(x, ~y) = xor(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1).
+ getOperand(0));
+ else if (AllUsersSelectZero(MachineNode))
+ ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1)),
+ SelectSwap = true;
+ break;
+ case PPC::CRANDC:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // andc(x, x) = 0
+ ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Set)
+ // andc(1, y) = ~y
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(1));
+ else if (Op1Unset || Op2Set)
+ // andc(0, y) = andc(x, 1) = 0
+ ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op2Unset)
+ // andc(x, 0) = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Not)
+ // andc(~x, y) = ~(x | y) = nor(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1));
+ else if (Op2Not)
+ // andc(x, ~y) = x & y
+ ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1).
+ getOperand(0));
+ else if (AllUsersSelectZero(MachineNode))
+ ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(0)),
+ SelectSwap = true;
+ break;
+ case PPC::CRORC:
+ if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+ // orc(x, x) = 1
+ ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op1Set || Op2Unset)
+ // orc(1, y) = orc(x, 0) = 1
+ ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+ MVT::i1);
+ else if (Op2Set)
+ // orc(x, 1) = x
+ ResNode = MachineNode->getOperand(0).getNode();
+ else if (Op1Unset)
+ // orc(0, y) = ~y
+ ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(1));
+ else if (Op1Not)
+ // orc(~x, y) = ~(x & y) = nand(x, y)
+ ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1));
+ else if (Op2Not)
+ // orc(x, ~y) = x | y
+ ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(0),
+ MachineNode->getOperand(1).
+ getOperand(0));
+ else if (AllUsersSelectZero(MachineNode))
+ ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+ MVT::i1, MachineNode->getOperand(1),
+ MachineNode->getOperand(0)),
+ SelectSwap = true;
+ break;
+ case PPC::SELECT_I4:
+ case PPC::SELECT_I8:
+ case PPC::SELECT_F4:
+ case PPC::SELECT_F8:
+ case PPC::SELECT_VRRC:
+ if (Op1Set)
+ ResNode = MachineNode->getOperand(1).getNode();
+ else if (Op1Unset)
+ ResNode = MachineNode->getOperand(2).getNode();
+ else if (Op1Not)
+ ResNode = CurDAG->getMachineNode(MachineNode->getMachineOpcode(),
+ SDLoc(MachineNode),
+ MachineNode->getValueType(0),
+ MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(2),
+ MachineNode->getOperand(1));
+ break;
+ case PPC::BC:
+ case PPC::BCn:
+ if (Op1Not)
+ ResNode = CurDAG->getMachineNode(Opcode == PPC::BC ? PPC::BCn :
+ PPC::BC,
+ SDLoc(MachineNode),
+ MVT::Other,
+ MachineNode->getOperand(0).
+ getOperand(0),
+ MachineNode->getOperand(1),
+ MachineNode->getOperand(2));
+ // FIXME: Handle Op1Set, Op1Unset here too.
+ break;
+ }
+
+ // If we're inverting this node because it is used only by selects that
+ // we'd like to swap, then swap the selects before the node replacement.
+ if (SelectSwap)
+ SwapAllSelectUsers(MachineNode);
+
+ if (ResNode != MachineNode) {
+ DEBUG(dbgs() << "CR Peephole replacing:\nOld: ");
+ DEBUG(MachineNode->dump(CurDAG));
+ DEBUG(dbgs() << "\nNew: ");
+ DEBUG(ResNode->dump(CurDAG));
+ DEBUG(dbgs() << "\n");
+
+ ReplaceUses(MachineNode, ResNode);
+ IsModified = true;
+ }
+ }
+ if (IsModified)
+ CurDAG->RemoveDeadNodes();
+ } while (IsModified);
+}
+
+void PPCDAGToDAGISel::PeepholePPC64() {
// These optimizations are currently supported only for 64-bit SVR4.
if (PPCSubTarget.isDarwin() || !PPCSubTarget.isPPC64())
return;
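
Most of the PeepholdCROps rewrites above are plain Boolean identities over single CR bits, e.g. "~x & y = andc(y, x)" or "orc(~x, y) = nand(x, y)"; the AllUsersSelectZero/SwapAllSelectUsers pair additionally inverts a node and swaps the true/false operands of its isel users so the constant zero folds into the isel. A spot-check of the identities used by the switch, evaluated over one-bit values:

    #include <cassert>

    int main() {
      auto AND  = [](int x, int y) { return x & y; };
      auto NAND = [](int x, int y) { return (~(x & y)) & 1; };
      auto OR   = [](int x, int y) { return x | y; };
      auto NOR  = [](int x, int y) { return (~(x | y)) & 1; };
      auto XOR  = [](int x, int y) { return x ^ y; };
      auto EQV  = [](int x, int y) { return (~(x ^ y)) & 1; };
      auto ANDC = [](int x, int y) { return x & (~y & 1); };
      auto ORC  = [](int x, int y) { return (x | ~y) & 1; };
      auto NOT  = [](int x)        { return (~x) & 1; };

      for (int x = 0; x <= 1; ++x)
        for (int y = 0; y <= 1; ++y) {
          assert(AND (NOT(x), y) == ANDC(y, x));   // ~x & y      -> andc(y, x)
          assert(NAND(x, x)      == NOR (x, x));   // nand(x, x)  -> nor(x, x)
          assert(NAND(NOT(x), y) == ORC (x, y));   // nand(~x, y) -> orc(x, y)
          assert(OR  (NOT(x), y) == ORC (y, x));   // ~x | y      -> orc(y, x)
          assert(XOR (NOT(x), y) == EQV (x, y));   // xor(~x, y)  -> eqv(x, y)
          assert(NOR (NOT(x), y) == ANDC(x, y));   // nor(~x, y)  -> andc(x, y)
          assert(EQV (NOT(x), y) == XOR (x, y));   // eqv(~x, y)  -> xor(x, y)
          assert(ANDC(NOT(x), y) == NOR (x, y));   // andc(~x, y) -> nor(x, y)
          assert(ORC (NOT(x), y) == NAND(x, y));   // orc(~x, y)  -> nand(x, y)
        }
      return 0;
    }
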
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 8da5f05..32ac1dc 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -46,6 +46,9 @@ cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hi
static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
+// FIXME: Remove this once the bug has been fixed!
+extern cl::opt<bool> ANDIGlueBug;
+
static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
if (TM.getSubtargetImpl()->isDarwin())
return new TargetLoweringObjectFileMachO();
@@ -94,6 +97,39 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
+ if (Subtarget->useCRBits()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ if (isPPC64 || Subtarget->hasFPCVT()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
+ AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
+ isPPC64 ? MVT::i64 : MVT::i32);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
+ AddPromotedToType (ISD::UINT_TO_FP, MVT::i1,
+ isPPC64 ? MVT::i64 : MVT::i32);
+ } else {
+ setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
+ }
+
+ // PowerPC does not support direct load / store of condition registers
+ setOperationAction(ISD::LOAD, MVT::i1, Custom);
+ setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+ // FIXME: Remove this once the ANDI glue bug is fixed:
+ if (ANDIGlueBug)
+ setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
+
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+ setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i16, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i8, MVT::i1, Expand);
+
+ addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
+ }
+
// This is used in the ppcf128->int sequence. Note it has different semantics
// from FP_ROUND: that rounds to nearest, this rounds to zero.
setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);
@@ -191,21 +227,25 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::ROTR, MVT::i32 , Expand);
setOperationAction(ISD::ROTR, MVT::i64 , Expand);
- // PowerPC does not have Select
- setOperationAction(ISD::SELECT, MVT::i32, Expand);
- setOperationAction(ISD::SELECT, MVT::i64, Expand);
- setOperationAction(ISD::SELECT, MVT::f32, Expand);
- setOperationAction(ISD::SELECT, MVT::f64, Expand);
+ if (!Subtarget->useCRBits()) {
+ // PowerPC does not have Select
+ setOperationAction(ISD::SELECT, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::i64, Expand);
+ setOperationAction(ISD::SELECT, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f64, Expand);
+ }
// PowerPC wants to turn select_cc of FP into fsel when possible.
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
// PowerPC wants to optimize integer setcc a bit
- setOperationAction(ISD::SETCC, MVT::i32, Custom);
+ if (!Subtarget->useCRBits())
+ setOperationAction(ISD::SETCC, MVT::i32, Custom);
// PowerPC does not have BRCOND which requires SetCC
- setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ if (!Subtarget->useCRBits())
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
@@ -445,7 +485,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::OR , MVT::v4i32, Legal);
setOperationAction(ISD::XOR , MVT::v4i32, Legal);
setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
- setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4i32,
+ Subtarget->useCRBits() ? Legal : Expand);
setOperationAction(ISD::STORE , MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
@@ -464,7 +505,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::MUL, MVT::v4f32, Legal);
setOperationAction(ISD::FMA, MVT::v4f32, Legal);
- if (TM.Options.UnsafeFPMath) {
+ if (TM.Options.UnsafeFPMath || Subtarget->hasVSX()) {
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
}
@@ -491,6 +532,83 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
+
+ if (Subtarget->hasVSX()) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8i16, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
+
+ // Share the Altivec comparison restrictions.
+ setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::v2f64, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::v2f64, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::v2f64, Expand);
+ setCondCodeAction(ISD::SETULE, MVT::v2f64, Expand);
+
+ setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);
+
+ setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
+ setOperationAction(ISD::STORE, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);
+
+ addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);
+
+ addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
+ addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);
+
+ // VSX v2i64 only supports non-arithmetic operations.
+ setOperationAction(ISD::ADD, MVT::v2i64, Expand);
+ setOperationAction(ISD::SUB, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::SHL, MVT::v2i64, Expand);
+ setOperationAction(ISD::SRA, MVT::v2i64, Expand);
+ setOperationAction(ISD::SRL, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+
+ setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
+ setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+ AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+
+ // Vector operation legalization checks the result type of
+ // SIGN_EXTEND_INREG, overall legalization checks the inner type.
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+
+ addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
+ }
}
if (Subtarget->has64BitSupport()) {
@@ -522,9 +640,21 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::BR_CC);
+ if (Subtarget->useCRBits())
+ setTargetDAGCombine(ISD::BRCOND);
setTargetDAGCombine(ISD::BSWAP);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+
+ if (Subtarget->useCRBits()) {
+ setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::SELECT_CC);
+ }
+
// Use reciprocal estimates.
if (TM.Options.UnsafeFPMath) {
setTargetDAGCombine(ISD::FDIV);
@@ -545,6 +675,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
}
+ // With 32 condition bits, we don't need to sink (and duplicate) compares
+ // aggressively in CodeGenPrep.
+ if (Subtarget->useCRBits())
+ setHasMultipleConditionRegisters();
+
setMinFunctionAlignment(2);
if (PPCSubTarget.isDarwin())
setPrefFunctionAlignment(4);
@@ -670,6 +805,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::ADDIS_TOC_HA: return "PPCISD::ADDIS_TOC_HA";
case PPCISD::LD_TOC_L: return "PPCISD::LD_TOC_L";
case PPCISD::ADDI_TOC_L: return "PPCISD::ADDI_TOC_L";
+ case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";
case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";
case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
@@ -688,7 +824,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
if (!VT.isVector())
- return MVT::i32;
+ return PPCSubTarget.useCRBits() ? MVT::i1 : MVT::i32;
return VT.changeVectorElementTypeToInteger();
}
@@ -754,8 +890,8 @@ bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) {
///
static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
unsigned LHSStart, unsigned RHSStart) {
- assert(N->getValueType(0) == MVT::v16i8 &&
- "PPC only supports shuffles by bytes!");
+ if (N->getValueType(0) != MVT::v16i8)
+ return false;
assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
"Unsupported merge size!");
@@ -792,8 +928,8 @@ bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
/// amount, otherwise return -1.
int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) {
- assert(N->getValueType(0) == MVT::v16i8 &&
- "PPC only supports shuffles by bytes!");
+ if (N->getValueType(0) != MVT::v16i8)
+ return false;
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
@@ -1431,18 +1567,19 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
}
- if (!is64bit)
- llvm_unreachable("only local-exec is currently supported for ppc32");
-
if (Model == TLSModel::InitialExec) {
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
PPCII::MO_TLS);
- SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
- SDValue TPOffsetHi = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
- PtrVT, GOTReg, TGA);
+ SDValue GOTPtr;
+ if (is64bit) {
+ SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+ GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
+ PtrVT, GOTReg, TGA);
+ } else
+ GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
- PtrVT, TGA, TPOffsetHi);
+ PtrVT, TGA, GOTPtr);
return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
}
@@ -1534,6 +1671,27 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDLoc dl(Op);
+ if (Op.getValueType() == MVT::v2i64) {
+ // When the operands themselves are v2i64 values, we need to do something
+ // special because VSX has no underlying comparison operations for these.
+ if (Op.getOperand(0).getValueType() == MVT::v2i64) {
+ // Equality can be handled by casting to the legal type for Altivec
+ // comparisons, everything else needs to be expanded.
+ if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+ DAG.getSetCC(dl, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
+ CC));
+ }
+
+ return SDValue();
+ }
+
+ // We handle most of these in the usual way.
+ return Op;
+ }
+
// If we're comparing for equality to zero, expose the fact that this is
   // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
// fold the new nodes.
@@ -1922,7 +2080,7 @@ static const uint16_t *GetFPR() {
/// the stack.
static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
unsigned PtrByteSize) {
- unsigned ArgSize = ArgVT.getSizeInBits()/8;
+ unsigned ArgSize = ArgVT.getStoreSize();
if (Flags.isByVal())
ArgSize = Flags.getByValSize();
ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
@@ -2020,6 +2178,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
switch (ValVT.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("ValVT not supported by formal arguments Lowering");
+ case MVT::i1:
case MVT::i32:
RC = &PPC::GPRCRegClass;
break;
@@ -2027,7 +2186,10 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
RC = &PPC::F4RCRegClass;
break;
case MVT::f64:
- RC = &PPC::F8RCRegClass;
+ if (PPCSubTarget.hasVSX())
+ RC = &PPC::VSFRCRegClass;
+ else
+ RC = &PPC::F8RCRegClass;
break;
case MVT::v16i8:
case MVT::v8i16:
@@ -2035,18 +2197,26 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
case MVT::v4f32:
RC = &PPC::VRRCRegClass;
break;
+ case MVT::v2f64:
+ case MVT::v2i64:
+ RC = &PPC::VSHRCRegClass;
+ break;
}
// Transform the arguments stored in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
- SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, ValVT);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
+ ValVT == MVT::i1 ? MVT::i32 : ValVT);
+
+ if (ValVT == MVT::i1)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
InVals.push_back(ArgValue);
} else {
// Argument stored in memory.
assert(VA.isMemLoc());
- unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+ unsigned ArgSize = VA.getLocVT().getStoreSize();
int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(),
isImmutable);
@@ -2182,7 +2352,7 @@ PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
DAG.getValueType(ObjectVT));
- return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+ return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
}
// Set the size that is at least reserved in caller of this function. Tail
@@ -2246,6 +2416,10 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
+ static const uint16_t VSRH[] = {
+ PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
+ PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
+ };
const unsigned Num_GPR_Regs = array_lengthof(GPR);
const unsigned Num_FPR_Regs = 13;
@@ -2265,7 +2439,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
SDValue ArgVal;
bool needsLoad = false;
EVT ObjectVT = Ins[ArgNo].VT;
- unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+ unsigned ObjSize = ObjectVT.getStoreSize();
unsigned ArgSize = ObjSize;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
@@ -2275,7 +2449,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
// Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
- ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
+ ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8 ||
+ ObjectVT==MVT::v2f64 || ObjectVT==MVT::v2i64) {
if (isVarArg) {
MinReservedArea = ((MinReservedArea+15)/16)*16;
MinReservedArea += CalculateStackSlotSize(ObjectVT,
@@ -2333,7 +2508,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
EVT ObjType = (ObjSize == 1 ? MVT::i8 :
(ObjSize == 2 ? MVT::i16 : MVT::i32));
Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(FuncArg, CurArgOffset),
+ MachinePointerInfo(FuncArg),
ObjType, false, false, 0);
} else {
// For sizes that don't fit a truncating store (3, 5, 6, 7),
@@ -2345,7 +2520,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(FuncArg, ArgOffset),
+ MachinePointerInfo(FuncArg),
false, false, 0);
}
@@ -2369,7 +2544,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(FuncArg, ArgOffset),
+ MachinePointerInfo(FuncArg, j),
false, false, 0);
MemOps.push_back(Store);
++GPR_idx;
@@ -2384,13 +2559,14 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
switch (ObjectVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled argument type!");
+ case MVT::i1:
case MVT::i32:
case MVT::i64:
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
- if (ObjectVT == MVT::i32)
+ if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
@@ -2416,7 +2592,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
if (ObjectVT == MVT::f32)
VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
else
- VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
+ VReg = MF.addLiveIn(FPR[FPR_idx], PPCSubTarget.hasVSX() ?
+ &PPC::VSFRCRegClass :
+ &PPC::F8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++FPR_idx;
@@ -2431,10 +2609,14 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
+ case MVT::v2f64:
+ case MVT::v2i64:
// Note that vector arguments in registers don't reserve stack space,
// except in varargs functions.
if (VR_idx != Num_VR_Regs) {
- unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+ unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ?
+ MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) :
+ MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
if (isVarArg) {
while ((ArgOffset % 16) != 0) {
@@ -2581,6 +2763,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
switch(ObjectVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled argument type!");
+ case MVT::i1:
case MVT::i32:
case MVT::f32:
VecArgOffset += 4;
@@ -2665,8 +2848,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(FuncArg,
- CurArgOffset),
+ MachinePointerInfo(FuncArg),
ObjType, false, false, 0);
MemOps.push_back(Store);
++GPR_idx;
@@ -2690,7 +2872,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(FuncArg, ArgOffset),
+ MachinePointerInfo(FuncArg, j),
false, false, 0);
MemOps.push_back(Store);
++GPR_idx;
@@ -2705,11 +2887,16 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
switch (ObjectVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled argument type!");
+ case MVT::i1:
case MVT::i32:
if (!isPPC64) {
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+
+ if (ObjectVT == MVT::i1)
+ ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal);
+
++GPR_idx;
} else {
needsLoad = true;
@@ -2725,7 +2912,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
- if (ObjectVT == MVT::i32)
+ if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
@@ -2888,7 +3075,8 @@ CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
EVT ArgVT = Outs[i].VT;
// Varargs Altivec parameters are padded to a 16 byte boundary.
if (ArgVT==MVT::v4f32 || ArgVT==MVT::v4i32 ||
- ArgVT==MVT::v8i16 || ArgVT==MVT::v16i8) {
+ ArgVT==MVT::v8i16 || ArgVT==MVT::v16i8 ||
+ ArgVT==MVT::v2f64 || ArgVT==MVT::v2i64) {
if (!isVarArg && !isPPC64) {
// Non-varargs Altivec parameters go after all the non-Altivec
// parameters; handle those later so we know how much padding we need.
@@ -2968,7 +3156,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (Flags.isByVal()) return false;
}
- // Non PIC/GOT tail calls are supported.
+ // Non-PIC/GOT tail calls are supported.
if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
return true;
@@ -3706,6 +3894,9 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
}
if (VA.isRegLoc()) {
+ if (Arg.getValueType() == MVT::i1)
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg);
+
seenFloatArg |= VA.getLocVT().isFloatingPoint();
// Put argument in a physical register.
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
@@ -3863,6 +4054,11 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
+ static const uint16_t VSRH[] = {
+ PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
+ PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
+ };
+
const unsigned NumGPRs = array_lengthof(GPR);
const unsigned NumFPRs = 13;
const unsigned NumVRs = array_lengthof(VR);
@@ -3884,7 +4080,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
// Promote integers to 64-bit values.
- if (Arg.getValueType() == MVT::i32) {
+ if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
// FIXME: Should this use ANY_EXTEND if neither sext nor zext?
unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
@@ -4008,6 +4204,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
switch (Arg.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
+ case MVT::i1:
case MVT::i32:
case MVT::i64:
if (GPR_idx != NumGPRs) {
@@ -4068,6 +4265,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
+ case MVT::v2f64:
+ case MVT::v2i64:
if (isVarArg) {
// These go aligned on the stack, or in the corresponding R registers
// when within range. The Darwin PPC ABI doc claims they also go in
@@ -4091,7 +4290,13 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
MachinePointerInfo(),
false, false, false, 0);
MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+
+ unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 ||
+ Arg.getSimpleValueType() == MVT::v2i64) ?
+ VSRH[VR_idx] : VR[VR_idx];
+ ++VR_idx;
+
+ RegsToPass.push_back(std::make_pair(VReg, Load));
}
ArgOffset += 16;
for (unsigned i=0; i<16; i+=PtrByteSize) {
@@ -4111,7 +4316,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
// stack space allocated at the end.
if (VR_idx != NumVRs) {
// Doesn't have GPR space allocated.
- RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+ unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 ||
+ Arg.getSimpleValueType() == MVT::v2i64) ?
+ VSRH[VR_idx] : VR[VR_idx];
+ ++VR_idx;
+
+ RegsToPass.push_back(std::make_pair(VReg, Arg));
} else {
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, true, MemOpChains,
@@ -4339,9 +4549,13 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
switch (Arg.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
+ case MVT::i1:
case MVT::i32:
case MVT::i64:
if (GPR_idx != NumGPRs) {
+ if (Arg.getValueType() == MVT::i1)
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg);
+
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
} else {
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
@@ -4693,6 +4907,55 @@ SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
Op.getOperand(0), Op.getOperand(1));
}
+SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getValueType() == MVT::i1 &&
+ "Custom lowering only for i1 loads");
+
+ // First, load 8 bits into 32 bits, then truncate to 1 bit.
+
+ SDLoc dl(Op);
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ MachineMemOperand *MMO = LD->getMemOperand();
+
+ SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(), Chain,
+ BasePtr, MVT::i8, MMO);
+ SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD);
+
+ SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getOperand(1).getValueType() == MVT::i1 &&
+ "Custom lowering only for i1 stores");
+
+ // First, zero extend to 32 bits, then use a truncating store to 8 bits.
+
+ SDLoc dl(Op);
+ StoreSDNode *ST = cast<StoreSDNode>(Op);
+
+ SDValue Chain = ST->getChain();
+ SDValue BasePtr = ST->getBasePtr();
+ SDValue Value = ST->getValue();
+ MachineMemOperand *MMO = ST->getMemOperand();
+
+ Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(), Value);
+ return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO);
+}
+
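
The two lowerings above fix the in-memory form of an i1 as a full byte. A minimal scalar sketch of that contract, written as ordinary C++ rather than SelectionDAG calls (function names here are illustrative only):

    // Store: zero-extend the flag and write one byte; load: read one byte and
    // keep only the low bit, so a round trip through memory preserves the value.
    void storeI1(unsigned char *P, bool V) { *P = V ? 1 : 0; }
    bool loadI1(const unsigned char *P) { return (*P & 1) != 0; }
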
+// FIXME: Remove this once the ANDI glue bug is fixed:
+SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getValueType() == MVT::i1 &&
+ "Custom lowering only for i1 results");
+
+ SDLoc DL(Op);
+ return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1,
+ Op.getOperand(0));
+}
+
/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
/// possible.
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -4859,6 +5122,11 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
return SDValue();
+ if (Op.getOperand(0).getValueType() == MVT::i1)
+ return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0),
+ DAG.getConstantFP(1.0, Op.getValueType()),
+ DAG.getConstantFP(0.0, Op.getValueType()));
+
assert((Op.getOpcode() == ISD::SINT_TO_FP || PPCSubTarget.hasFPCVT()) &&
"UINT_TO_FP is supported only with FPCVT");
@@ -5686,6 +5954,30 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return Flags;
}
+SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+  // For v2i64 (VSX), we can pattern match the v2i32 case (using fp <-> int
+ // instructions), but for smaller types, we need to first extend up to v2i32
+  // before going further.
+ if (Op.getValueType() == MVT::v2i64) {
+ EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ if (ExtVT != MVT::v2i32) {
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0));
+ Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op,
+ DAG.getValueType(EVT::getVectorVT(*DAG.getContext(),
+ ExtVT.getVectorElementType(), 4)));
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op);
+ Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op,
+ DAG.getValueType(MVT::v2i32));
+ }
+
+ return Op;
+ }
+
+ return SDValue();
+}
+
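
The two-step widening above composes the same way ordinary scalar sign extension does: extending 8 (or 16) bits up to 32 and then 32 up to 64 is equivalent to a single 8-to-64 extension. A rough scalar model of the v2i8-in-v2i64 case (illustrative; assumes two's-complement narrowing):

    // Sign-extend the low 8 bits of a 64-bit lane in two steps: 8 -> 32, then
    // 32 -> 64; the composition equals one 8 -> 64 sign extension.
    long long signExtend8InLane(long long Lane) {
      int Low32 = (signed char)(Lane & 0xff);   // SIGN_EXTEND_INREG to i32
      return (long long)Low32;                  // then widen to i64
    }
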
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -5792,6 +6084,9 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
+ case ISD::LOAD: return LowerLOAD(Op, DAG);
+ case ISD::STORE: return LowerSTORE(Op, DAG);
+ case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG,
@@ -5810,6 +6105,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
// For counter-based loop handling.
@@ -5915,8 +6211,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
F->insert(It, loopMBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
@@ -5984,8 +6279,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
F->insert(It, loopMBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
@@ -6137,7 +6431,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// Note that the structure of the jmp_buf used here is not compatible
@@ -6357,9 +6651,15 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineFunction *F = BB->getParent();
if (PPCSubTarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 ||
- MI->getOpcode() == PPC::SELECT_CC_I8)) {
+ MI->getOpcode() == PPC::SELECT_CC_I8 ||
+ MI->getOpcode() == PPC::SELECT_I4 ||
+ MI->getOpcode() == PPC::SELECT_I8)) {
SmallVector<MachineOperand, 2> Cond;
- Cond.push_back(MI->getOperand(4));
+ if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+ MI->getOpcode() == PPC::SELECT_CC_I8)
+ Cond.push_back(MI->getOperand(4));
+ else
+ Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
Cond.push_back(MI->getOperand(1));
DebugLoc dl = MI->getDebugLoc();
@@ -6371,9 +6671,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MI->getOpcode() == PPC::SELECT_CC_I8 ||
MI->getOpcode() == PPC::SELECT_CC_F4 ||
MI->getOpcode() == PPC::SELECT_CC_F8 ||
- MI->getOpcode() == PPC::SELECT_CC_VRRC) {
-
-
+ MI->getOpcode() == PPC::SELECT_CC_VRRC ||
+ MI->getOpcode() == PPC::SELECT_I4 ||
+ MI->getOpcode() == PPC::SELECT_I8 ||
+ MI->getOpcode() == PPC::SELECT_F4 ||
+ MI->getOpcode() == PPC::SELECT_F8 ||
+ MI->getOpcode() == PPC::SELECT_VRRC) {
// The incoming instruction knows the destination vreg to set, the
// condition code register to branch on, the true/false values to
// select between, and a branch opcode to use.
@@ -6387,23 +6690,31 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *thisMBB = BB;
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
- unsigned SelectPred = MI->getOperand(4).getImm();
DebugLoc dl = MI->getDebugLoc();
F->insert(It, copy0MBB);
F->insert(It, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Next, add the true and fallthrough blocks as its successors.
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
- BuildMI(BB, dl, TII->get(PPC::BCC))
- .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+ if (MI->getOpcode() == PPC::SELECT_I4 ||
+ MI->getOpcode() == PPC::SELECT_I8 ||
+ MI->getOpcode() == PPC::SELECT_F4 ||
+ MI->getOpcode() == PPC::SELECT_F8 ||
+ MI->getOpcode() == PPC::SELECT_VRRC) {
+ BuildMI(BB, dl, TII->get(PPC::BC))
+ .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+ } else {
+ unsigned SelectPred = MI->getOperand(4).getImm();
+ BuildMI(BB, dl, TII->get(PPC::BCC))
+ .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+ }
// copy0MBB:
// %FalseValue = ...
@@ -6505,8 +6816,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
F->insert(It, midMBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
// thisMBB:
@@ -6576,8 +6886,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
F->insert(It, midMBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
MachineRegisterInfo &RegInfo = F->getRegInfo();
@@ -6726,6 +7035,27 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Restore FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)).addImm(1).addReg(MFFSReg);
+ } else if (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT ||
+ MI->getOpcode() == PPC::ANDIo_1_GT_BIT ||
+ MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
+ MI->getOpcode() == PPC::ANDIo_1_GT_BIT8) {
+ unsigned Opcode = (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
+ MI->getOpcode() == PPC::ANDIo_1_GT_BIT8) ?
+ PPC::ANDIo8 : PPC::ANDIo;
+ bool isEQ = (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT ||
+ MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8);
+
+ MachineRegisterInfo &RegInfo = F->getRegInfo();
+ unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ?
+ &PPC::GPRCRegClass :
+ &PPC::G8RCRegClass);
+
+ DebugLoc dl = MI->getDebugLoc();
+ BuildMI(*BB, MI, dl, TII->get(Opcode), Dest)
+ .addReg(MI->getOperand(1).getReg()).addImm(1);
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY),
+ MI->getOperand(0).getReg())
+ .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT);
} else {
llvm_unreachable("Unexpected instr type to insert");
}
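
The pseudo expansion above materializes an i1 by issuing "andi. dest, src, 1" and copying out a CR0 bit. In scalar terms the two bits are just predicates on the low bit of the input; a sketch of the semantics (not of the emitted machine code):

    // andi. compares (src & 1) with zero and records the result in CR0:
    // GT is (src & 1) != 0 (the truncation of src to i1), EQ is its negation.
    bool andiGtBit(long Src) { return (Src & 1) != 0; }
    bool andiEqBit(long Src) { return (Src & 1) == 0; }
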
@@ -6747,7 +7077,8 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op,
if ((VT == MVT::f32 && PPCSubTarget.hasFRES()) ||
(VT == MVT::f64 && PPCSubTarget.hasFRE()) ||
- (VT == MVT::v4f32 && PPCSubTarget.hasAltivec())) {
+ (VT == MVT::v4f32 && PPCSubTarget.hasAltivec()) ||
+ (VT == MVT::v2f64 && PPCSubTarget.hasVSX())) {
// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
// For the reciprocal, we need to find the zero of the function:
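
For the reciprocal, the function referred to above is F(X) = 1/X - A, and one Newton step simplifies to X*(2 - A*X). A scalar sketch of a single refinement (illustrative; the code itself builds the equivalent FMA-based DAG):

    // One Newton-Raphson refinement of an estimate x for 1/A:
    //   F(x) = 1/x - A, F'(x) = -1/x^2  =>  x' = x - F(x)/F'(x) = x*(2 - A*x)
    double refineRecip(double A, double X) { return X * (2.0 - A * X); }
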
@@ -6809,7 +7140,8 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
if ((VT == MVT::f32 && PPCSubTarget.hasFRSQRTES()) ||
(VT == MVT::f64 && PPCSubTarget.hasFRSQRTE()) ||
- (VT == MVT::v4f32 && PPCSubTarget.hasAltivec())) {
+ (VT == MVT::v4f32 && PPCSubTarget.hasAltivec()) ||
+ (VT == MVT::v2f64 && PPCSubTarget.hasVSX())) {
// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
// For the reciprocal sqrt, we need to find the zero of the function:
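
Here the function is F(X) = 1/X^2 - A, and the step simplifies to X*(1.5 - 0.5*A*X*X). A scalar sketch of one refinement (illustrative only):

    // One Newton-Raphson refinement of an estimate x for 1/sqrt(A):
    //   F(x) = 1/x^2 - A  =>  x' = x*(1.5 - 0.5*A*x*x)
    double refineRSqrt(double A, double X) { return X * (1.5 - 0.5 * A * X * X); }
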
@@ -6980,6 +7312,536 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
return false;
}
+SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+
+ assert(PPCSubTarget.useCRBits() &&
+ "Expecting to be tracking CR bits");
+ // If we're tracking CR bits, we need to be careful that we don't have:
+ // trunc(binary-ops(zext(x), zext(y)))
+ // or
+ // trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
+ // such that we're unnecessarily moving things into GPRs when it would be
+ // better to keep them in CR bits.
+
+ // Note that trunc here can be an actual i1 trunc, or can be the effective
+ // truncation that comes from a setcc or select_cc.
+ if (N->getOpcode() == ISD::TRUNCATE &&
+ N->getValueType(0) != MVT::i1)
+ return SDValue();
+
+ if (N->getOperand(0).getValueType() != MVT::i32 &&
+ N->getOperand(0).getValueType() != MVT::i64)
+ return SDValue();
+
+ if (N->getOpcode() == ISD::SETCC ||
+ N->getOpcode() == ISD::SELECT_CC) {
+ // If we're looking at a comparison, then we need to make sure that the
+    // high bits (all except for the first) don't affect the result.
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(N->getOperand(
+ N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
+ unsigned OpBits = N->getOperand(0).getValueSizeInBits();
+
+ if (ISD::isSignedIntSetCC(CC)) {
+ if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
+ DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
+ return SDValue();
+ } else if (ISD::isUnsignedIntSetCC(CC)) {
+ if (!DAG.MaskedValueIsZero(N->getOperand(0),
+ APInt::getHighBitsSet(OpBits, OpBits-1)) ||
+ !DAG.MaskedValueIsZero(N->getOperand(1),
+ APInt::getHighBitsSet(OpBits, OpBits-1)))
+ return SDValue();
+ } else {
+ // This is neither a signed nor an unsigned comparison, just make sure
+ // that the high bits are equal.
+ APInt Op1Zero, Op1One;
+ APInt Op2Zero, Op2One;
+ DAG.ComputeMaskedBits(N->getOperand(0), Op1Zero, Op1One);
+ DAG.ComputeMaskedBits(N->getOperand(1), Op2Zero, Op2One);
+
+ // We don't really care about what is known about the first bit (if
+ // anything), so clear it in all masks prior to comparing them.
+ Op1Zero.clearBit(0); Op1One.clearBit(0);
+ Op2Zero.clearBit(0); Op2One.clearBit(0);
+
+ if (Op1Zero != Op2Zero || Op1One != Op2One)
+ return SDValue();
+ }
+ }
+
+ // We now know that the higher-order bits are irrelevant, we just need to
+ // make sure that all of the intermediate operations are bit operations, and
+ // all inputs are extensions.
+ if (N->getOperand(0).getOpcode() != ISD::AND &&
+ N->getOperand(0).getOpcode() != ISD::OR &&
+ N->getOperand(0).getOpcode() != ISD::XOR &&
+ N->getOperand(0).getOpcode() != ISD::SELECT &&
+ N->getOperand(0).getOpcode() != ISD::SELECT_CC &&
+ N->getOperand(0).getOpcode() != ISD::TRUNCATE &&
+ N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&
+ N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
+ N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)
+ return SDValue();
+
+ if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
+ N->getOperand(1).getOpcode() != ISD::AND &&
+ N->getOperand(1).getOpcode() != ISD::OR &&
+ N->getOperand(1).getOpcode() != ISD::XOR &&
+ N->getOperand(1).getOpcode() != ISD::SELECT &&
+ N->getOperand(1).getOpcode() != ISD::SELECT_CC &&
+ N->getOperand(1).getOpcode() != ISD::TRUNCATE &&
+ N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&
+ N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
+ N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)
+ return SDValue();
+
+ SmallVector<SDValue, 4> Inputs;
+ SmallVector<SDValue, 8> BinOps, PromOps;
+ SmallPtrSet<SDNode *, 16> Visited;
+
+ for (unsigned i = 0; i < 2; ++i) {
+ if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
+ N->getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
+ isa<ConstantSDNode>(N->getOperand(i)))
+ Inputs.push_back(N->getOperand(i));
+ else
+ BinOps.push_back(N->getOperand(i));
+
+ if (N->getOpcode() == ISD::TRUNCATE)
+ break;
+ }
+
+ // Visit all inputs, collect all binary operations (and, or, xor and
+ // select) that are all fed by extensions.
+ while (!BinOps.empty()) {
+ SDValue BinOp = BinOps.back();
+ BinOps.pop_back();
+
+ if (!Visited.insert(BinOp.getNode()))
+ continue;
+
+ PromOps.push_back(BinOp);
+
+ for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
+ // The condition of the select is not promoted.
+ if (BinOp.getOpcode() == ISD::SELECT && i == 0)
+ continue;
+ if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
+ continue;
+
+ if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
+ BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
+ BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
+ BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
+ isa<ConstantSDNode>(BinOp.getOperand(i))) {
+ Inputs.push_back(BinOp.getOperand(i));
+ } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
+ BinOp.getOperand(i).getOpcode() == ISD::OR ||
+ BinOp.getOperand(i).getOpcode() == ISD::XOR ||
+ BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
+ BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
+ BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
+ BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
+ BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
+ BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
+ BinOps.push_back(BinOp.getOperand(i));
+ } else {
+ // We have an input that is not an extension or another binary
+ // operation; we'll abort this transformation.
+ return SDValue();
+ }
+ }
+ }
+
+ // Make sure that this is a self-contained cluster of operations (which
+ // is not quite the same thing as saying that everything has only one
+ // use).
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+ if (isa<ConstantSDNode>(Inputs[i]))
+ continue;
+
+ for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
+ UE = Inputs[i].getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User != N && !Visited.count(User))
+ return SDValue();
+
+ // Make sure that we're not going to promote the non-output-value
+ // operand(s) or SELECT or SELECT_CC.
+ // FIXME: Although we could sometimes handle this, and it does occur in
+ // practice that one of the condition inputs to the select is also one of
+ // the outputs, we currently can't deal with this.
+ if (User->getOpcode() == ISD::SELECT) {
+ if (User->getOperand(0) == Inputs[i])
+ return SDValue();
+ } else if (User->getOpcode() == ISD::SELECT_CC) {
+ if (User->getOperand(0) == Inputs[i] ||
+ User->getOperand(1) == Inputs[i])
+ return SDValue();
+ }
+ }
+ }
+
+ for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
+ for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
+ UE = PromOps[i].getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User != N && !Visited.count(User))
+ return SDValue();
+
+ // Make sure that we're not going to promote the non-output-value
+ // operand(s) or SELECT or SELECT_CC.
+ // FIXME: Although we could sometimes handle this, and it does occur in
+ // practice that one of the condition inputs to the select is also one of
+ // the outputs, we currently can't deal with this.
+ if (User->getOpcode() == ISD::SELECT) {
+ if (User->getOperand(0) == PromOps[i])
+ return SDValue();
+ } else if (User->getOpcode() == ISD::SELECT_CC) {
+ if (User->getOperand(0) == PromOps[i] ||
+ User->getOperand(1) == PromOps[i])
+ return SDValue();
+ }
+ }
+ }
+
+ // Replace all inputs with the extension operand.
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+ // Constants may have users outside the cluster of to-be-promoted nodes,
+ // and so we need to replace those as we do the promotions.
+ if (isa<ConstantSDNode>(Inputs[i]))
+ continue;
+ else
+ DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
+ }
+
+ // Replace all operations (these are all the same, but have a different
+ // (i1) return type). DAG.getNode will validate that the types of
+ // a binary operator match, so go through the list in reverse so that
+ // we've likely promoted both operands first. Any intermediate truncations or
+ // extensions disappear.
+ while (!PromOps.empty()) {
+ SDValue PromOp = PromOps.back();
+ PromOps.pop_back();
+
+ if (PromOp.getOpcode() == ISD::TRUNCATE ||
+ PromOp.getOpcode() == ISD::SIGN_EXTEND ||
+ PromOp.getOpcode() == ISD::ZERO_EXTEND ||
+ PromOp.getOpcode() == ISD::ANY_EXTEND) {
+ if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
+ PromOp.getOperand(0).getValueType() != MVT::i1) {
+ // The operand is not yet ready (see comment below).
+ PromOps.insert(PromOps.begin(), PromOp);
+ continue;
+ }
+
+ SDValue RepValue = PromOp.getOperand(0);
+ if (isa<ConstantSDNode>(RepValue))
+ RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);
+
+ DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);
+ continue;
+ }
+
+ unsigned C;
+ switch (PromOp.getOpcode()) {
+ default: C = 0; break;
+ case ISD::SELECT: C = 1; break;
+ case ISD::SELECT_CC: C = 2; break;
+ }
+
+ if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
+ PromOp.getOperand(C).getValueType() != MVT::i1) ||
+ (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
+ PromOp.getOperand(C+1).getValueType() != MVT::i1)) {
+ // The to-be-promoted operands of this node have not yet been
+ // promoted (this should be rare because we're going through the
+ // list backward, but if one of the operands has several users in
+ // this cluster of to-be-promoted nodes, it is possible).
+ PromOps.insert(PromOps.begin(), PromOp);
+ continue;
+ }
+
+ SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
+ PromOp.getNode()->op_end());
+
+ // If there are any constant inputs, make sure they're replaced now.
+ for (unsigned i = 0; i < 2; ++i)
+ if (isa<ConstantSDNode>(Ops[C+i]))
+ Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);
+
+ DAG.ReplaceAllUsesOfValueWith(PromOp,
+ DAG.getNode(PromOp.getOpcode(), dl, MVT::i1,
+ Ops.data(), Ops.size()));
+ }
+
+ // Now we're left with the initial truncation itself.
+ if (N->getOpcode() == ISD::TRUNCATE)
+ return N->getOperand(0);
+
+ // Otherwise, this is a comparison. The operands to be compared have just
+ // changed type (to i1), but everything else is the same.
+ return SDValue(N, 0);
+}
+
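
The combine above looks for boolean logic that has been widened into integer registers only to be narrowed back to a flag. A rough source-level example of the shape involved (hypothetical function, for illustration):

    // Without the combine, A and B are zero-extended into GPRs, the AND is done
    // there, and the result is truncated back to i1; with CR bits tracked, the
    // whole expression can stay as an i1 (CR-bit) operation.
    bool bothSet(bool A, bool B) { return A & B; }  // trunc(and(zext(A), zext(B)))
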
+SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(N);
+
+ // If we're tracking CR bits, we need to be careful that we don't have:
+ // zext(binary-ops(trunc(x), trunc(y)))
+ // or
+ // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
+ // such that we're unnecessarily moving things into CR bits that can more
+ // efficiently stay in GPRs. Note that if we're not certain that the high
+ // bits are set as required by the final extension, we still may need to do
+ // some masking to get the proper behavior.
+
+ // This same functionality is important on PPC64 when dealing with
+ // 32-to-64-bit extensions; these occur often when 32-bit values are used as
+ // the return values of functions. Because it is so similar, it is handled
+ // here as well.
+
+ if (N->getValueType(0) != MVT::i32 &&
+ N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ if (!((N->getOperand(0).getValueType() == MVT::i1 &&
+ PPCSubTarget.useCRBits()) ||
+ (N->getOperand(0).getValueType() == MVT::i32 &&
+ PPCSubTarget.isPPC64())))
+ return SDValue();
+
+ if (N->getOperand(0).getOpcode() != ISD::AND &&
+ N->getOperand(0).getOpcode() != ISD::OR &&
+ N->getOperand(0).getOpcode() != ISD::XOR &&
+ N->getOperand(0).getOpcode() != ISD::SELECT &&
+ N->getOperand(0).getOpcode() != ISD::SELECT_CC)
+ return SDValue();
+
+ SmallVector<SDValue, 4> Inputs;
+ SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
+ SmallPtrSet<SDNode *, 16> Visited;
+
+ // Visit all inputs, collect all binary operations (and, or, xor and
+ // select) that are all fed by truncations.
+ while (!BinOps.empty()) {
+ SDValue BinOp = BinOps.back();
+ BinOps.pop_back();
+
+ if (!Visited.insert(BinOp.getNode()))
+ continue;
+
+ PromOps.push_back(BinOp);
+
+ for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
+ // The condition of the select is not promoted.
+ if (BinOp.getOpcode() == ISD::SELECT && i == 0)
+ continue;
+ if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
+ continue;
+
+ if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
+ isa<ConstantSDNode>(BinOp.getOperand(i))) {
+ Inputs.push_back(BinOp.getOperand(i));
+ } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
+ BinOp.getOperand(i).getOpcode() == ISD::OR ||
+ BinOp.getOperand(i).getOpcode() == ISD::XOR ||
+ BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
+ BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
+ BinOps.push_back(BinOp.getOperand(i));
+ } else {
+ // We have an input that is not a truncation or another binary
+ // operation; we'll abort this transformation.
+ return SDValue();
+ }
+ }
+ }
+
+ // Make sure that this is a self-contained cluster of operations (which
+ // is not quite the same thing as saying that everything has only one
+ // use).
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+ if (isa<ConstantSDNode>(Inputs[i]))
+ continue;
+
+ for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
+ UE = Inputs[i].getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User != N && !Visited.count(User))
+ return SDValue();
+
+ // Make sure that we're not going to promote the non-output-value
+ // operand(s) or SELECT or SELECT_CC.
+ // FIXME: Although we could sometimes handle this, and it does occur in
+ // practice that one of the condition inputs to the select is also one of
+ // the outputs, we currently can't deal with this.
+ if (User->getOpcode() == ISD::SELECT) {
+ if (User->getOperand(0) == Inputs[i])
+ return SDValue();
+ } else if (User->getOpcode() == ISD::SELECT_CC) {
+ if (User->getOperand(0) == Inputs[i] ||
+ User->getOperand(1) == Inputs[i])
+ return SDValue();
+ }
+ }
+ }
+
+ for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
+ for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
+ UE = PromOps[i].getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User != N && !Visited.count(User))
+ return SDValue();
+
+ // Make sure that we're not going to promote the non-output-value
+ // operand(s) or SELECT or SELECT_CC.
+ // FIXME: Although we could sometimes handle this, and it does occur in
+ // practice that one of the condition inputs to the select is also one of
+ // the outputs, we currently can't deal with this.
+ if (User->getOpcode() == ISD::SELECT) {
+ if (User->getOperand(0) == PromOps[i])
+ return SDValue();
+ } else if (User->getOpcode() == ISD::SELECT_CC) {
+ if (User->getOperand(0) == PromOps[i] ||
+ User->getOperand(1) == PromOps[i])
+ return SDValue();
+ }
+ }
+ }
+
+ unsigned PromBits = N->getOperand(0).getValueSizeInBits();
+ bool ReallyNeedsExt = false;
+ if (N->getOpcode() != ISD::ANY_EXTEND) {
+ // If all of the inputs are not already sign/zero extended, then
+ // we'll still need to do that at the end.
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+ if (isa<ConstantSDNode>(Inputs[i]))
+ continue;
+
+ unsigned OpBits =
+ Inputs[i].getOperand(0).getValueSizeInBits();
+ assert(PromBits < OpBits && "Truncation not to a smaller bit count?");
+
+ if ((N->getOpcode() == ISD::ZERO_EXTEND &&
+ !DAG.MaskedValueIsZero(Inputs[i].getOperand(0),
+ APInt::getHighBitsSet(OpBits,
+ OpBits-PromBits))) ||
+ (N->getOpcode() == ISD::SIGN_EXTEND &&
+ DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <
+ (OpBits-(PromBits-1)))) {
+ ReallyNeedsExt = true;
+ break;
+ }
+ }
+ }
+
+ // Replace all inputs, either with the truncation operand, or a
+ // truncation or extension to the final output type.
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+ // Constant inputs need to be replaced with the to-be-promoted nodes that
+ // use them because they might have users outside of the cluster of
+ // promoted nodes.
+ if (isa<ConstantSDNode>(Inputs[i]))
+ continue;
+
+ SDValue InSrc = Inputs[i].getOperand(0);
+ if (Inputs[i].getValueType() == N->getValueType(0))
+ DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);
+ else if (N->getOpcode() == ISD::SIGN_EXTEND)
+ DAG.ReplaceAllUsesOfValueWith(Inputs[i],
+ DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));
+ else if (N->getOpcode() == ISD::ZERO_EXTEND)
+ DAG.ReplaceAllUsesOfValueWith(Inputs[i],
+ DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));
+ else
+ DAG.ReplaceAllUsesOfValueWith(Inputs[i],
+ DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
+ }
+
+ // Replace all operations (these are all the same, but have a different
+ // (promoted) return type). DAG.getNode will validate that the types of
+ // a binary operator match, so go through the list in reverse so that
+ // we've likely promoted both operands first.
+ while (!PromOps.empty()) {
+ SDValue PromOp = PromOps.back();
+ PromOps.pop_back();
+
+ unsigned C;
+ switch (PromOp.getOpcode()) {
+ default: C = 0; break;
+ case ISD::SELECT: C = 1; break;
+ case ISD::SELECT_CC: C = 2; break;
+ }
+
+ if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
+ PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
+ (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
+ PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
+ // The to-be-promoted operands of this node have not yet been
+ // promoted (this should be rare because we're going through the
+ // list backward, but if one of the operands has several users in
+ // this cluster of to-be-promoted nodes, it is possible).
+ PromOps.insert(PromOps.begin(), PromOp);
+ continue;
+ }
+
+ SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
+ PromOp.getNode()->op_end());
+
+ // If this node has constant inputs, then they'll need to be promoted here.
+ for (unsigned i = 0; i < 2; ++i) {
+ if (!isa<ConstantSDNode>(Ops[C+i]))
+ continue;
+ if (Ops[C+i].getValueType() == N->getValueType(0))
+ continue;
+
+ if (N->getOpcode() == ISD::SIGN_EXTEND)
+ Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+ else if (N->getOpcode() == ISD::ZERO_EXTEND)
+ Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+ else
+ Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+ }
+
+ DAG.ReplaceAllUsesOfValueWith(PromOp,
+ DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0),
+ Ops.data(), Ops.size()));
+ }
+
+ // Now we're left with the initial extension itself.
+ if (!ReallyNeedsExt)
+ return N->getOperand(0);
+
+ // To zero extend, just mask off everything except for the first bit (in the
+ // i1 case).
+ if (N->getOpcode() == ISD::ZERO_EXTEND)
+ return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
+ DAG.getConstant(APInt::getLowBitsSet(
+ N->getValueSizeInBits(0), PromBits),
+ N->getValueType(0)));
+
+ assert(N->getOpcode() == ISD::SIGN_EXTEND &&
+ "Invalid extension type");
+ EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0));
+ SDValue ShiftCst =
+ DAG.getConstant(N->getValueSizeInBits(0)-PromBits, ShiftAmountTy);
+ return DAG.getNode(ISD::SRA, dl, N->getValueType(0),
+ DAG.getNode(ISD::SHL, dl, N->getValueType(0),
+ N->getOperand(0), ShiftCst), ShiftCst);
+}
+
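
This second combine handles the mirror-image pattern, which is common on PPC64 where 32-bit values are carried in 64-bit registers. A rough source-level example of the shape involved (hypothetical function; assumes an LP64 target where long is 64 bits, with two's-complement narrowing):

    // The OR is performed on values truncated from 64 bits and the result is
    // extended back; the combine keeps the logic in the wider type so that no
    // truncate/extend pair is emitted.
    long combineFlags(long X, long Y) {
      int Lo = (int)X | (int)Y;   // binary-ops(trunc(X), trunc(Y))
      return (long)Lo;            // sign extension back to the original width
    }
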
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
const TargetMachine &TM = getTargetMachine();
@@ -7006,6 +7868,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return N->getOperand(0);
}
break;
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ return DAGCombineExtBoolTrunc(N, DCI);
+ case ISD::TRUNCATE:
+ case ISD::SETCC:
+ case ISD::SELECT_CC:
+ return DAGCombineTruncBoolExt(N, DCI);
case ISD::FDIV: {
assert(TM.Options.UnsafeFPMath &&
"Reciprocal estimates require UnsafeFPMath");
@@ -7204,7 +8074,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// you might suspect (sizeof(vector) bytes after the last requested
// load), but rather sizeof(vector) - 1 bytes after the last
// requested vector. The point of this is to avoid a page fault if the
- // base address happend to be aligned. This works because if the base
+ // base address happened to be aligned. This works because if the base
// address is aligned, then adding less than a full vector length will
// cause the last vector in the sequence to be (re)loaded. Otherwise,
// the next vector will be fetched as you might suspect was necessary.
@@ -7421,6 +8291,25 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
break;
}
+ case ISD::BRCOND: {
+ SDValue Cond = N->getOperand(1);
+ SDValue Target = N->getOperand(2);
+
+ if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+ cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
+ Intrinsic::ppc_is_decremented_ctr_nonzero) {
+
+ // We now need to make the intrinsic dead (it cannot be instruction
+ // selected).
+ DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
+ assert(Cond.getNode()->hasOneUse() &&
+ "Counter decrement has more than one use");
+
+ return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other,
+ N->getOperand(0), Target);
+ }
+ }
+ break;
case ISD::BR_CC: {
// If this is a branch on an altivec predicate comparison, lower this so
// that we don't have to do a MFOCRF: instead, branch directly on CR6. This
@@ -7585,6 +8474,11 @@ PPCTargetLowering::getConstraintType(const std::string &Constraint) const {
// suboptimal.
return C_Memory;
}
+ } else if (Constraint == "wc") { // individual CR bits.
+ return C_RegisterClass;
+ } else if (Constraint == "wa" || Constraint == "wd" ||
+ Constraint == "wf" || Constraint == "ws") {
+ return C_RegisterClass; // VSX registers.
}
return TargetLowering::getConstraintType(Constraint);
}
@@ -7602,7 +8496,18 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
if (CallOperandVal == NULL)
return CW_Default;
Type *type = CallOperandVal->getType();
+
// Look at the constraint type.
+ if (StringRef(constraint) == "wc" && type->isIntegerTy(1))
+ return CW_Register; // an individual CR bit.
+ else if ((StringRef(constraint) == "wa" ||
+ StringRef(constraint) == "wd" ||
+ StringRef(constraint) == "wf") &&
+ type->isVectorTy())
+ return CW_Register;
+ else if (StringRef(constraint) == "ws" && type->isDoubleTy())
+ return CW_Register;
+
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
@@ -7658,6 +8563,13 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case 'y': // crrc
return std::make_pair(0U, &PPC::CRRCRegClass);
}
+ } else if (Constraint == "wc") { // an individual CR bit.
+ return std::make_pair(0U, &PPC::CRBITRCRegClass);
+ } else if (Constraint == "wa" || Constraint == "wd" ||
+ Constraint == "wf") {
+ return std::make_pair(0U, &PPC::VSRCRegClass);
+ } else if (Constraint == "ws") {
+ return std::make_pair(0U, &PPC::VSFRCRegClass);
}
std::pair<unsigned, const TargetRegisterClass*> R =
@@ -7793,6 +8705,9 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
MachineFrameInfo *MFI = MF.getFrameInfo();
MFI->setReturnAddressIsTaken(true);
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -7881,6 +8796,7 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
}
bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+ unsigned,
bool *Fast) const {
if (DisablePPCUnaligned)
return false;
@@ -7894,8 +8810,14 @@ bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
if (!VT.isSimple())
return false;
- if (VT.getSimpleVT().isVector())
- return false;
+ if (VT.getSimpleVT().isVector()) {
+ if (PPCSubTarget.hasVSX()) {
+ if (VT != MVT::v2f64 && VT != MVT::v2i64)
+ return false;
+ } else {
+ return false;
+ }
+ }
if (VT == MVT::ppcf128)
return false;
@@ -7923,6 +8845,15 @@ bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
return false;
}
+bool
+PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
+ EVT VT , unsigned DefinedValues) const {
+ if (VT == MVT::v2i64)
+ return false;
+
+ return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
+}
+
Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
if (DisableILPPref || PPCSubTarget.enableMachineScheduler())
return TargetLowering::getSchedulingPreference(N);
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index df3af35..da6d4dc 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -19,8 +19,8 @@
#include "PPCInstrInfo.h"
#include "PPCRegisterInfo.h"
#include "PPCSubtarget.h"
-#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
namespace llvm {
@@ -121,6 +121,12 @@ namespace llvm {
/// resultant GPR. Bits corresponding to other CR regs are undefined.
MFOCRF,
+ // FIXME: Remove these once the ANDI glue bug is fixed:
+ /// i1 = ANDIo_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
+ /// eq or gt bit of CR0 after executing andi. x, 1. This is used to
+ /// implement truncation of i32 or i64 to i1.
+ ANDIo_1_EQ_BIT, ANDIo_1_GT_BIT,
+
// EH_SJLJ_SETJMP - SjLj exception handling setjmp.
EH_SJLJ_SETJMP,
@@ -177,6 +183,10 @@ namespace llvm {
CR6SET,
CR6UNSET,
+ /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS
+ /// on PPC32.
+ PPC32_GOT,
+
/// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
/// TLS model, produces an ADDIS8 instruction that adds the GOT
/// base to sym\@got\@tprel\@ha.
@@ -457,7 +467,9 @@ namespace llvm {
/// Is unaligned memory access allowed for the given type, and is it fast
/// relative to software emulation.
- virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast = 0) const;
+ virtual bool allowsUnalignedMemoryAccesses(EVT VT,
+ unsigned AddrSpace,
+ bool *Fast = 0) const;
/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
@@ -465,6 +477,11 @@ namespace llvm {
/// expanded to fmul + fadd.
virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
+ // Should we expand the build vector with shuffles?
+ virtual bool
+ shouldExpandBuildVectorWithShuffles(EVT VT,
+ unsigned DefinedValues) const;
+
/// createFastISel - This method returns a target-specific FastISel object,
/// or null if the target does not support "fast" instruction selection.
virtual FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
@@ -509,6 +526,9 @@ namespace llvm {
const PPCSubtarget &Subtarget) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) const;
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, SDLoc dl) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
@@ -520,6 +540,7 @@ namespace llvm {
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
@@ -625,6 +646,8 @@ namespace llvm {
SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue DAGCombineFastRecip(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue DAGCombineFastRecipFSQRT(SDValue Op, DAGCombinerInfo &DCI) const;
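
The ANDIo_1_[EQ|GT]_BIT nodes documented above model i1 truncation as "andi. x, 1" followed by reading a bit of CR0. A quick standalone check of the bit-level identity that makes this work, in plain C++ (not LLVM code): because x & 1 is 0 or 1, CR0's GT bit equals the truncated value and the EQ bit is its complement.

    #include <cassert>
    #include <cstdint>

    // andi. x, 1 computes M = x & 1 and compares M with zero:
    //   EQ bit = (M == 0), GT bit = (M > 0).
    // Since M is 0 or 1, GT == (x & 1) and EQ == !(x & 1).
    int main() {
      for (uint64_t X : {0ull, 1ull, 2ull, 3ull, 0xFFFFFFFFFFFFFFFFull}) {
        uint64_t M = X & 1;
        bool EQ = (M == 0);
        bool GT = (M > 0);
        assert(GT == static_cast<bool>(X & 1)); // GT bit is the i1 truncation
        assert(EQ == !GT);                      // EQ bit is its complement
      }
      return 0;
    }
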
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 46db4fe..b71c09e 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -19,11 +19,13 @@ def s16imm64 : Operand<i64> {
let PrintMethod = "printS16ImmOperand";
let EncoderMethod = "getImm16Encoding";
let ParserMatchClass = PPCS16ImmAsmOperand;
+ let DecoderMethod = "decodeSImmOperand<16>";
}
def u16imm64 : Operand<i64> {
let PrintMethod = "printU16ImmOperand";
let EncoderMethod = "getImm16Encoding";
let ParserMatchClass = PPCU16ImmAsmOperand;
+ let DecoderMethod = "decodeUImmOperand<16>";
}
def s17imm64 : Operand<i64> {
// This operand type is used for addis/lis to allow the assembler parser
@@ -32,14 +34,11 @@ def s17imm64 : Operand<i64> {
let PrintMethod = "printS16ImmOperand";
let EncoderMethod = "getImm16Encoding";
let ParserMatchClass = PPCS17ImmAsmOperand;
+ let DecoderMethod = "decodeSImmOperand<16>";
}
def tocentry : Operand<iPTR> {
let MIOperandInfo = (ops i64imm:$imm);
}
-def PPCTLSRegOperand : AsmOperandClass {
- let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
- let RenderMethod = "addTLSRegOperands";
-}
def tlsreg : Operand<i64> {
let EncoderMethod = "getTLSRegEncoding";
let ParserMatchClass = PPCTLSRegOperand;
@@ -80,15 +79,22 @@ def HI48_64 : SDNodeXForm<imm, [{
// Calls.
//
-let Interpretation64Bit = 1 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
let isBranch = 1, isIndirectBranch = 1, Uses = [CTR8] in {
- def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+ def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+ []>,
+ Requires<[In64BitMode]>;
+ def BCCCTR8 : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
+ "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB,
+ []>,
Requires<[In64BitMode]>;
- let isCodeGenOnly = 1 in
- def BCCTR8 : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
- "b${cond:cc}ctr${cond:pm} ${cond:reg}", BrB, []>,
+ def BCCTR8 : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi),
+ "bcctr 12, $bi, 0", IIC_BrB, []>,
+ Requires<[In64BitMode]>;
+ def BCCTR8n : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi),
+ "bcctr 4, $bi, 0", IIC_BrB, []>,
Requires<[In64BitMode]>;
}
}
@@ -107,9 +113,9 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
let isReturn = 1, Defs = [CTR8], Uses = [CTR8, LR8, RM] in {
def BDZLR8 : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
- "bdzlr", BrB, []>;
+ "bdzlr", IIC_BrB, []>;
def BDNZLR8 : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
- "bdnzlr", BrB, []>;
+ "bdnzlr", IIC_BrB, []>;
}
}
@@ -119,41 +125,58 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL8 : IForm<18, 0, 1, (outs), (ins calltarget:$func),
- "bl $func", BrB, []>; // See Pat patterns below.
+ "bl $func", IIC_BrB, []>; // See Pat patterns below.
def BL8_TLS : IForm<18, 0, 1, (outs), (ins tlscall:$func),
- "bl $func", BrB, []>;
+ "bl $func", IIC_BrB, []>;
def BLA8 : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
- "bla $func", BrB, [(PPCcall (i64 imm:$func))]>;
+ "bla $func", IIC_BrB, [(PPCcall (i64 imm:$func))]>;
}
let Uses = [RM], isCodeGenOnly = 1 in {
def BL8_NOP : IForm_and_DForm_4_zero<18, 0, 1, 24,
(outs), (ins calltarget:$func),
- "bl $func\n\tnop", BrB, []>;
+ "bl $func\n\tnop", IIC_BrB, []>;
def BL8_NOP_TLS : IForm_and_DForm_4_zero<18, 0, 1, 24,
(outs), (ins tlscall:$func),
- "bl $func\n\tnop", BrB, []>;
+ "bl $func\n\tnop", IIC_BrB, []>;
def BLA8_NOP : IForm_and_DForm_4_zero<18, 1, 1, 24,
(outs), (ins abscalltarget:$func),
- "bla $func\n\tnop", BrB,
+ "bla $func\n\tnop", IIC_BrB,
[(PPCcall_nop (i64 imm:$func))]>;
}
let Uses = [CTR8, RM] in {
def BCTRL8 : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
- "bctrl", BrB, [(PPCbctrl)]>,
+ "bctrl", IIC_BrB, [(PPCbctrl)]>,
Requires<[In64BitMode]>;
- let isCodeGenOnly = 1 in
- def BCCTRL8 : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
- "b${cond:cc}ctrl${cond:pm} ${cond:reg}", BrB, []>,
- Requires<[In64BitMode]>;
+ let isCodeGenOnly = 1 in {
+ def BCCCTRL8 : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
+ "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB,
+ []>,
+ Requires<[In64BitMode]>;
+
+ def BCCTRL8 : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi),
+ "bcctrl 12, $bi, 0", IIC_BrB, []>,
+ Requires<[In64BitMode]>;
+ def BCCTRL8n : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi),
+ "bcctrl 4, $bi, 0", IIC_BrB, []>,
+ Requires<[In64BitMode]>;
+ }
}
}
} // Interpretation64Bit
+// FIXME: Duplicating this for the asm parser should be unnecessary, but the
+// previous definition must be marked as CodeGen only to prevent decoding
+// conflicts.
+let Interpretation64Bit = 1, isAsmParserOnly = 1 in
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8], Uses = [RM] in
+def BL8_TLS_ : IForm<18, 0, 1, (outs), (ins tlscall:$func),
+ "bl $func", IIC_BrB, []>;
+
// Calls
def : Pat<(PPCcall (i64 tglobaladdr:$dst)),
(BL8 tglobaladdr:$dst)>;
@@ -199,16 +222,16 @@ let usesCustomInserter = 1 in {
// Instructions to support atomic operations
def LDARX : XForm_1<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
- "ldarx $rD, $ptr", LdStLDARX,
+ "ldarx $rD, $ptr", IIC_LdStLDARX,
[(set i64:$rD, (PPClarx xoaddr:$ptr))]>;
let Defs = [CR0] in
def STDCX : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
- "stdcx. $rS, $dst", LdStSTDCX,
+ "stdcx. $rS, $dst", IIC_LdStSTDCX,
[(PPCstcx i64:$rS, xoaddr:$dst)]>,
isDOT;
-let Interpretation64Bit = 1 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
def TCRETURNdi8 :Pseudo< (outs),
(ins calltarget:$dst, i32imm:$offset),
@@ -225,28 +248,23 @@ def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset),
"#TC_RETURNr8 $dst $offset",
[]>;
-let isCodeGenOnly = 1 in {
-
let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR8, RM] in
-def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+ []>,
Requires<[In64BitMode]>;
-
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
def TAILB8 : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
- "b $dst", BrB,
+ "b $dst", IIC_BrB,
[]>;
-
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
def TAILBA8 : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst),
- "ba $dst", BrB,
+ "ba $dst", IIC_BrB,
[]>;
-
-}
} // Interpretation64Bit
def : Pat<(PPCtc_return (i64 tglobaladdr:$dst), imm:$imm),
@@ -260,23 +278,23 @@ def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
// 64-bit CR instructions
-let Interpretation64Bit = 1 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
let neverHasSideEffects = 1 in {
def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST),
- "mtocrf $FXM, $ST", BrMCRX>,
+ "mtocrf $FXM, $ST", IIC_BrMCRX>,
PPC970_DGroup_First, PPC970_Unit_CRU;
def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS),
- "mtcrf $FXM, $rS", BrMCRX>,
+ "mtcrf $FXM, $rS", IIC_BrMCRX>,
PPC970_MicroCode, PPC970_Unit_CRU;
let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking.
def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM),
- "mfocrf $rT, $FXM", SprMFCR>,
+ "mfocrf $rT, $FXM", IIC_SprMFCRF>,
PPC970_DGroup_First, PPC970_Unit_CRU;
def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins),
- "mfcr $rT", SprMFCR>,
+ "mfcr $rT", IIC_SprMFCR>,
PPC970_MicroCode, PPC970_Unit_CRU;
} // neverHasSideEffects = 1
@@ -298,24 +316,24 @@ let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
let Uses = [CTR8] in {
def MFCTR8 : XFXForm_1_ext<31, 339, 9, (outs g8rc:$rT), (ins),
- "mfctr $rT", SprMFSPR>,
+ "mfctr $rT", IIC_SprMFSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
let Pattern = [(PPCmtctr i64:$rS)], Defs = [CTR8] in {
def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
- "mtctr $rS", SprMTSPR>,
+ "mtctr $rS", IIC_SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
-let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR8] in {
+let hasSideEffects = 1, Defs = [CTR8] in {
let Pattern = [(int_ppc_mtctr i64:$rS)] in
def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
- "mtctr $rS", SprMTSPR>,
+ "mtctr $rS", IIC_SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
-let isCodeGenOnly = 1, Pattern = [(set i64:$rT, readcyclecounter)] in
+let Pattern = [(set i64:$rT, readcyclecounter)] in
def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs g8rc:$rT), (ins),
- "mfspr $rT, 268", SprMFTB>,
+ "mfspr $rT, 268", IIC_SprMFTB>,
PPC970_DGroup_First, PPC970_Unit_FXU;
// Note that encoding mftb using mfspr is now the preferred form,
// and has been since at least ISA v2.03. The mftb instruction has
@@ -329,12 +347,12 @@ def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#D
let Defs = [LR8] in {
def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS),
- "mtlr $rS", SprMTSPR>,
+ "mtlr $rS", IIC_SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
let Uses = [LR8] in {
def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs g8rc:$rT), (ins),
- "mflr $rT", SprMFSPR>,
+ "mflr $rT", IIC_SprMFSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
} // Interpretation64Bit
@@ -346,213 +364,236 @@ def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs g8rc:$rT), (ins),
let PPC970_Unit = 1 in { // FXU Operations.
let Interpretation64Bit = 1 in {
let neverHasSideEffects = 1 in {
+let isCodeGenOnly = 1 in {
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
def LI8 : DForm_2_r0<14, (outs g8rc:$rD), (ins s16imm64:$imm),
- "li $rD, $imm", IntSimple,
+ "li $rD, $imm", IIC_IntSimple,
[(set i64:$rD, imm64SExt16:$imm)]>;
def LIS8 : DForm_2_r0<15, (outs g8rc:$rD), (ins s17imm64:$imm),
- "lis $rD, $imm", IntSimple,
+ "lis $rD, $imm", IIC_IntSimple,
[(set i64:$rD, imm16ShiftedSExt:$imm)]>;
}
// Logical ops.
+let isCommutable = 1 in {
defm NAND8: XForm_6r<31, 476, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
- "nand", "$rA, $rS, $rB", IntSimple,
+ "nand", "$rA, $rS, $rB", IIC_IntSimple,
[(set i64:$rA, (not (and i64:$rS, i64:$rB)))]>;
defm AND8 : XForm_6r<31, 28, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
- "and", "$rA, $rS, $rB", IntSimple,
+ "and", "$rA, $rS, $rB", IIC_IntSimple,
[(set i64:$rA, (and i64:$rS, i64:$rB))]>;
+} // isCommutable
defm ANDC8: XForm_6r<31, 60, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
- "andc", "$rA, $rS, $rB", IntSimple,
+ "andc", "$rA, $rS, $rB", IIC_IntSimple,
[(set i64:$rA, (and i64:$rS, (not i64:$rB)))]>;
+let isCommutable = 1 in {
defm OR8 : XForm_6r<31, 444, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
- "or", "$rA, $rS, $rB", IntSimple,
+ "or", "$rA, $rS, $rB", IIC_IntSimple,
[(set i64:$rA, (or i64:$rS, i64:$rB))]>;
defm NOR8 : XForm_6r<31, 124, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
- "nor", "$rA, $rS, $rB", IntSimple,
+ "nor", "$rA, $rS, $rB", IIC_IntSimple,
[(set i64:$rA, (not (or i64:$rS, i64:$rB)))]>;
+} // isCommutable
defm ORC8 : XForm_6r<31, 412, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
- "orc", "$rA, $rS, $rB", IntSimple,
+ "orc", "$rA, $rS, $rB", IIC_IntSimple,
[(set i64:$rA, (or i64:$rS, (not i64:$rB)))]>;
+let isCommutable = 1 in {
defm EQV8 : XForm_6r<31, 284, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
- "eqv", "$rA, $rS, $rB", IntSimple,
+ "eqv", "$rA, $rS, $rB", IIC_IntSimple,
[(set i64:$rA, (not (xor i64:$rS, i64:$rB)))]>;
defm XOR8 : XForm_6r<31, 316, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
- "xor", "$rA, $rS, $rB", IntSimple,
+ "xor", "$rA, $rS, $rB", IIC_IntSimple,
[(set i64:$rA, (xor i64:$rS, i64:$rB))]>;
+} // let isCommutable = 1
// Logical ops with immediate.
let Defs = [CR0] in {
-def ANDIo8 : DForm_4<28, (outs g8rc:$dst), (ins g8rc:$src1, u16imm:$src2),
- "andi. $dst, $src1, $src2", IntGeneral,
+def ANDIo8 : DForm_4<28, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "andi. $dst, $src1, $src2", IIC_IntGeneral,
[(set i64:$dst, (and i64:$src1, immZExt16:$src2))]>,
isDOT;
-def ANDISo8 : DForm_4<29, (outs g8rc:$dst), (ins g8rc:$src1, u16imm:$src2),
- "andis. $dst, $src1, $src2", IntGeneral,
+def ANDISo8 : DForm_4<29, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "andis. $dst, $src1, $src2", IIC_IntGeneral,
[(set i64:$dst, (and i64:$src1, imm16ShiftedZExt:$src2))]>,
isDOT;
}
-def ORI8 : DForm_4<24, (outs g8rc:$dst), (ins g8rc:$src1, u16imm:$src2),
- "ori $dst, $src1, $src2", IntSimple,
+def ORI8 : DForm_4<24, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "ori $dst, $src1, $src2", IIC_IntSimple,
[(set i64:$dst, (or i64:$src1, immZExt16:$src2))]>;
-def ORIS8 : DForm_4<25, (outs g8rc:$dst), (ins g8rc:$src1, u16imm:$src2),
- "oris $dst, $src1, $src2", IntSimple,
+def ORIS8 : DForm_4<25, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "oris $dst, $src1, $src2", IIC_IntSimple,
[(set i64:$dst, (or i64:$src1, imm16ShiftedZExt:$src2))]>;
-def XORI8 : DForm_4<26, (outs g8rc:$dst), (ins g8rc:$src1, u16imm:$src2),
- "xori $dst, $src1, $src2", IntSimple,
+def XORI8 : DForm_4<26, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "xori $dst, $src1, $src2", IIC_IntSimple,
[(set i64:$dst, (xor i64:$src1, immZExt16:$src2))]>;
-def XORIS8 : DForm_4<27, (outs g8rc:$dst), (ins g8rc:$src1, u16imm:$src2),
- "xoris $dst, $src1, $src2", IntSimple,
+def XORIS8 : DForm_4<27, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "xoris $dst, $src1, $src2", IIC_IntSimple,
[(set i64:$dst, (xor i64:$src1, imm16ShiftedZExt:$src2))]>;
+let isCommutable = 1 in
defm ADD8 : XOForm_1r<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
- "add", "$rT, $rA, $rB", IntSimple,
+ "add", "$rT, $rA, $rB", IIC_IntSimple,
[(set i64:$rT, (add i64:$rA, i64:$rB))]>;
// ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the
// initial-exec thread-local storage model.
def ADD8TLS : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB),
- "add $rT, $rA, $rB", IntSimple,
+ "add $rT, $rA, $rB", IIC_IntSimple,
[(set i64:$rT, (add i64:$rA, tglobaltlsaddr:$rB))]>;
+let isCommutable = 1 in
defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
- "addc", "$rT, $rA, $rB", IntGeneral,
+ "addc", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i64:$rT, (addc i64:$rA, i64:$rB))]>,
PPC970_DGroup_Cracked;
+
let Defs = [CARRY] in
def ADDIC8 : DForm_2<12, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
- "addic $rD, $rA, $imm", IntGeneral,
+ "addic $rD, $rA, $imm", IIC_IntGeneral,
[(set i64:$rD, (addc i64:$rA, imm64SExt16:$imm))]>;
def ADDI8 : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s16imm64:$imm),
- "addi $rD, $rA, $imm", IntSimple,
+ "addi $rD, $rA, $imm", IIC_IntSimple,
[(set i64:$rD, (add i64:$rA, imm64SExt16:$imm))]>;
def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s17imm64:$imm),
- "addis $rD, $rA, $imm", IntSimple,
+ "addis $rD, $rA, $imm", IIC_IntSimple,
[(set i64:$rD, (add i64:$rA, imm16ShiftedSExt:$imm))]>;
let Defs = [CARRY] in {
def SUBFIC8: DForm_2< 8, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
- "subfic $rD, $rA, $imm", IntGeneral,
+ "subfic $rD, $rA, $imm", IIC_IntGeneral,
[(set i64:$rD, (subc imm64SExt16:$imm, i64:$rA))]>;
defm SUBFC8 : XOForm_1r<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
- "subfc", "$rT, $rA, $rB", IntGeneral,
+ "subfc", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i64:$rT, (subc i64:$rB, i64:$rA))]>,
PPC970_DGroup_Cracked;
}
defm SUBF8 : XOForm_1r<31, 40, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
- "subf", "$rT, $rA, $rB", IntGeneral,
+ "subf", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i64:$rT, (sub i64:$rB, i64:$rA))]>;
defm NEG8 : XOForm_3r<31, 104, 0, (outs g8rc:$rT), (ins g8rc:$rA),
- "neg", "$rT, $rA", IntSimple,
+ "neg", "$rT, $rA", IIC_IntSimple,
[(set i64:$rT, (ineg i64:$rA))]>;
let Uses = [CARRY] in {
+let isCommutable = 1 in
defm ADDE8 : XOForm_1rc<31, 138, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
- "adde", "$rT, $rA, $rB", IntGeneral,
+ "adde", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i64:$rT, (adde i64:$rA, i64:$rB))]>;
defm ADDME8 : XOForm_3rc<31, 234, 0, (outs g8rc:$rT), (ins g8rc:$rA),
- "addme", "$rT, $rA", IntGeneral,
+ "addme", "$rT, $rA", IIC_IntGeneral,
[(set i64:$rT, (adde i64:$rA, -1))]>;
defm ADDZE8 : XOForm_3rc<31, 202, 0, (outs g8rc:$rT), (ins g8rc:$rA),
- "addze", "$rT, $rA", IntGeneral,
+ "addze", "$rT, $rA", IIC_IntGeneral,
[(set i64:$rT, (adde i64:$rA, 0))]>;
defm SUBFE8 : XOForm_1rc<31, 136, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
- "subfe", "$rT, $rA, $rB", IntGeneral,
+ "subfe", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i64:$rT, (sube i64:$rB, i64:$rA))]>;
defm SUBFME8 : XOForm_3rc<31, 232, 0, (outs g8rc:$rT), (ins g8rc:$rA),
- "subfme", "$rT, $rA", IntGeneral,
+ "subfme", "$rT, $rA", IIC_IntGeneral,
[(set i64:$rT, (sube -1, i64:$rA))]>;
defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$rT), (ins g8rc:$rA),
- "subfze", "$rT, $rA", IntGeneral,
+ "subfze", "$rT, $rA", IIC_IntGeneral,
[(set i64:$rT, (sube 0, i64:$rA))]>;
}
+} // isCodeGenOnly
+// FIXME: Duplicating this for the asm parser should be unnecessary, but the
+// previous definition must be marked as CodeGen only to prevent decoding
+// conflicts.
+let isAsmParserOnly = 1 in
+def ADD8TLS_ : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB),
+ "add $rT, $rA, $rB", IIC_IntSimple, []>;
+let isCommutable = 1 in {
defm MULHD : XOForm_1r<31, 73, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
- "mulhd", "$rT, $rA, $rB", IntMulHW,
+ "mulhd", "$rT, $rA, $rB", IIC_IntMulHW,
[(set i64:$rT, (mulhs i64:$rA, i64:$rB))]>;
defm MULHDU : XOForm_1r<31, 9, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
- "mulhdu", "$rT, $rA, $rB", IntMulHWU,
+ "mulhdu", "$rT, $rA, $rB", IIC_IntMulHWU,
[(set i64:$rT, (mulhu i64:$rA, i64:$rB))]>;
+} // isCommutable
}
} // Interpretation64Bit
let isCompare = 1, neverHasSideEffects = 1 in {
def CMPD : XForm_16_ext<31, 0, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB),
- "cmpd $crD, $rA, $rB", IntCompare>, isPPC64;
+ "cmpd $crD, $rA, $rB", IIC_IntCompare>, isPPC64;
def CMPLD : XForm_16_ext<31, 32, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB),
- "cmpld $crD, $rA, $rB", IntCompare>, isPPC64;
- def CMPDI : DForm_5_ext<11, (outs crrc:$crD), (ins g8rc:$rA, s16imm:$imm),
- "cmpdi $crD, $rA, $imm", IntCompare>, isPPC64;
- def CMPLDI : DForm_6_ext<10, (outs crrc:$dst), (ins g8rc:$src1, u16imm:$src2),
- "cmpldi $dst, $src1, $src2", IntCompare>, isPPC64;
+ "cmpld $crD, $rA, $rB", IIC_IntCompare>, isPPC64;
+ def CMPDI : DForm_5_ext<11, (outs crrc:$crD), (ins g8rc:$rA, s16imm64:$imm),
+ "cmpdi $crD, $rA, $imm", IIC_IntCompare>, isPPC64;
+ def CMPLDI : DForm_6_ext<10, (outs crrc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+ "cmpldi $dst, $src1, $src2",
+ IIC_IntCompare>, isPPC64;
}
let neverHasSideEffects = 1 in {
defm SLD : XForm_6r<31, 27, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
- "sld", "$rA, $rS, $rB", IntRotateD,
+ "sld", "$rA, $rS, $rB", IIC_IntRotateD,
[(set i64:$rA, (PPCshl i64:$rS, i32:$rB))]>, isPPC64;
defm SRD : XForm_6r<31, 539, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
- "srd", "$rA, $rS, $rB", IntRotateD,
+ "srd", "$rA, $rS, $rB", IIC_IntRotateD,
[(set i64:$rA, (PPCsrl i64:$rS, i32:$rB))]>, isPPC64;
defm SRAD : XForm_6rc<31, 794, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
- "srad", "$rA, $rS, $rB", IntRotateD,
+ "srad", "$rA, $rS, $rB", IIC_IntRotateD,
[(set i64:$rA, (PPCsra i64:$rS, i32:$rB))]>, isPPC64;
-let Interpretation64Bit = 1 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
defm EXTSB8 : XForm_11r<31, 954, (outs g8rc:$rA), (ins g8rc:$rS),
- "extsb", "$rA, $rS", IntSimple,
+ "extsb", "$rA, $rS", IIC_IntSimple,
[(set i64:$rA, (sext_inreg i64:$rS, i8))]>;
defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS),
- "extsh", "$rA, $rS", IntSimple,
+ "extsh", "$rA, $rS", IIC_IntSimple,
[(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
} // Interpretation64Bit
// For fast-isel:
let isCodeGenOnly = 1 in {
def EXTSB8_32_64 : XForm_11<31, 954, (outs g8rc:$rA), (ins gprc:$rS),
- "extsb $rA, $rS", IntSimple, []>, isPPC64;
+ "extsb $rA, $rS", IIC_IntSimple, []>, isPPC64;
def EXTSH8_32_64 : XForm_11<31, 922, (outs g8rc:$rA), (ins gprc:$rS),
- "extsh $rA, $rS", IntSimple, []>, isPPC64;
+ "extsh $rA, $rS", IIC_IntSimple, []>, isPPC64;
} // isCodeGenOnly for fast-isel
defm EXTSW : XForm_11r<31, 986, (outs g8rc:$rA), (ins g8rc:$rS),
- "extsw", "$rA, $rS", IntSimple,
+ "extsw", "$rA, $rS", IIC_IntSimple,
[(set i64:$rA, (sext_inreg i64:$rS, i32))]>, isPPC64;
-let Interpretation64Bit = 1 in
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS),
- "extsw", "$rA, $rS", IntSimple,
+ "extsw", "$rA, $rS", IIC_IntSimple,
[(set i64:$rA, (sext i32:$rS))]>, isPPC64;
defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
- "sradi", "$rA, $rS, $SH", IntRotateDI,
+ "sradi", "$rA, $rS, $SH", IIC_IntRotateDI,
[(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
- "cntlzd", "$rA, $rS", IntGeneral,
+ "cntlzd", "$rA, $rS", IIC_IntGeneral,
[(set i64:$rA, (ctlz i64:$rS))]>;
def POPCNTD : XForm_11<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
- "popcntd $rA, $rS", IntGeneral,
+ "popcntd $rA, $rS", IIC_IntGeneral,
[(set i64:$rA, (ctpop i64:$rS))]>;
// popcntw also does a population count on the high 32 bits (storing the
// results in the high 32-bits of the output). We'll ignore that here (which is
// safe because we never separately use the high part of the 64-bit registers).
def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS),
- "popcntw $rA, $rS", IntGeneral,
+ "popcntw $rA, $rS", IIC_IntGeneral,
[(set i32:$rA, (ctpop i32:$rS))]>;
defm DIVD : XOForm_1r<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
- "divd", "$rT, $rA, $rB", IntDivD,
+ "divd", "$rT, $rA, $rB", IIC_IntDivD,
[(set i64:$rT, (sdiv i64:$rA, i64:$rB))]>, isPPC64,
PPC970_DGroup_First, PPC970_DGroup_Cracked;
defm DIVDU : XOForm_1r<31, 457, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
- "divdu", "$rT, $rA, $rB", IntDivD,
+ "divdu", "$rT, $rA, $rB", IIC_IntDivD,
[(set i64:$rT, (udiv i64:$rA, i64:$rB))]>, isPPC64,
PPC970_DGroup_First, PPC970_DGroup_Cracked;
+let isCommutable = 1 in
defm MULLD : XOForm_1r<31, 233, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
- "mulld", "$rT, $rA, $rB", IntMulHD,
+ "mulld", "$rT, $rA, $rB", IIC_IntMulHD,
[(set i64:$rT, (mul i64:$rA, i64:$rB))]>, isPPC64;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
- "mulli $rD, $rA, $imm", IntMulLI,
+ "mulli $rD, $rA, $imm", IIC_IntMulLI,
[(set i64:$rD, (mul i64:$rA, imm64SExt16:$imm))]>;
}
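
Regarding the popcntw note earlier in this hunk: the instruction counts bits in each 32-bit word independently, so reading only the low word of the result matches ctpop on an i32 held in the low 32 bits, and the extra high-word count can be ignored. A small standalone check in plain C++ (the popcount32/popcntw helpers are illustrative models, not LLVM or ISA API):

    #include <cassert>
    #include <cstdint>

    static unsigned popcount32(uint32_t V) {
      unsigned N = 0;
      for (; V; V &= V - 1) ++N;  // clear lowest set bit each iteration
      return N;
    }

    // Model of popcntw: per-word population counts written back per word.
    static uint64_t popcntw(uint64_t R) {
      uint64_t Lo = popcount32(static_cast<uint32_t>(R));
      uint64_t Hi = popcount32(static_cast<uint32_t>(R >> 32));
      return (Hi << 32) | Lo;
    }

    int main() {
      uint32_t V = 0xF0F01234u;
      // Low word of the popcntw result equals ctpop of the i32 value.
      assert(static_cast<uint32_t>(popcntw(V)) == popcount32(V));
      return 0;
    }
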
@@ -560,7 +601,7 @@ let neverHasSideEffects = 1 in {
let isCommutable = 1 in {
defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$rA),
(ins g8rc:$rSi, g8rc:$rS, u6imm:$SH, u6imm:$MBE),
- "rldimi", "$rA, $rS, $SH, $MBE", IntRotateDI,
+ "rldimi", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
[]>, isPPC64, RegConstraint<"$rSi = $rA">,
NoEncode<"$rSi">;
}
@@ -568,43 +609,53 @@ defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$rA),
// Rotate instructions.
defm RLDCL : MDSForm_1r<30, 8,
(outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE),
- "rldcl", "$rA, $rS, $rB, $MBE", IntRotateD,
+ "rldcl", "$rA, $rS, $rB, $MBE", IIC_IntRotateD,
[]>, isPPC64;
defm RLDCR : MDSForm_1r<30, 9,
(outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE),
- "rldcr", "$rA, $rS, $rB, $MBE", IntRotateD,
+ "rldcr", "$rA, $rS, $rB, $MBE", IIC_IntRotateD,
[]>, isPPC64;
defm RLDICL : MDForm_1r<30, 0,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
- "rldicl", "$rA, $rS, $SH, $MBE", IntRotateDI,
+ "rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
[]>, isPPC64;
// For fast-isel:
let isCodeGenOnly = 1 in
def RLDICL_32_64 : MDForm_1<30, 0,
(outs g8rc:$rA),
(ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
- "rldicl $rA, $rS, $SH, $MBE", IntRotateDI,
+ "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
[]>, isPPC64;
// End fast-isel.
defm RLDICR : MDForm_1r<30, 1,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
- "rldicr", "$rA, $rS, $SH, $MBE", IntRotateDI,
+ "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
[]>, isPPC64;
defm RLDIC : MDForm_1r<30, 2,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
- "rldic", "$rA, $rS, $SH, $MBE", IntRotateDI,
+ "rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
[]>, isPPC64;
-let Interpretation64Bit = 1 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
defm RLWINM8 : MForm_2r<21, (outs g8rc:$rA),
(ins g8rc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
- "rlwinm", "$rA, $rS, $SH, $MB, $ME", IntGeneral,
+ "rlwinm", "$rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
[]>;
+let isCommutable = 1 in {
+// RLWIMI can be commuted if the rotate amount is zero.
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA),
+ (ins g8rc:$rSi, g8rc:$rS, u5imm:$SH, u5imm:$MB,
+ u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
+ IIC_IntRotate, []>, PPC970_DGroup_Cracked,
+ RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
+}
+
let isSelect = 1 in
def ISEL8 : AForm_4<31, 15,
(outs g8rc:$rT), (ins g8rc_nox0:$rA, g8rc:$rB, crbitrc:$cond),
- "isel $rT, $rA, $rB, $cond", IntGeneral,
+ "isel $rT, $rA, $rB, $cond", IIC_IntGeneral,
[]>;
} // Interpretation64Bit
} // neverHasSideEffects = 1
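
The comment above notes that RLWIMI can be commuted when the rotate amount is zero. The underlying identity: with no rotation, inserting rS into rSi under mask M gives the same result as inserting rSi into rS under the complemented mask, which is what swapping the register operands and adjusting MB/ME amounts to. A standalone check in plain C++ (insertUnderMask is an illustrative helper, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // rlwimi with SH == 0: result = (Src & Mask) | (Dst & ~Mask).
    static uint32_t insertUnderMask(uint32_t Dst, uint32_t Src, uint32_t Mask) {
      return (Src & Mask) | (Dst & ~Mask);
    }

    int main() {
      uint32_t A = 0x12345678u, B = 0x9ABCDEF0u, M = 0x00FFFF00u;
      // Swapping the two register operands and complementing the mask yields
      // the same value, so the zero-rotate form is commutable.
      assert(insertUnderMask(A, B, M) == insertUnderMask(B, A, ~M));
      return 0;
    }
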
@@ -618,111 +669,111 @@ def ISEL8 : AForm_4<31, 15,
// Sign extending loads.
let canFoldAsLoad = 1, PPC970_Unit = 2 in {
-let Interpretation64Bit = 1 in
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src),
- "lha $rD, $src", LdStLHA,
+ "lha $rD, $src", IIC_LdStLHA,
[(set i64:$rD, (sextloadi16 iaddr:$src))]>,
PPC970_DGroup_Cracked;
def LWA : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src),
- "lwa $rD, $src", LdStLWA,
+ "lwa $rD, $src", IIC_LdStLWA,
[(set i64:$rD,
(aligned4sextloadi32 ixaddr:$src))]>, isPPC64,
PPC970_DGroup_Cracked;
-let Interpretation64Bit = 1 in
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
def LHAX8: XForm_1<31, 343, (outs g8rc:$rD), (ins memrr:$src),
- "lhax $rD, $src", LdStLHA,
+ "lhax $rD, $src", IIC_LdStLHA,
[(set i64:$rD, (sextloadi16 xaddr:$src))]>,
PPC970_DGroup_Cracked;
def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src),
- "lwax $rD, $src", LdStLHA,
+ "lwax $rD, $src", IIC_LdStLHA,
[(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
PPC970_DGroup_Cracked;
// For fast-isel:
let isCodeGenOnly = 1, mayLoad = 1 in {
def LWA_32 : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src),
- "lwa $rD, $src", LdStLWA, []>, isPPC64,
+ "lwa $rD, $src", IIC_LdStLWA, []>, isPPC64,
PPC970_DGroup_Cracked;
def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src),
- "lwax $rD, $src", LdStLHA, []>, isPPC64,
+ "lwax $rD, $src", IIC_LdStLHA, []>, isPPC64,
PPC970_DGroup_Cracked;
} // end fast-isel isCodeGenOnly
// Update forms.
let mayLoad = 1, neverHasSideEffects = 1 in {
-let Interpretation64Bit = 1 in
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
def LHAU8 : DForm_1<43, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
(ins memri:$addr),
- "lhau $rD, $addr", LdStLHAU,
+ "lhau $rD, $addr", IIC_LdStLHAU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
// NO LWAU!
-let Interpretation64Bit = 1 in
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
def LHAUX8 : XForm_1<31, 375, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
- "lhaux $rD, $addr", LdStLHAU,
+ "lhaux $rD, $addr", IIC_LdStLHAUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
def LWAUX : XForm_1<31, 373, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
- "lwaux $rD, $addr", LdStLHAU,
+ "lwaux $rD, $addr", IIC_LdStLHAUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">, isPPC64;
}
}
-let Interpretation64Bit = 1 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
// Zero extending loads.
let canFoldAsLoad = 1, PPC970_Unit = 2 in {
def LBZ8 : DForm_1<34, (outs g8rc:$rD), (ins memri:$src),
- "lbz $rD, $src", LdStLoad,
+ "lbz $rD, $src", IIC_LdStLoad,
[(set i64:$rD, (zextloadi8 iaddr:$src))]>;
def LHZ8 : DForm_1<40, (outs g8rc:$rD), (ins memri:$src),
- "lhz $rD, $src", LdStLoad,
+ "lhz $rD, $src", IIC_LdStLoad,
[(set i64:$rD, (zextloadi16 iaddr:$src))]>;
def LWZ8 : DForm_1<32, (outs g8rc:$rD), (ins memri:$src),
- "lwz $rD, $src", LdStLoad,
+ "lwz $rD, $src", IIC_LdStLoad,
[(set i64:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
def LBZX8 : XForm_1<31, 87, (outs g8rc:$rD), (ins memrr:$src),
- "lbzx $rD, $src", LdStLoad,
+ "lbzx $rD, $src", IIC_LdStLoad,
[(set i64:$rD, (zextloadi8 xaddr:$src))]>;
def LHZX8 : XForm_1<31, 279, (outs g8rc:$rD), (ins memrr:$src),
- "lhzx $rD, $src", LdStLoad,
+ "lhzx $rD, $src", IIC_LdStLoad,
[(set i64:$rD, (zextloadi16 xaddr:$src))]>;
def LWZX8 : XForm_1<31, 23, (outs g8rc:$rD), (ins memrr:$src),
- "lwzx $rD, $src", LdStLoad,
+ "lwzx $rD, $src", IIC_LdStLoad,
[(set i64:$rD, (zextloadi32 xaddr:$src))]>;
// Update forms.
let mayLoad = 1, neverHasSideEffects = 1 in {
def LBZU8 : DForm_1<35, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
- "lbzu $rD, $addr", LdStLoadUpd,
+ "lbzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LHZU8 : DForm_1<41, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
- "lhzu $rD, $addr", LdStLoadUpd,
+ "lhzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LWZU8 : DForm_1<33, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
- "lwzu $rD, $addr", LdStLoadUpd,
+ "lwzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LBZUX8 : XForm_1<31, 119, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
- "lbzux $rD, $addr", LdStLoadUpd,
+ "lbzux $rD, $addr", IIC_LdStLoadUpdX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
def LHZUX8 : XForm_1<31, 311, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
- "lhzux $rD, $addr", LdStLoadUpd,
+ "lhzux $rD, $addr", IIC_LdStLoadUpdX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
def LWZUX8 : XForm_1<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
- "lwzux $rD, $addr", LdStLoadUpd,
+ "lwzux $rD, $addr", IIC_LdStLoadUpdX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
}
@@ -733,7 +784,7 @@ def LWZUX8 : XForm_1<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
// Full 8-byte loads.
let canFoldAsLoad = 1, PPC970_Unit = 2 in {
def LD : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src),
- "ld $rD, $src", LdStLD,
+ "ld $rD, $src", IIC_LdStLD,
[(set i64:$rD, (aligned4load ixaddr:$src))]>, isPPC64;
// The following three definitions are selected for small code model only.
// Otherwise, we need to create two instructions to form a 32-bit offset,
@@ -754,30 +805,30 @@ def LDtocCPT: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
let hasSideEffects = 1, isCodeGenOnly = 1 in {
let RST = 2, DS = 2 in
def LDinto_toc: DSForm_1a<58, 0, (outs), (ins g8rc:$reg),
- "ld 2, 8($reg)", LdStLD,
+ "ld 2, 8($reg)", IIC_LdStLD,
[(PPCload_toc i64:$reg)]>, isPPC64;
let RST = 2, DS = 10, RA = 1 in
def LDtoc_restore : DSForm_1a<58, 0, (outs), (ins),
- "ld 2, 40(1)", LdStLD,
+ "ld 2, 40(1)", IIC_LdStLD,
[(PPCtoc_restore)]>, isPPC64;
}
def LDX : XForm_1<31, 21, (outs g8rc:$rD), (ins memrr:$src),
- "ldx $rD, $src", LdStLD,
+ "ldx $rD, $src", IIC_LdStLD,
[(set i64:$rD, (load xaddr:$src))]>, isPPC64;
def LDBRX : XForm_1<31, 532, (outs g8rc:$rD), (ins memrr:$src),
- "ldbrx $rD, $src", LdStLoad,
+ "ldbrx $rD, $src", IIC_LdStLoad,
[(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
let mayLoad = 1, neverHasSideEffects = 1 in {
def LDU : DSForm_1<58, 1, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memrix:$addr),
- "ldu $rD, $addr", LdStLDU,
+ "ldu $rD, $addr", IIC_LdStLDU,
[]>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
NoEncode<"$ea_result">;
def LDUX : XForm_1<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
- "ldux $rD, $addr", LdStLDU,
+ "ldux $rD, $addr", IIC_LdStLDUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">, isPPC64;
}
@@ -860,78 +911,79 @@ def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
isPPC64;
let PPC970_Unit = 2 in {
-let Interpretation64Bit = 1 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
// Truncating stores.
def STB8 : DForm_1<38, (outs), (ins g8rc:$rS, memri:$src),
- "stb $rS, $src", LdStStore,
+ "stb $rS, $src", IIC_LdStStore,
[(truncstorei8 i64:$rS, iaddr:$src)]>;
def STH8 : DForm_1<44, (outs), (ins g8rc:$rS, memri:$src),
- "sth $rS, $src", LdStStore,
+ "sth $rS, $src", IIC_LdStStore,
[(truncstorei16 i64:$rS, iaddr:$src)]>;
def STW8 : DForm_1<36, (outs), (ins g8rc:$rS, memri:$src),
- "stw $rS, $src", LdStStore,
+ "stw $rS, $src", IIC_LdStStore,
[(truncstorei32 i64:$rS, iaddr:$src)]>;
def STBX8 : XForm_8<31, 215, (outs), (ins g8rc:$rS, memrr:$dst),
- "stbx $rS, $dst", LdStStore,
+ "stbx $rS, $dst", IIC_LdStStore,
[(truncstorei8 i64:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
def STHX8 : XForm_8<31, 407, (outs), (ins g8rc:$rS, memrr:$dst),
- "sthx $rS, $dst", LdStStore,
+ "sthx $rS, $dst", IIC_LdStStore,
[(truncstorei16 i64:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
def STWX8 : XForm_8<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
- "stwx $rS, $dst", LdStStore,
+ "stwx $rS, $dst", IIC_LdStStore,
[(truncstorei32 i64:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
} // Interpretation64Bit
// Normal 8-byte stores.
def STD : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst),
- "std $rS, $dst", LdStSTD,
+ "std $rS, $dst", IIC_LdStSTD,
[(aligned4store i64:$rS, ixaddr:$dst)]>, isPPC64;
def STDX : XForm_8<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
- "stdx $rS, $dst", LdStSTD,
+ "stdx $rS, $dst", IIC_LdStSTD,
[(store i64:$rS, xaddr:$dst)]>, isPPC64,
PPC970_DGroup_Cracked;
def STDBRX: XForm_8<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
- "stdbrx $rS, $dst", LdStStore,
+ "stdbrx $rS, $dst", IIC_LdStStore,
[(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64,
PPC970_DGroup_Cracked;
}
// Stores with Update (pre-inc).
let PPC970_Unit = 2, mayStore = 1 in {
-let Interpretation64Bit = 1 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
- "stbu $rS, $dst", LdStStoreUpd, []>,
+ "stbu $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
- "sthu $rS, $dst", LdStStoreUpd, []>,
+ "sthu $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
- "stwu $rS, $dst", LdStStoreUpd, []>,
+ "stwu $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
-def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrix:$dst),
- "stdu $rS, $dst", LdStSTDU, []>,
- RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">,
- isPPC64;
def STBUX8: XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
- "stbux $rS, $dst", LdStStoreUpd, []>,
+ "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
def STHUX8: XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
- "sthux $rS, $dst", LdStStoreUpd, []>,
+ "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
def STWUX8: XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
- "stwux $rS, $dst", LdStStoreUpd, []>,
+ "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
} // Interpretation64Bit
+def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrix:$dst),
+ "stdu $rS, $dst", IIC_LdStSTDU, []>,
+ RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">,
+ isPPC64;
+
def STDUX : XForm_8<31, 181, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
- "stdux $rS, $dst", LdStSTDU, []>,
+ "stdux $rS, $dst", IIC_LdStSTDUX, []>,
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
PPC970_DGroup_Cracked, isPPC64;
}
@@ -966,29 +1018,29 @@ def : Pat<(pre_store i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
let PPC970_Unit = 3, neverHasSideEffects = 1,
Uses = [RM] in { // FPU Operations.
defm FCFID : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB),
- "fcfid", "$frD, $frB", FPGeneral,
+ "fcfid", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64;
defm FCTID : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB),
- "fctid", "$frD, $frB", FPGeneral,
+ "fctid", "$frD, $frB", IIC_FPGeneral,
[]>, isPPC64;
defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB),
- "fctidz", "$frD, $frB", FPGeneral,
+ "fctidz", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64;
defm FCFIDU : XForm_26r<63, 974, (outs f8rc:$frD), (ins f8rc:$frB),
- "fcfidu", "$frD, $frB", FPGeneral,
+ "fcfidu", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (PPCfcfidu f64:$frB))]>, isPPC64;
defm FCFIDS : XForm_26r<59, 846, (outs f4rc:$frD), (ins f8rc:$frB),
- "fcfids", "$frD, $frB", FPGeneral,
+ "fcfids", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (PPCfcfids f64:$frB))]>, isPPC64;
defm FCFIDUS : XForm_26r<59, 974, (outs f4rc:$frD), (ins f8rc:$frB),
- "fcfidus", "$frD, $frB", FPGeneral,
+ "fcfidus", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (PPCfcfidus f64:$frB))]>, isPPC64;
defm FCTIDUZ : XForm_26r<63, 943, (outs f8rc:$frD), (ins f8rc:$frB),
- "fctiduz", "$frD, $frB", FPGeneral,
+ "fctiduz", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (PPCfctiduz f64:$frB))]>, isPPC64;
defm FCTIWUZ : XForm_26r<63, 143, (outs f8rc:$frD), (ins f8rc:$frB),
- "fctiwuz", "$frD, $frB", FPGeneral,
+ "fctiwuz", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (PPCfctiwuz f64:$frB))]>, isPPC64;
}
@@ -1006,6 +1058,14 @@ def : Pat<(i64 (anyext i32:$in)),
def : Pat<(i32 (trunc i64:$in)),
(EXTRACT_SUBREG $in, sub_32)>;
+// Implement the 'not' operation with the NOR instruction.
+// (we could use the default xori pattern, but nor has lower latency on some
+// cores (such as the A2)).
+def i64not : OutPatFrag<(ops node:$in),
+ (NOR8 $in, $in)>;
+def : Pat<(not i64:$in),
+ (i64not $in)>;
+
// Extending loads with i64 targets.
def : Pat<(zextloadi1 iaddr:$src),
(LBZ8 iaddr:$src)>;
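
The i64not fragment above lowers i64 'not' to a single NOR8, relying on nor(x, x) == ~x; per the comment, nor is preferred over the xori pattern for latency on cores such as the A2. A one-assert standalone check of the identity in plain C++ (nor64 is an illustrative helper, not LLVM code):

    #include <cassert>
    #include <cstdint>

    static uint64_t nor64(uint64_t A, uint64_t B) { return ~(A | B); }

    int main() {
      for (uint64_t X : {0ull, 1ull, 0x00000000DEADBEEFull, ~0ull})
        assert(nor64(X, X) == ~X); // NOR8 $in, $in implements i64 'not'
      return 0;
    }
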
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index a55abe3..2fd4a3e 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -164,7 +164,7 @@ def vecspltisw : PatLeaf<(build_vector), [{
// VA1a_Int_Ty - A VAForm_1a intrinsic definition of specific type.
class VA1a_Int_Ty<bits<6> xo, string opc, Intrinsic IntID, ValueType Ty>
: VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
- !strconcat(opc, " $vD, $vA, $vB, $vC"), VecFP,
+ !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
[(set Ty:$vD, (IntID Ty:$vA, Ty:$vB, Ty:$vC))]>;
// VA1a_Int_Ty2 - A VAForm_1a intrinsic definition where the type of the
@@ -172,7 +172,7 @@ class VA1a_Int_Ty<bits<6> xo, string opc, Intrinsic IntID, ValueType Ty>
class VA1a_Int_Ty2<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
ValueType InTy>
: VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
- !strconcat(opc, " $vD, $vA, $vB, $vC"), VecFP,
+ !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
[(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB, InTy:$vC))]>;
// VA1a_Int_Ty3 - A VAForm_1a intrinsic definition where there are two
@@ -180,14 +180,14 @@ class VA1a_Int_Ty2<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
class VA1a_Int_Ty3<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
ValueType In1Ty, ValueType In2Ty>
: VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
- !strconcat(opc, " $vD, $vA, $vB, $vC"), VecFP,
+ !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
[(set OutTy:$vD,
(IntID In1Ty:$vA, In1Ty:$vB, In2Ty:$vC))]>;
// VX1_Int_Ty - A VXForm_1 intrinsic definition of specific type.
class VX1_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
: VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- !strconcat(opc, " $vD, $vA, $vB"), VecFP,
+ !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
[(set Ty:$vD, (IntID Ty:$vA, Ty:$vB))]>;
// VX1_Int_Ty2 - A VXForm_1 intrinsic definition where the type of the
@@ -195,7 +195,7 @@ class VX1_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
class VX1_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
ValueType InTy>
: VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- !strconcat(opc, " $vD, $vA, $vB"), VecFP,
+ !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
[(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB))]>;
// VX1_Int_Ty3 - A VXForm_1 intrinsic definition where there are two
@@ -203,13 +203,13 @@ class VX1_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
class VX1_Int_Ty3<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
ValueType In1Ty, ValueType In2Ty>
: VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- !strconcat(opc, " $vD, $vA, $vB"), VecFP,
+ !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
[(set OutTy:$vD, (IntID In1Ty:$vA, In2Ty:$vB))]>;
// VX2_Int_SP - A VXForm_2 intrinsic definition of vector single-precision type.
class VX2_Int_SP<bits<11> xo, string opc, Intrinsic IntID>
: VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB),
- !strconcat(opc, " $vD, $vB"), VecFP,
+ !strconcat(opc, " $vD, $vB"), IIC_VecFP,
[(set v4f32:$vD, (IntID v4f32:$vB))]>;
// VX2_Int_Ty2 - A VXForm_2 intrinsic definition where the type of the
@@ -217,7 +217,7 @@ class VX2_Int_SP<bits<11> xo, string opc, Intrinsic IntID>
class VX2_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
ValueType InTy>
: VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB),
- !strconcat(opc, " $vD, $vB"), VecFP,
+ !strconcat(opc, " $vD, $vB"), IIC_VecFP,
[(set OutTy:$vD, (IntID InTy:$vB))]>;
//===----------------------------------------------------------------------===//
@@ -229,116 +229,118 @@ let Predicates = [HasAltivec] in {
let isCodeGenOnly = 1 in {
def DSS : DSS_Form<822, (outs),
(ins u5imm:$ZERO0, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2),
- "dss $STRM", LdStLoad /*FIXME*/, []>,
+ "dss $STRM", IIC_LdStLoad /*FIXME*/, []>,
Deprecated<DeprecatedDST>;
def DSSALL : DSS_Form<822, (outs),
(ins u5imm:$ONE, u5imm:$ZERO0,u5imm:$ZERO1,u5imm:$ZERO2),
- "dssall", LdStLoad /*FIXME*/, []>,
+ "dssall", IIC_LdStLoad /*FIXME*/, []>,
Deprecated<DeprecatedDST>;
def DST : DSS_Form<342, (outs),
(ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB),
- "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
Deprecated<DeprecatedDST>;
def DSTT : DSS_Form<342, (outs),
(ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB),
- "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
Deprecated<DeprecatedDST>;
def DSTST : DSS_Form<374, (outs),
(ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB),
- "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
Deprecated<DeprecatedDST>;
def DSTSTT : DSS_Form<374, (outs),
(ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB),
- "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
Deprecated<DeprecatedDST>;
def DST64 : DSS_Form<342, (outs),
(ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB),
- "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
Deprecated<DeprecatedDST>;
def DSTT64 : DSS_Form<342, (outs),
(ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB),
- "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
Deprecated<DeprecatedDST>;
def DSTST64 : DSS_Form<374, (outs),
(ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB),
- "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
Deprecated<DeprecatedDST>;
def DSTSTT64 : DSS_Form<374, (outs),
(ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB),
- "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
Deprecated<DeprecatedDST>;
}
def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins),
- "mfvscr $vD", LdStStore,
+ "mfvscr $vD", IIC_LdStStore,
[(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>;
def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB),
- "mtvscr $vB", LdStLoad,
+ "mtvscr $vB", IIC_LdStLoad,
[(int_ppc_altivec_mtvscr v4i32:$vB)]>;
let canFoldAsLoad = 1, PPC970_Unit = 2 in { // Loads.
def LVEBX: XForm_1<31, 7, (outs vrrc:$vD), (ins memrr:$src),
- "lvebx $vD, $src", LdStLoad,
+ "lvebx $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
def LVEHX: XForm_1<31, 39, (outs vrrc:$vD), (ins memrr:$src),
- "lvehx $vD, $src", LdStLoad,
+ "lvehx $vD, $src", IIC_LdStLoad,
[(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>;
def LVEWX: XForm_1<31, 71, (outs vrrc:$vD), (ins memrr:$src),
- "lvewx $vD, $src", LdStLoad,
+ "lvewx $vD, $src", IIC_LdStLoad,
[(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>;
def LVX : XForm_1<31, 103, (outs vrrc:$vD), (ins memrr:$src),
- "lvx $vD, $src", LdStLoad,
+ "lvx $vD, $src", IIC_LdStLoad,
[(set v4i32:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>;
def LVXL : XForm_1<31, 359, (outs vrrc:$vD), (ins memrr:$src),
- "lvxl $vD, $src", LdStLoad,
+ "lvxl $vD, $src", IIC_LdStLoad,
[(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>;
}
def LVSL : XForm_1<31, 6, (outs vrrc:$vD), (ins memrr:$src),
- "lvsl $vD, $src", LdStLoad,
+ "lvsl $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>,
PPC970_Unit_LSU;
def LVSR : XForm_1<31, 38, (outs vrrc:$vD), (ins memrr:$src),
- "lvsr $vD, $src", LdStLoad,
+ "lvsr $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
PPC970_Unit_LSU;
let PPC970_Unit = 2 in { // Stores.
def STVEBX: XForm_8<31, 135, (outs), (ins vrrc:$rS, memrr:$dst),
- "stvebx $rS, $dst", LdStStore,
+ "stvebx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>;
def STVEHX: XForm_8<31, 167, (outs), (ins vrrc:$rS, memrr:$dst),
- "stvehx $rS, $dst", LdStStore,
+ "stvehx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>;
def STVEWX: XForm_8<31, 199, (outs), (ins vrrc:$rS, memrr:$dst),
- "stvewx $rS, $dst", LdStStore,
+ "stvewx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>;
def STVX : XForm_8<31, 231, (outs), (ins vrrc:$rS, memrr:$dst),
- "stvx $rS, $dst", LdStStore,
+ "stvx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvx v4i32:$rS, xoaddr:$dst)]>;
def STVXL : XForm_8<31, 487, (outs), (ins vrrc:$rS, memrr:$dst),
- "stvxl $rS, $dst", LdStStore,
+ "stvxl $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>;
}
let PPC970_Unit = 5 in { // VALU Operations.
// VA-Form instructions. 3-input AltiVec ops.
+let isCommutable = 1 in {
def VMADDFP : VAForm_1<46, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB),
- "vmaddfp $vD, $vA, $vC, $vB", VecFP,
+ "vmaddfp $vD, $vA, $vC, $vB", IIC_VecFP,
[(set v4f32:$vD,
(fma v4f32:$vA, v4f32:$vC, v4f32:$vB))]>;
// FIXME: The fma+fneg pattern won't match because fneg is not legal.
def VNMSUBFP: VAForm_1<47, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB),
- "vnmsubfp $vD, $vA, $vC, $vB", VecFP,
+ "vnmsubfp $vD, $vA, $vC, $vB", IIC_VecFP,
[(set v4f32:$vD, (fneg (fma v4f32:$vA, v4f32:$vC,
- (fneg v4f32:$vB))))]>;
+ (fneg v4f32:$vB))))]>;
def VMHADDSHS : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>;
def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs,
v8i16>;
def VMLADDUHM : VA1a_Int_Ty<34, "vmladduhm", int_ppc_altivec_vmladduhm, v8i16>;
+} // isCommutable
def VPERM : VA1a_Int_Ty3<43, "vperm", int_ppc_altivec_vperm,
v4i32, v4i32, v16i8>;
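
The VNMSUBFP pattern in the hunk above encodes the negative multiply-subtract as fneg(fma(vA, vC, fneg(vB))), i.e. -(a*c - b). A standalone scalar check in plain C++ using std::fma, with exactly representable values so the comparison is exact (not LLVM code; a single lane stands in for the v4f32 vector):

    #include <cassert>
    #include <cmath>

    int main() {
      float A = 1.5f, C = -2.25f, B = 4.0f;
      float Pattern  = -std::fma(A, C, -B); // fneg(fma(a, c, fneg(b)))
      float Expected = -(A * C - B);        // vnmsubfp: -(a*c - b)
      assert(Pattern == Expected);          // exact: all values representable
      return 0;
    }
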
@@ -346,23 +348,24 @@ def VSEL : VA1a_Int_Ty<42, "vsel", int_ppc_altivec_vsel, v4i32>;
// Shuffles.
def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u5imm:$SH),
- "vsldoi $vD, $vA, $vB, $SH", VecFP,
+ "vsldoi $vD, $vA, $vB, $SH", IIC_VecFP,
[(set v16i8:$vD,
(vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>;
// VX-Form instructions. AltiVec arithmetic ops.
+let isCommutable = 1 in {
def VADDFP : VXForm_1<10, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vaddfp $vD, $vA, $vB", VecFP,
+ "vaddfp $vD, $vA, $vB", IIC_VecFP,
[(set v4f32:$vD, (fadd v4f32:$vA, v4f32:$vB))]>;
def VADDUBM : VXForm_1<0, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vaddubm $vD, $vA, $vB", VecGeneral,
+ "vaddubm $vD, $vA, $vB", IIC_VecGeneral,
[(set v16i8:$vD, (add v16i8:$vA, v16i8:$vB))]>;
def VADDUHM : VXForm_1<64, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vadduhm $vD, $vA, $vB", VecGeneral,
+ "vadduhm $vD, $vA, $vB", IIC_VecGeneral,
[(set v8i16:$vD, (add v8i16:$vA, v8i16:$vB))]>;
def VADDUWM : VXForm_1<128, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vadduwm $vD, $vA, $vB", VecGeneral,
+ "vadduwm $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (add v4i32:$vA, v4i32:$vB))]>;
def VADDCUW : VX1_Int_Ty<384, "vaddcuw", int_ppc_altivec_vaddcuw, v4i32>;
@@ -372,30 +375,31 @@ def VADDSWS : VX1_Int_Ty<896, "vaddsws", int_ppc_altivec_vaddsws, v4i32>;
def VADDUBS : VX1_Int_Ty<512, "vaddubs", int_ppc_altivec_vaddubs, v16i8>;
def VADDUHS : VX1_Int_Ty<576, "vadduhs", int_ppc_altivec_vadduhs, v8i16>;
def VADDUWS : VX1_Int_Ty<640, "vadduws", int_ppc_altivec_vadduws, v4i32>;
-
-
+} // isCommutable
+
+let isCommutable = 1 in
def VAND : VXForm_1<1028, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vand $vD, $vA, $vB", VecFP,
+ "vand $vD, $vA, $vB", IIC_VecFP,
[(set v4i32:$vD, (and v4i32:$vA, v4i32:$vB))]>;
def VANDC : VXForm_1<1092, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vandc $vD, $vA, $vB", VecFP,
+ "vandc $vD, $vA, $vB", IIC_VecFP,
[(set v4i32:$vD, (and v4i32:$vA,
(vnot_ppc v4i32:$vB)))]>;
def VCFSX : VXForm_1<842, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
- "vcfsx $vD, $vB, $UIMM", VecFP,
+ "vcfsx $vD, $vB, $UIMM", IIC_VecFP,
[(set v4f32:$vD,
(int_ppc_altivec_vcfsx v4i32:$vB, imm:$UIMM))]>;
def VCFUX : VXForm_1<778, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
- "vcfux $vD, $vB, $UIMM", VecFP,
+ "vcfux $vD, $vB, $UIMM", IIC_VecFP,
[(set v4f32:$vD,
(int_ppc_altivec_vcfux v4i32:$vB, imm:$UIMM))]>;
def VCTSXS : VXForm_1<970, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
- "vctsxs $vD, $vB, $UIMM", VecFP,
+ "vctsxs $vD, $vB, $UIMM", IIC_VecFP,
[(set v4i32:$vD,
(int_ppc_altivec_vctsxs v4f32:$vB, imm:$UIMM))]>;
def VCTUXS : VXForm_1<906, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
- "vctuxs $vD, $vB, $UIMM", VecFP,
+ "vctuxs $vD, $vB, $UIMM", IIC_VecFP,
[(set v4i32:$vD,
(int_ppc_altivec_vctuxs v4f32:$vB, imm:$UIMM))]>;
@@ -404,25 +408,26 @@ def VCTUXS : VXForm_1<906, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
// to floating-point (sint_to_fp/uint_to_fp) conversions.
let isCodeGenOnly = 1, VA = 0 in {
def VCFSX_0 : VXForm_1<842, (outs vrrc:$vD), (ins vrrc:$vB),
- "vcfsx $vD, $vB, 0", VecFP,
+ "vcfsx $vD, $vB, 0", IIC_VecFP,
[(set v4f32:$vD,
(int_ppc_altivec_vcfsx v4i32:$vB, 0))]>;
def VCTUXS_0 : VXForm_1<906, (outs vrrc:$vD), (ins vrrc:$vB),
- "vctuxs $vD, $vB, 0", VecFP,
+ "vctuxs $vD, $vB, 0", IIC_VecFP,
[(set v4i32:$vD,
(int_ppc_altivec_vctuxs v4f32:$vB, 0))]>;
def VCFUX_0 : VXForm_1<778, (outs vrrc:$vD), (ins vrrc:$vB),
- "vcfux $vD, $vB, 0", VecFP,
+ "vcfux $vD, $vB, 0", IIC_VecFP,
[(set v4f32:$vD,
(int_ppc_altivec_vcfux v4i32:$vB, 0))]>;
def VCTSXS_0 : VXForm_1<970, (outs vrrc:$vD), (ins vrrc:$vB),
- "vctsxs $vD, $vB, 0", VecFP,
+ "vctsxs $vD, $vB, 0", IIC_VecFP,
[(set v4i32:$vD,
(int_ppc_altivec_vctsxs v4f32:$vB, 0))]>;
}
def VEXPTEFP : VX2_Int_SP<394, "vexptefp", int_ppc_altivec_vexptefp>;
def VLOGEFP : VX2_Int_SP<458, "vlogefp", int_ppc_altivec_vlogefp>;
+let isCommutable = 1 in {
def VAVGSB : VX1_Int_Ty<1282, "vavgsb", int_ppc_altivec_vavgsb, v16i8>;
def VAVGSH : VX1_Int_Ty<1346, "vavgsh", int_ppc_altivec_vavgsh, v8i16>;
def VAVGSW : VX1_Int_Ty<1410, "vavgsw", int_ppc_altivec_vavgsw, v4i32>;
@@ -444,24 +449,25 @@ def VMINSW : VX1_Int_Ty< 898, "vminsw", int_ppc_altivec_vminsw, v4i32>;
def VMINUB : VX1_Int_Ty< 514, "vminub", int_ppc_altivec_vminub, v16i8>;
def VMINUH : VX1_Int_Ty< 578, "vminuh", int_ppc_altivec_vminuh, v8i16>;
def VMINUW : VX1_Int_Ty< 642, "vminuw", int_ppc_altivec_vminuw, v4i32>;
+} // isCommutable
def VMRGHB : VXForm_1< 12, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vmrghb $vD, $vA, $vB", VecFP,
+ "vmrghb $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD, (vmrghb_shuffle v16i8:$vA, v16i8:$vB))]>;
def VMRGHH : VXForm_1< 76, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vmrghh $vD, $vA, $vB", VecFP,
+ "vmrghh $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD, (vmrghh_shuffle v16i8:$vA, v16i8:$vB))]>;
def VMRGHW : VXForm_1<140, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vmrghw $vD, $vA, $vB", VecFP,
+ "vmrghw $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD, (vmrghw_shuffle v16i8:$vA, v16i8:$vB))]>;
def VMRGLB : VXForm_1<268, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vmrglb $vD, $vA, $vB", VecFP,
+ "vmrglb $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD, (vmrglb_shuffle v16i8:$vA, v16i8:$vB))]>;
def VMRGLH : VXForm_1<332, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vmrglh $vD, $vA, $vB", VecFP,
+ "vmrglh $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD, (vmrglh_shuffle v16i8:$vA, v16i8:$vB))]>;
def VMRGLW : VXForm_1<396, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vmrglw $vD, $vA, $vB", VecFP,
+ "vmrglw $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD, (vmrglw_shuffle v16i8:$vA, v16i8:$vB))]>;
def VMSUMMBM : VA1a_Int_Ty3<37, "vmsummbm", int_ppc_altivec_vmsummbm,
@@ -477,6 +483,7 @@ def VMSUMUHM : VA1a_Int_Ty3<38, "vmsumuhm", int_ppc_altivec_vmsumuhm,
def VMSUMUHS : VA1a_Int_Ty3<39, "vmsumuhs", int_ppc_altivec_vmsumuhs,
v4i32, v8i16, v4i32>;
+let isCommutable = 1 in {
def VMULESB : VX1_Int_Ty2<776, "vmulesb", int_ppc_altivec_vmulesb,
v8i16, v16i8>;
def VMULESH : VX1_Int_Ty2<840, "vmulesh", int_ppc_altivec_vmulesh,
@@ -493,6 +500,7 @@ def VMULOUB : VX1_Int_Ty2< 8, "vmuloub", int_ppc_altivec_vmuloub,
v8i16, v16i8>;
def VMULOUH : VX1_Int_Ty2< 72, "vmulouh", int_ppc_altivec_vmulouh,
v4i32, v8i16>;
+} // isCommutable
def VREFP : VX2_Int_SP<266, "vrefp", int_ppc_altivec_vrefp>;
def VRFIM : VX2_Int_SP<714, "vrfim", int_ppc_altivec_vrfim>;
@@ -504,16 +512,16 @@ def VRSQRTEFP : VX2_Int_SP<330, "vrsqrtefp", int_ppc_altivec_vrsqrtefp>;
def VSUBCUW : VX1_Int_Ty<1408, "vsubcuw", int_ppc_altivec_vsubcuw, v4i32>;
def VSUBFP : VXForm_1<74, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsubfp $vD, $vA, $vB", VecGeneral,
+ "vsubfp $vD, $vA, $vB", IIC_VecGeneral,
[(set v4f32:$vD, (fsub v4f32:$vA, v4f32:$vB))]>;
def VSUBUBM : VXForm_1<1024, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsububm $vD, $vA, $vB", VecGeneral,
+ "vsububm $vD, $vA, $vB", IIC_VecGeneral,
[(set v16i8:$vD, (sub v16i8:$vA, v16i8:$vB))]>;
def VSUBUHM : VXForm_1<1088, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsubuhm $vD, $vA, $vB", VecGeneral,
+ "vsubuhm $vD, $vA, $vB", IIC_VecGeneral,
[(set v8i16:$vD, (sub v8i16:$vA, v8i16:$vB))]>;
def VSUBUWM : VXForm_1<1152, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsubuwm $vD, $vA, $vB", VecGeneral,
+ "vsubuwm $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (sub v4i32:$vA, v4i32:$vB))]>;
def VSUBSBS : VX1_Int_Ty<1792, "vsubsbs" , int_ppc_altivec_vsubsbs, v16i8>;
@@ -534,15 +542,17 @@ def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs,
v4i32, v16i8, v4i32>;
def VNOR : VXForm_1<1284, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vnor $vD, $vA, $vB", VecFP,
+ "vnor $vD, $vA, $vB", IIC_VecFP,
[(set v4i32:$vD, (vnot_ppc (or v4i32:$vA,
v4i32:$vB)))]>;
+let isCommutable = 1 in {
def VOR : VXForm_1<1156, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vor $vD, $vA, $vB", VecFP,
+ "vor $vD, $vA, $vB", IIC_VecFP,
[(set v4i32:$vD, (or v4i32:$vA, v4i32:$vB))]>;
def VXOR : VXForm_1<1220, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vxor $vD, $vA, $vB", VecFP,
+ "vxor $vD, $vA, $vB", IIC_VecFP,
[(set v4i32:$vD, (xor v4i32:$vA, v4i32:$vB))]>;
+} // isCommutable
def VRLB : VX1_Int_Ty< 4, "vrlb", int_ppc_altivec_vrlb, v16i8>;
def VRLH : VX1_Int_Ty< 68, "vrlh", int_ppc_altivec_vrlh, v8i16>;
@@ -556,15 +566,15 @@ def VSLH : VX1_Int_Ty< 324, "vslh", int_ppc_altivec_vslh, v8i16>;
def VSLW : VX1_Int_Ty< 388, "vslw", int_ppc_altivec_vslw, v4i32>;
def VSPLTB : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
- "vspltb $vD, $vB, $UIMM", VecPerm,
+ "vspltb $vD, $vB, $UIMM", IIC_VecPerm,
[(set v16i8:$vD,
(vspltb_shuffle:$UIMM v16i8:$vB, (undef)))]>;
def VSPLTH : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
- "vsplth $vD, $vB, $UIMM", VecPerm,
+ "vsplth $vD, $vB, $UIMM", IIC_VecPerm,
[(set v16i8:$vD,
(vsplth_shuffle:$UIMM v16i8:$vB, (undef)))]>;
def VSPLTW : VXForm_1<652, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
- "vspltw $vD, $vB, $UIMM", VecPerm,
+ "vspltw $vD, $vB, $UIMM", IIC_VecPerm,
[(set v16i8:$vD,
(vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>;
@@ -580,13 +590,13 @@ def VSRW : VX1_Int_Ty< 644, "vsrw" , int_ppc_altivec_vsrw , v4i32>;
def VSPLTISB : VXForm_3<780, (outs vrrc:$vD), (ins s5imm:$SIMM),
- "vspltisb $vD, $SIMM", VecPerm,
+ "vspltisb $vD, $SIMM", IIC_VecPerm,
[(set v16i8:$vD, (v16i8 vecspltisb:$SIMM))]>;
def VSPLTISH : VXForm_3<844, (outs vrrc:$vD), (ins s5imm:$SIMM),
- "vspltish $vD, $SIMM", VecPerm,
+ "vspltish $vD, $SIMM", IIC_VecPerm,
[(set v8i16:$vD, (v8i16 vecspltish:$SIMM))]>;
def VSPLTISW : VXForm_3<908, (outs vrrc:$vD), (ins s5imm:$SIMM),
- "vspltisw $vD, $SIMM", VecPerm,
+ "vspltisw $vD, $SIMM", IIC_VecPerm,
[(set v4i32:$vD, (v4i32 vecspltisw:$SIMM))]>;
// Vector Pack.
@@ -601,13 +611,13 @@ def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss,
def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus,
v8i16, v4i32>;
def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vpkuhum $vD, $vA, $vB", VecFP,
+ "vpkuhum $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD,
(vpkuhum_shuffle v16i8:$vA, v16i8:$vB))]>;
def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus,
v16i8, v8i16>;
def VPKUWUM : VXForm_1<78, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vpkuwum $vD, $vA, $vB", VecFP,
+ "vpkuwum $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD,
(vpkuwum_shuffle v16i8:$vA, v16i8:$vB))]>;
def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus,
@@ -631,10 +641,12 @@ def VUPKLSH : VX2_Int_Ty2<718, "vupklsh", int_ppc_altivec_vupklsh,
// Altivec Comparisons.
class VCMP<bits<10> xo, string asmstr, ValueType Ty>
- : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),asmstr,VecFPCompare,
+ : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
+ IIC_VecFPCompare,
[(set Ty:$vD, (Ty (PPCvcmp Ty:$vA, Ty:$vB, xo)))]>;
class VCMPo<bits<10> xo, string asmstr, ValueType Ty>
- : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),asmstr,VecFPCompare,
+ : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
+ IIC_VecFPCompare,
[(set Ty:$vD, (Ty (PPCvcmp_o Ty:$vA, Ty:$vB, xo)))]> {
let Defs = [CR6];
let RC = 1;
@@ -676,24 +688,24 @@ def VCMPGTUWo : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
let isCodeGenOnly = 1 in {
def V_SET0B : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
- "vxor $vD, $vD, $vD", VecFP,
+ "vxor $vD, $vD, $vD", IIC_VecFP,
[(set v16i8:$vD, (v16i8 immAllZerosV))]>;
def V_SET0H : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
- "vxor $vD, $vD, $vD", VecFP,
+ "vxor $vD, $vD, $vD", IIC_VecFP,
[(set v8i16:$vD, (v8i16 immAllZerosV))]>;
def V_SET0 : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
- "vxor $vD, $vD, $vD", VecFP,
+ "vxor $vD, $vD, $vD", IIC_VecFP,
[(set v4i32:$vD, (v4i32 immAllZerosV))]>;
let IMM=-1 in {
def V_SETALLONESB : VXForm_3<908, (outs vrrc:$vD), (ins),
- "vspltisw $vD, -1", VecFP,
+ "vspltisw $vD, -1", IIC_VecFP,
[(set v16i8:$vD, (v16i8 immAllOnesV))]>;
def V_SETALLONESH : VXForm_3<908, (outs vrrc:$vD), (ins),
- "vspltisw $vD, -1", VecFP,
+ "vspltisw $vD, -1", IIC_VecFP,
[(set v8i16:$vD, (v8i16 immAllOnesV))]>;
def V_SETALLONES : VXForm_3<908, (outs vrrc:$vD), (ins),
- "vspltisw $vD, -1", VecFP,
+ "vspltisw $vD, -1", IIC_VecFP,
[(set v4i32:$vD, (v4i32 immAllOnesV))]>;
}
}
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index 29233d4..7fed2c6 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -14,6 +14,8 @@
class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
: Instruction {
field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+ let Size = 4;
bit PPC64 = 0; // Default value, override with isPPC64
@@ -67,6 +69,8 @@ class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr,
InstrItinClass itin>
: Instruction {
field bits<64> Inst;
+ field bits<64> SoftFail = 0;
+ let Size = 8;
bit PPC64 = 0; // Default value, override with isPPC64
@@ -109,7 +113,7 @@ class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
// 1.7.2 B-Form
class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
- : I<opcode, OOL, IOL, asmstr, BrB> {
+ : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
bits<7> BIBO; // 2 bits of BI and 5 bits of BO.
bits<3> CR;
bits<14> BD;
@@ -135,7 +139,7 @@ class BForm_1<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL,
class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk,
dag OOL, dag IOL, string asmstr>
- : I<opcode, OOL, IOL, asmstr, BrB> {
+ : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
bits<14> BD;
let Inst{6-10} = bo;
@@ -147,7 +151,7 @@ class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk,
class BForm_3<bits<6> opcode, bit aa, bit lk,
dag OOL, dag IOL, string asmstr>
- : I<opcode, OOL, IOL, asmstr, BrB> {
+ : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
bits<5> BO;
bits<5> BI;
bits<14> BD;
@@ -159,6 +163,19 @@ class BForm_3<bits<6> opcode, bit aa, bit lk,
let Inst{31} = lk;
}
+class BForm_4<bits<6> opcode, bits<5> bo, bit aa, bit lk,
+ dag OOL, dag IOL, string asmstr>
+ : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+ bits<5> BI;
+ bits<14> BD;
+
+ let Inst{6-10} = bo;
+ let Inst{11-15} = BI;
+ let Inst{16-29} = BD;
+ let Inst{30} = aa;
+ let Inst{31} = lk;
+}
+
// 1.7.3 SC-Form
class SCForm<bits<6> opcode, bits<1> xo,
dag OOL, dag IOL, string asmstr, InstrItinClass itin,
@@ -258,6 +275,15 @@ class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr,
let Addr = 0;
}
+class DForm_4_fixedreg_zero<bits<6> opcode, bits<5> R, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : DForm_4<opcode, OOL, IOL, asmstr, itin, pattern> {
+ let A = R;
+ let B = R;
+ let C = 0;
+}
+
class IForm_and_DForm_1<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2,
dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
@@ -567,6 +593,173 @@ class XForm_16b<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let A = 0;
}
+// XX*-Form (VSX)
+class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<5> A;
+ bits<5> B;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = A;
+ let Inst{16-20} = B;
+ let Inst{21-30} = xo;
+ let Inst{31} = XT{5};
+}
+
+class XX2Form<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<6> XB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = 0;
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX2Form_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> CR;
+ bits<6> XB;
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = CR;
+ let Inst{9-15} = 0;
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = 0;
+}
+
+class XX2Form_2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<6> XB;
+ bits<2> D;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-13} = 0;
+ let Inst{14-15} = D;
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<6> XA;
+ bits<6> XB;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = XA{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-28} = xo;
+ let Inst{29} = XA{5};
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> CR;
+ bits<6> XA;
+ bits<6> XB;
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = CR;
+ let Inst{9-10} = 0;
+ let Inst{11-15} = XA{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-28} = xo;
+ let Inst{29} = XA{5};
+ let Inst{30} = XB{5};
+ let Inst{31} = 0;
+}
+
+class XX3Form_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<6> XA;
+ bits<6> XB;
+ bits<2> D;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = XA{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21} = 0;
+ let Inst{22-23} = D;
+ let Inst{24-28} = xo;
+ let Inst{29} = XA{5};
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX3Form_Rc<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<6> XA;
+ bits<6> XB;
+
+ let Pattern = pattern;
+
+ bit RC = 0; // set by isDOT
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = XA{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21} = RC;
+ let Inst{22-28} = xo;
+ let Inst{29} = XA{5};
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
+class XX4Form<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<6> XA;
+ bits<6> XB;
+ bits<6> XC;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = XA{4-0};
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-25} = XC{4-0};
+ let Inst{26-27} = xo;
+ let Inst{28} = XC{5};
+ let Inst{29} = XA{5};
+ let Inst{30} = XB{5};
+ let Inst{31} = XT{5};
+}
+
// DCB_Form - Form X instruction, used for dcb* instructions.
class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
@@ -664,6 +857,12 @@ class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk,
let BH = 0;
}
+class XLForm_2_br2<bits<6> opcode, bits<10> xo, bits<5> bo, bit lk,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+ : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+ let BO = bo;
+ let BH = 0;
+}
class XLForm_2_ext<bits<6> opcode, bits<10> xo, bits<5> bo, bits<5> bi, bit lk,
dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
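
The XX*-Form classes added to PPCInstrFormats.td above all split a 6-bit VSX register number between a 5-bit primary field and a single extension bit placed in the low instruction bits. A minimal standalone sketch of that packing for XX3Form, assuming PPC's MSB-first bit numbering; the helper and the opcode/xo values in main are illustrative placeholders, not taken from the patch:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical helper, not part of the patch: pack an XX3-form word the way
    // the XX3Form class above lays it out.  TableGen's Inst{i} is the i-th bit
    // counted from the most-significant end, i.e. bit (31 - i) of the word.
    static uint32_t encodeXX3(unsigned opcode, unsigned xo,
                              unsigned XT, unsigned XA, unsigned XB) {
      uint32_t Inst = 0;
      auto put = [&](unsigned hi, unsigned lo, uint32_t val) {
        unsigned width = lo - hi + 1;            // Inst{hi-lo}, hi is the MSB end
        Inst |= (val & ((1u << width) - 1)) << (31 - lo);
      };
      put(0, 5, opcode);            // Inst{0-5}   = opcode (from class I)
      put(6, 10, XT & 31);          // Inst{6-10}  = XT{4-0}
      put(11, 15, XA & 31);         // Inst{11-15} = XA{4-0}
      put(16, 20, XB & 31);         // Inst{16-20} = XB{4-0}
      put(21, 28, xo);              // Inst{21-28} = xo
      put(29, 29, (XA >> 5) & 1);   // Inst{29}    = XA{5}
      put(30, 30, (XB >> 5) & 1);   // Inst{30}    = XB{5}
      put(31, 31, (XT >> 5) & 1);   // Inst{31}    = XT{5}
      return Inst;
    }

    int main() {
      // e.g. a three-operand VSX op on registers vs60, vs61, vs62
      // (opcode/xo values are placeholders, not a real instruction claim).
      printf("0x%08x\n", encodeXX3(60, 146, 60, 61, 62));
      return 0;
    }
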
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 315ad04..939bbdc 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -18,16 +18,19 @@
#include "PPCInstrBuilder.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCTargetMachine.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -45,6 +48,13 @@ opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
static cl::opt<bool> DisableCmpOpt("disable-ppc-cmp-opt",
cl::desc("Disable compare instruction optimization"), cl::Hidden);
+static cl::opt<bool> DisableVSXFMAMutate("disable-ppc-vsx-fma-mutation",
+cl::desc("Disable VSX FMA instruction mutation"), cl::Hidden);
+
+static cl::opt<bool> VSXSelfCopyCrash("crash-on-ppc-vsx-self-copy",
+cl::desc("Causes the backend to crash instead of generating a nop VSX copy"),
+cl::Hidden);
+
// Pin the vtable to this file.
void PPCInstrInfo::anchor() {}
@@ -61,7 +71,7 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer(
if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) {
const InstrItineraryData *II = TM->getInstrItineraryData();
- return new PPCScoreboardHazardRecognizer(II, DAG);
+ return new ScoreboardHazardRecognizer(II, DAG);
}
return TargetInstrInfo::CreateTargetHazardRecognizer(TM, DAG);
@@ -74,6 +84,9 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
const ScheduleDAG *DAG) const {
unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+ if (Directive == PPC::DIR_PWR7)
+ return new PPCDispatchGroupSBHazardRecognizer(II, DAG);
+
// Most subtargets use a PPC970 recognizer.
if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
@@ -82,7 +95,57 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
return new PPCHazardRecognizer970(TM);
}
- return new PPCScoreboardHazardRecognizer(II, DAG);
+ return new ScoreboardHazardRecognizer(II, DAG);
+}
+
+
+int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI,
+ unsigned UseIdx) const {
+ int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx,
+ UseMI, UseIdx);
+
+ const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
+ unsigned Reg = DefMO.getReg();
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ bool IsRegCR;
+ if (TRI->isVirtualRegister(Reg)) {
+ const MachineRegisterInfo *MRI =
+ &DefMI->getParent()->getParent()->getRegInfo();
+ IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) ||
+ MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRBITRCRegClass);
+ } else {
+ IsRegCR = PPC::CRRCRegClass.contains(Reg) ||
+ PPC::CRBITRCRegClass.contains(Reg);
+ }
+
+ if (UseMI->isBranch() && IsRegCR) {
+ if (Latency < 0)
+ Latency = getInstrLatency(ItinData, DefMI);
+
+ // On some cores, there is an additional delay between writing to a condition
+ // register, and using it from a branch.
+ unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+ switch (Directive) {
+ default: break;
+ case PPC::DIR_7400:
+ case PPC::DIR_750:
+ case PPC::DIR_970:
+ case PPC::DIR_E5500:
+ case PPC::DIR_PWR4:
+ case PPC::DIR_PWR5:
+ case PPC::DIR_PWR5X:
+ case PPC::DIR_PWR6:
+ case PPC::DIR_PWR6X:
+ case PPC::DIR_PWR7:
+ Latency += 2;
+ break;
+ }
+ }
+
+ return Latency;
}
// Detect 32 -> 64-bit extensions where we may reuse the low sub-register.
@@ -110,7 +173,9 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
case PPC::LFS:
case PPC::LFD:
case PPC::RESTORE_CR:
+ case PPC::RESTORE_CRBIT:
case PPC::LVX:
+ case PPC::LXVD2X:
case PPC::RESTORE_VRSAVE:
// Check for the operands added by addFrameReference (the immediate is the
// offset which defaults to 0).
@@ -134,7 +199,9 @@ unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
case PPC::STFS:
case PPC::STFD:
case PPC::SPILL_CR:
+ case PPC::SPILL_CRBIT:
case PPC::STVX:
+ case PPC::STXVD2X:
case PPC::SPILL_VRSAVE:
// Check for the operands added by addFrameReference (the immediate is the
// offset which defaults to 0).
@@ -156,7 +223,9 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
// Normal instructions can be commuted the obvious way.
if (MI->getOpcode() != PPC::RLWIMI &&
- MI->getOpcode() != PPC::RLWIMIo)
+ MI->getOpcode() != PPC::RLWIMIo &&
+ MI->getOpcode() != PPC::RLWIMI8 &&
+ MI->getOpcode() != PPC::RLWIMI8o)
return TargetInstrInfo::commuteInstruction(MI, NewMI);
// Cannot commute if it has a non-zero rotate count.
@@ -174,6 +243,8 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
unsigned Reg0 = MI->getOperand(0).getReg();
unsigned Reg1 = MI->getOperand(1).getReg();
unsigned Reg2 = MI->getOperand(2).getReg();
+ unsigned SubReg1 = MI->getOperand(1).getSubReg();
+ unsigned SubReg2 = MI->getOperand(2).getSubReg();
bool Reg1IsKill = MI->getOperand(1).isKill();
bool Reg2IsKill = MI->getOperand(2).isKill();
bool ChangeReg0 = false;
@@ -183,6 +254,7 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
// Must be two address instruction!
assert(MI->getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
"Expecting a two-address instruction!");
+ assert(MI->getOperand(0).getSubReg() == SubReg1 && "Tied subreg mismatch");
Reg2IsKill = false;
ChangeReg0 = true;
}
@@ -203,10 +275,14 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
.addImm((MB-1) & 31);
}
- if (ChangeReg0)
+ if (ChangeReg0) {
MI->getOperand(0).setReg(Reg2);
+ MI->getOperand(0).setSubReg(SubReg2);
+ }
MI->getOperand(2).setReg(Reg1);
MI->getOperand(1).setReg(Reg2);
+ MI->getOperand(2).setSubReg(SubReg1);
+ MI->getOperand(1).setSubReg(SubReg2);
MI->getOperand(2).setIsKill(Reg1IsKill);
MI->getOperand(1).setIsKill(Reg2IsKill);
@@ -216,13 +292,37 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
return MI;
}
+bool PPCInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ // For VSX A-Type FMA instructions, it is the first two operands that can be
+ // commuted, however, because the non-encoded tied input operand is listed
+ // first, the operands to swap are actually the second and third.
+
+ int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
+ if (AltOpc == -1)
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+
+ SrcOpIdx1 = 2;
+ SrcOpIdx2 = 3;
+ return true;
+}
+
void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {
+ // This function is used for scheduling, and the nop wanted here is the type
+ // that terminates dispatch groups on the POWER cores.
+ unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+ unsigned Opcode;
+ switch (Directive) {
+ default: Opcode = PPC::NOP; break;
+ case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break;
+ case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break;
+ }
+
DebugLoc DL;
- BuildMI(MBB, MI, DL, get(PPC::NOP));
+ BuildMI(MBB, MI, DL, get(Opcode));
}
-
// Branch analysis.
// Note: If the condition register is set to CTR or CTR8 then this is a
// BDNZ (imm == 1) or BDZ (imm == 0) branch.
@@ -263,6 +363,22 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
Cond.push_back(LastInst->getOperand(0));
Cond.push_back(LastInst->getOperand(1));
return false;
+ } else if (LastInst->getOpcode() == PPC::BC) {
+ if (!LastInst->getOperand(1).isMBB())
+ return true;
+ // Block ends with fall-through condbranch.
+ TBB = LastInst->getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+ Cond.push_back(LastInst->getOperand(0));
+ return false;
+ } else if (LastInst->getOpcode() == PPC::BCn) {
+ if (!LastInst->getOperand(1).isMBB())
+ return true;
+ // Block ends with fall-through condbranch.
+ TBB = LastInst->getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET));
+ Cond.push_back(LastInst->getOperand(0));
+ return false;
} else if (LastInst->getOpcode() == PPC::BDNZ8 ||
LastInst->getOpcode() == PPC::BDNZ) {
if (!LastInst->getOperand(0).isMBB())
@@ -310,6 +426,26 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
Cond.push_back(SecondLastInst->getOperand(1));
FBB = LastInst->getOperand(0).getMBB();
return false;
+ } else if (SecondLastInst->getOpcode() == PPC::BC &&
+ LastInst->getOpcode() == PPC::B) {
+ if (!SecondLastInst->getOperand(1).isMBB() ||
+ !LastInst->getOperand(0).isMBB())
+ return true;
+ TBB = SecondLastInst->getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+ Cond.push_back(SecondLastInst->getOperand(0));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ } else if (SecondLastInst->getOpcode() == PPC::BCn &&
+ LastInst->getOpcode() == PPC::B) {
+ if (!SecondLastInst->getOperand(1).isMBB() ||
+ !LastInst->getOperand(0).isMBB())
+ return true;
+ TBB = SecondLastInst->getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET));
+ Cond.push_back(SecondLastInst->getOperand(0));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
} else if ((SecondLastInst->getOpcode() == PPC::BDNZ8 ||
SecondLastInst->getOpcode() == PPC::BDNZ) &&
LastInst->getOpcode() == PPC::B) {
@@ -367,6 +503,7 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
--I;
}
if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC &&
+ I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn &&
I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ)
return 0;
@@ -379,6 +516,7 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
if (I == MBB.begin()) return 1;
--I;
if (I->getOpcode() != PPC::BCC &&
+ I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn &&
I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ)
return 1;
@@ -408,9 +546,13 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
BuildMI(&MBB, DL, get(Cond[0].getImm() ?
(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
(isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB);
+ else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
+ BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB);
+ else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
+ BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB);
else // Conditional branch
BuildMI(&MBB, DL, get(PPC::BCC))
- .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+ .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB);
return 1;
}
@@ -419,9 +561,13 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
BuildMI(&MBB, DL, get(Cond[0].getImm() ?
(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
(isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB);
+ else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
+ BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB);
+ else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
+ BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB);
else
BuildMI(&MBB, DL, get(PPC::BCC))
- .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+ .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB);
BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB);
return 2;
}
@@ -506,6 +652,8 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
case PPC::PRED_LE: SubIdx = PPC::sub_gt; SwapOps = true; break;
case PPC::PRED_UN: SubIdx = PPC::sub_un; SwapOps = false; break;
case PPC::PRED_NU: SubIdx = PPC::sub_un; SwapOps = true; break;
+ case PPC::PRED_BIT_SET: SubIdx = 0; SwapOps = false; break;
+ case PPC::PRED_BIT_UNSET: SubIdx = 0; SwapOps = true; break;
}
unsigned FirstReg = SwapOps ? FalseReg : TrueReg,
@@ -534,6 +682,47 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
+ // We can end up with self copies and similar things as a result of VSX copy
+ // legalization. Promote them here.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ if (PPC::F8RCRegClass.contains(DestReg) &&
+ PPC::VSLRCRegClass.contains(SrcReg)) {
+ unsigned SuperReg =
+ TRI->getMatchingSuperReg(DestReg, PPC::sub_64, &PPC::VSRCRegClass);
+
+ if (VSXSelfCopyCrash && SrcReg == SuperReg)
+ llvm_unreachable("nop VSX copy");
+
+ DestReg = SuperReg;
+ } else if (PPC::VRRCRegClass.contains(DestReg) &&
+ PPC::VSHRCRegClass.contains(SrcReg)) {
+ unsigned SuperReg =
+ TRI->getMatchingSuperReg(DestReg, PPC::sub_128, &PPC::VSRCRegClass);
+
+ if (VSXSelfCopyCrash && SrcReg == SuperReg)
+ llvm_unreachable("nop VSX copy");
+
+ DestReg = SuperReg;
+ } else if (PPC::F8RCRegClass.contains(SrcReg) &&
+ PPC::VSLRCRegClass.contains(DestReg)) {
+ unsigned SuperReg =
+ TRI->getMatchingSuperReg(SrcReg, PPC::sub_64, &PPC::VSRCRegClass);
+
+ if (VSXSelfCopyCrash && DestReg == SuperReg)
+ llvm_unreachable("nop VSX copy");
+
+ SrcReg = SuperReg;
+ } else if (PPC::VRRCRegClass.contains(SrcReg) &&
+ PPC::VSHRCRegClass.contains(DestReg)) {
+ unsigned SuperReg =
+ TRI->getMatchingSuperReg(SrcReg, PPC::sub_128, &PPC::VSRCRegClass);
+
+ if (VSXSelfCopyCrash && DestReg == SuperReg)
+ llvm_unreachable("nop VSX copy");
+
+ SrcReg = SuperReg;
+ }
+
unsigned Opc;
if (PPC::GPRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::OR;
@@ -545,6 +734,18 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = PPC::MCRF;
else if (PPC::VRRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::VOR;
+ else if (PPC::VSRCRegClass.contains(DestReg, SrcReg))
+ // There are two different ways this can be done:
+ // 1. xxlor : This has lower latency (on the P7), 2 cycles, but can only
+ // issue in VSU pipeline 0.
+ // 2. xmovdp/xmovsp: This has higher latency (on the P7), 6 cycles, but
+ // can go to either pipeline.
+ // We'll always use xxlor here, because in practically all cases where
+ // copies are generated, they are close enough to some use that the
+ // lower-latency form is preferable.
+ Opc = PPC::XXLOR;
+ else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg))
+ Opc = PPC::XXLORf;
else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::CROR;
else
@@ -570,12 +771,14 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
// update isStoreToStackSlot.
DebugLoc DL;
- if (PPC::GPRCRegClass.hasSubClassEq(RC)) {
+ if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+ PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
.addReg(SrcReg,
getKillRegState(isKill)),
FrameIdx));
- } else if (PPC::G8RCRegClass.hasSubClassEq(RC)) {
+ } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+ PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
.addReg(SrcReg,
getKillRegState(isKill)),
@@ -597,45 +800,29 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
FrameIdx));
return true;
} else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
- // FIXME: We use CRi here because there is no mtcrf on a bit. Since the
- // backend currently only uses CR1EQ as an individual bit, this should
- // not cause any bug. If we need other uses of CR bits, the following
- // code may be invalid.
- unsigned Reg = 0;
- if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR0GT ||
- SrcReg == PPC::CR0EQ || SrcReg == PPC::CR0UN)
- Reg = PPC::CR0;
- else if (SrcReg == PPC::CR1LT || SrcReg == PPC::CR1GT ||
- SrcReg == PPC::CR1EQ || SrcReg == PPC::CR1UN)
- Reg = PPC::CR1;
- else if (SrcReg == PPC::CR2LT || SrcReg == PPC::CR2GT ||
- SrcReg == PPC::CR2EQ || SrcReg == PPC::CR2UN)
- Reg = PPC::CR2;
- else if (SrcReg == PPC::CR3LT || SrcReg == PPC::CR3GT ||
- SrcReg == PPC::CR3EQ || SrcReg == PPC::CR3UN)
- Reg = PPC::CR3;
- else if (SrcReg == PPC::CR4LT || SrcReg == PPC::CR4GT ||
- SrcReg == PPC::CR4EQ || SrcReg == PPC::CR4UN)
- Reg = PPC::CR4;
- else if (SrcReg == PPC::CR5LT || SrcReg == PPC::CR5GT ||
- SrcReg == PPC::CR5EQ || SrcReg == PPC::CR5UN)
- Reg = PPC::CR5;
- else if (SrcReg == PPC::CR6LT || SrcReg == PPC::CR6GT ||
- SrcReg == PPC::CR6EQ || SrcReg == PPC::CR6UN)
- Reg = PPC::CR6;
- else if (SrcReg == PPC::CR7LT || SrcReg == PPC::CR7GT ||
- SrcReg == PPC::CR7EQ || SrcReg == PPC::CR7UN)
- Reg = PPC::CR7;
-
- return StoreRegToStackSlot(MF, Reg, isKill, FrameIdx,
- &PPC::CRRCRegClass, NewMIs, NonRI, SpillsVRS);
-
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CRBIT))
+ .addReg(SrcReg,
+ getKillRegState(isKill)),
+ FrameIdx));
+ return true;
} else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STVX))
.addReg(SrcReg,
getKillRegState(isKill)),
FrameIdx));
NonRI = true;
+ } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STXVD2X))
+ .addReg(SrcReg,
+ getKillRegState(isKill)),
+ FrameIdx));
+ NonRI = true;
+ } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STXSDX))
+ .addReg(SrcReg,
+ getKillRegState(isKill)),
+ FrameIdx));
+ NonRI = true;
} else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
assert(TM.getSubtargetImpl()->isDarwin() &&
"VRSAVE only needs spill/restore on Darwin");
@@ -695,10 +882,12 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
// Note: If additional load instructions are added here,
// update isLoadFromStackSlot.
- if (PPC::GPRCRegClass.hasSubClassEq(RC)) {
+ if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+ PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
DestReg), FrameIdx));
- } else if (PPC::G8RCRegClass.hasSubClassEq(RC)) {
+ } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+ PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg),
FrameIdx));
} else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
@@ -713,40 +902,22 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
FrameIdx));
return true;
} else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
-
- unsigned Reg = 0;
- if (DestReg == PPC::CR0LT || DestReg == PPC::CR0GT ||
- DestReg == PPC::CR0EQ || DestReg == PPC::CR0UN)
- Reg = PPC::CR0;
- else if (DestReg == PPC::CR1LT || DestReg == PPC::CR1GT ||
- DestReg == PPC::CR1EQ || DestReg == PPC::CR1UN)
- Reg = PPC::CR1;
- else if (DestReg == PPC::CR2LT || DestReg == PPC::CR2GT ||
- DestReg == PPC::CR2EQ || DestReg == PPC::CR2UN)
- Reg = PPC::CR2;
- else if (DestReg == PPC::CR3LT || DestReg == PPC::CR3GT ||
- DestReg == PPC::CR3EQ || DestReg == PPC::CR3UN)
- Reg = PPC::CR3;
- else if (DestReg == PPC::CR4LT || DestReg == PPC::CR4GT ||
- DestReg == PPC::CR4EQ || DestReg == PPC::CR4UN)
- Reg = PPC::CR4;
- else if (DestReg == PPC::CR5LT || DestReg == PPC::CR5GT ||
- DestReg == PPC::CR5EQ || DestReg == PPC::CR5UN)
- Reg = PPC::CR5;
- else if (DestReg == PPC::CR6LT || DestReg == PPC::CR6GT ||
- DestReg == PPC::CR6EQ || DestReg == PPC::CR6UN)
- Reg = PPC::CR6;
- else if (DestReg == PPC::CR7LT || DestReg == PPC::CR7GT ||
- DestReg == PPC::CR7EQ || DestReg == PPC::CR7UN)
- Reg = PPC::CR7;
-
- return LoadRegFromStackSlot(MF, DL, Reg, FrameIdx,
- &PPC::CRRCRegClass, NewMIs, NonRI, SpillsVRS);
-
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
+ get(PPC::RESTORE_CRBIT), DestReg),
+ FrameIdx));
+ return true;
} else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LVX), DestReg),
FrameIdx));
NonRI = true;
+ } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LXVD2X), DestReg),
+ FrameIdx));
+ NonRI = true;
+ } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LXSDX), DestReg),
+ FrameIdx));
+ NonRI = true;
} else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
assert(TM.getSubtargetImpl()->isDarwin() &&
"VRSAVE only needs spill/restore on Darwin");
@@ -933,9 +1104,17 @@ bool PPCInstrInfo::PredicateInstruction(
MI->setDesc(get(Pred[0].getImm() ?
(isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR) :
(isPPC64 ? PPC::BDZLR8 : PPC::BDZLR)));
- } else {
+ } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
MI->setDesc(get(PPC::BCLR));
MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg());
+ } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
+ MI->setDesc(get(PPC::BCLRn));
+ MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg());
+ } else {
+ MI->setDesc(get(PPC::BCCLR));
+ MachineInstrBuilder(*MI->getParent()->getParent(), MI)
.addImm(Pred[0].getImm())
.addReg(Pred[1].getReg());
}
@@ -947,6 +1126,22 @@ bool PPCInstrInfo::PredicateInstruction(
MI->setDesc(get(Pred[0].getImm() ?
(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
(isPPC64 ? PPC::BDZ8 : PPC::BDZ)));
+ } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
+ MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
+ MI->RemoveOperand(0);
+
+ MI->setDesc(get(PPC::BC));
+ MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg())
+ .addMBB(MBB);
+ } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
+ MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
+ MI->RemoveOperand(0);
+
+ MI->setDesc(get(PPC::BCn));
+ MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg())
+ .addMBB(MBB);
} else {
MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
MI->RemoveOperand(0);
@@ -966,8 +1161,23 @@ bool PPCInstrInfo::PredicateInstruction(
bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8;
bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
- MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8) :
- (setLR ? PPC::BCCTRL : PPC::BCCTR)));
+
+ if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
+ MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8) :
+ (setLR ? PPC::BCCTRL : PPC::BCCTR)));
+ MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg());
+ return true;
+ } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
+ MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n) :
+ (setLR ? PPC::BCCTRLn : PPC::BCCTRn)));
+ MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+ .addReg(Pred[1].getReg());
+ return true;
+ }
+
+ MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8) :
+ (setLR ? PPC::BCCCTRL : PPC::BCCCTR)));
MachineInstrBuilder(*MI->getParent()->getParent(), MI)
.addImm(Pred[0].getImm())
.addReg(Pred[1].getReg());
@@ -1152,8 +1362,8 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
if (equalityOnly) {
// We need to check the uses of the condition register in order to reject
// non-equality comparisons.
- for (MachineRegisterInfo::use_iterator I = MRI->use_begin(CRReg),
- IE = MRI->use_end(); I != IE; ++I) {
+ for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg),
+ IE = MRI->use_instr_end(); I != IE; ++I) {
MachineInstr *UseMI = &*I;
if (UseMI->getOpcode() == PPC::BCC) {
unsigned Pred = UseMI->getOperand(0).getImm();
@@ -1175,8 +1385,8 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
for (MachineBasicBlock::iterator EL = CmpInstr->getParent()->end();
I != EL; ++I) {
bool FoundUse = false;
- for (MachineRegisterInfo::use_iterator J = MRI->use_begin(CRReg),
- JE = MRI->use_end(); J != JE; ++J)
+ for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg),
+ JE = MRI->use_instr_end(); J != JE; ++J)
if (&*J == &*I) {
FoundUse = true;
break;
@@ -1285,15 +1495,16 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
}
if (ShouldSwap)
- for (MachineRegisterInfo::use_iterator I = MRI->use_begin(CRReg),
- IE = MRI->use_end(); I != IE; ++I) {
+ for (MachineRegisterInfo::use_instr_iterator
+ I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end();
+ I != IE; ++I) {
MachineInstr *UseMI = &*I;
if (UseMI->getOpcode() == PPC::BCC) {
PPC::Predicate Pred = (PPC::Predicate) UseMI->getOperand(0).getImm();
assert((!equalityOnly ||
Pred == PPC::PRED_EQ || Pred == PPC::PRED_NE) &&
"Invalid predicate for equality-only optimization");
- PredsToUpdate.push_back(std::make_pair(&((*I).getOperand(0)),
+ PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
PPC::getSwappedPredicate(Pred)));
} else if (UseMI->getOpcode() == PPC::ISEL ||
UseMI->getOpcode() == PPC::ISEL8) {
@@ -1306,7 +1517,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
else if (NewSubReg == PPC::sub_gt)
NewSubReg = PPC::sub_lt;
- SubRegsToUpdate.push_back(std::make_pair(&((*I).getOperand(3)),
+ SubRegsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(3)),
NewSubReg));
} else // We need to abort on a user we don't understand.
return false;
@@ -1318,7 +1529,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
CmpInstr->eraseFromParent();
MachineBasicBlock::iterator MII = MI;
- BuildMI(*MI->getParent(), llvm::next(MII), MI->getDebugLoc(),
+ BuildMI(*MI->getParent(), std::next(MII), MI->getDebugLoc(),
get(TargetOpcode::COPY), CRReg)
.addReg(PPC::CR0, MIOpC != NewOpC ? RegState::Kill : 0);
@@ -1363,26 +1574,497 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
/// instruction may be. This returns the maximum number of bytes.
///
unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
- case PPC::INLINEASM: { // Inline Asm: Variable size.
+ unsigned Opcode = MI->getOpcode();
+
+ if (Opcode == PPC::INLINEASM) {
const MachineFunction *MF = MI->getParent()->getParent();
const char *AsmStr = MI->getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
- }
- case PPC::PROLOG_LABEL:
- case PPC::EH_LABEL:
- case PPC::GC_LABEL:
- case PPC::DBG_VALUE:
- return 0;
- case PPC::BL8_NOP:
- case PPC::BLA8_NOP:
- return 8;
- default:
- return 4; // PowerPC instructions are all 4 bytes
+ } else {
+ const MCInstrDesc &Desc = get(Opcode);
+ return Desc.getSize();
}
}
#undef DEBUG_TYPE
+#define DEBUG_TYPE "ppc-vsx-fma-mutate"
+
+namespace {
+  // PPCVSXFMAMutate pass - The default (A-type) VSX FMA form takes its addend
+  // from the destination register, which is typically fed by a copy. When one
+  // of the product operands is killed by the FMA, we can switch to the M-type
+  // form and eliminate that copy.
+ struct PPCVSXFMAMutate : public MachineFunctionPass {
+ static char ID;
+ PPCVSXFMAMutate() : MachineFunctionPass(ID) {
+ initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+ }
+
+ LiveIntervals *LIS;
+
+ const PPCTargetMachine *TM;
+ const PPCInstrInfo *TII;
+
+protected:
+ bool processBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+ I != IE; ++I) {
+ MachineInstr *MI = I;
+
+ // The default (A-type) VSX FMA form kills the addend (it is taken from
+ // the target register, which is then updated to reflect the result of
+ // the FMA). If the instruction, however, kills one of the registers
+ // used for the product, then we can use the M-form instruction (which
+ // will take that value from the to-be-defined register).
+
+ int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
+ if (AltOpc == -1)
+ continue;
+
+ // This pass is run after register coalescing, and so we're looking for
+ // a situation like this:
+ // ...
+ // %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+ // %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+ // %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+ // ...
+ // %vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
+ // %RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
+ // ...
+ // Where we can eliminate the copy by changing from the A-type to the
+ // M-type instruction. Specifically, for this example, this means:
+ // %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+ // %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+ // is replaced by:
+ // %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg18, %vreg9,
+ // %RM<imp-use>; VSLRC:%vreg16,%vreg18,%vreg9
+ // and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+
+ SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
+
+ VNInfo *AddendValNo =
+ LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn();
+ MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def);
+
+ // The addend and this instruction must be in the same block.
+
+ if (!AddendMI || AddendMI->getParent() != MI->getParent())
+ continue;
+
+ // The addend must be a full copy within the same register class.
+
+ if (!AddendMI->isFullCopy())
+ continue;
+
+ unsigned AddendSrcReg = AddendMI->getOperand(1).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) {
+ if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) !=
+ MRI.getRegClass(AddendSrcReg))
+ continue;
+ } else {
+ // If AddendSrcReg is a physical register, make sure the destination
+ // register class contains it.
+ if (!MRI.getRegClass(AddendMI->getOperand(0).getReg())
+ ->contains(AddendSrcReg))
+ continue;
+ }
+
+ // In theory, there could be other uses of the addend copy before this
+ // fma. We could deal with this, but that would require additional
+ // logic below and I suspect it will not occur in any relevant
+ // situations.
+ bool OtherUsers = false;
+ for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
+ J != JE; --J)
+ if (J->readsVirtualRegister(AddendMI->getOperand(0).getReg())) {
+ OtherUsers = true;
+ break;
+ }
+
+ if (OtherUsers)
+ continue;
+
+ // Find one of the product operands that is killed by this instruction.
+
+ unsigned KilledProdOp = 0, OtherProdOp = 0;
+ if (LIS->getInterval(MI->getOperand(2).getReg())
+ .Query(FMAIdx).isKill()) {
+ KilledProdOp = 2;
+ OtherProdOp = 3;
+ } else if (LIS->getInterval(MI->getOperand(3).getReg())
+ .Query(FMAIdx).isKill()) {
+ KilledProdOp = 3;
+ OtherProdOp = 2;
+ }
+
+ // If there are no killed product operands, then this transformation is
+ // likely not profitable.
+ if (!KilledProdOp)
+ continue;
+
+ // In order to replace the addend here with the source of the copy,
+ // it must still be live here.
+ if (!LIS->getInterval(AddendMI->getOperand(1).getReg()).liveAt(FMAIdx))
+ continue;
+
+ // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3.
+
+ unsigned AddReg = AddendMI->getOperand(1).getReg();
+ unsigned KilledProdReg = MI->getOperand(KilledProdOp).getReg();
+ unsigned OtherProdReg = MI->getOperand(OtherProdOp).getReg();
+
+ unsigned AddSubReg = AddendMI->getOperand(1).getSubReg();
+ unsigned KilledProdSubReg = MI->getOperand(KilledProdOp).getSubReg();
+ unsigned OtherProdSubReg = MI->getOperand(OtherProdOp).getSubReg();
+
+ bool AddRegKill = AddendMI->getOperand(1).isKill();
+ bool KilledProdRegKill = MI->getOperand(KilledProdOp).isKill();
+ bool OtherProdRegKill = MI->getOperand(OtherProdOp).isKill();
+
+ bool AddRegUndef = AddendMI->getOperand(1).isUndef();
+ bool KilledProdRegUndef = MI->getOperand(KilledProdOp).isUndef();
+ bool OtherProdRegUndef = MI->getOperand(OtherProdOp).isUndef();
+
+ unsigned OldFMAReg = MI->getOperand(0).getReg();
+
+ assert(OldFMAReg == AddendMI->getOperand(0).getReg() &&
+ "Addend copy not tied to old FMA output!");
+
+ DEBUG(dbgs() << "VSX FMA Mutation:\n " << *MI;);
+
+ MI->getOperand(0).setReg(KilledProdReg);
+ MI->getOperand(1).setReg(KilledProdReg);
+ MI->getOperand(3).setReg(AddReg);
+ MI->getOperand(2).setReg(OtherProdReg);
+
+ MI->getOperand(0).setSubReg(KilledProdSubReg);
+ MI->getOperand(1).setSubReg(KilledProdSubReg);
+ MI->getOperand(3).setSubReg(AddSubReg);
+ MI->getOperand(2).setSubReg(OtherProdSubReg);
+
+ MI->getOperand(1).setIsKill(KilledProdRegKill);
+ MI->getOperand(3).setIsKill(AddRegKill);
+ MI->getOperand(2).setIsKill(OtherProdRegKill);
+
+ MI->getOperand(1).setIsUndef(KilledProdRegUndef);
+ MI->getOperand(3).setIsUndef(AddRegUndef);
+ MI->getOperand(2).setIsUndef(OtherProdRegUndef);
+
+ MI->setDesc(TII->get(AltOpc));
+
+ DEBUG(dbgs() << " -> " << *MI);
+
+ // The killed product operand was killed here, so we can reuse it now
+ // for the result of the fma.
+
+ LiveInterval &FMAInt = LIS->getInterval(OldFMAReg);
+ VNInfo *FMAValNo = FMAInt.getVNInfoAt(FMAIdx.getRegSlot());
+ for (auto UI = MRI.reg_nodbg_begin(OldFMAReg), UE = MRI.reg_nodbg_end();
+ UI != UE;) {
+ MachineOperand &UseMO = *UI;
+ MachineInstr *UseMI = UseMO.getParent();
+ ++UI;
+
+ // Don't replace the result register of the copy we're about to erase.
+ if (UseMI == AddendMI)
+ continue;
+
+ UseMO.setReg(KilledProdReg);
+ UseMO.setSubReg(KilledProdSubReg);
+ }
+
+ // Extend the live intervals of the killed product operand to hold the
+ // fma result.
+
+ LiveInterval &NewFMAInt = LIS->getInterval(KilledProdReg);
+ for (LiveInterval::iterator AI = FMAInt.begin(), AE = FMAInt.end();
+ AI != AE; ++AI) {
+ // Don't add the segment that corresponds to the original copy.
+ if (AI->valno == AddendValNo)
+ continue;
+
+ VNInfo *NewFMAValNo =
+ NewFMAInt.getNextValue(AI->start,
+ LIS->getVNInfoAllocator());
+
+ NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end,
+ NewFMAValNo));
+ }
+ DEBUG(dbgs() << " extended: " << NewFMAInt << '\n');
+
+ FMAInt.removeValNo(FMAValNo);
+ DEBUG(dbgs() << " trimmed: " << FMAInt << '\n');
+
+ // Remove the (now unused) copy.
+
+ DEBUG(dbgs() << " removing: " << *AddendMI << '\n');
+ LIS->RemoveMachineInstrFromMaps(AddendMI);
+ AddendMI->eraseFromParent();
+
+ Changed = true;
+ }
+
+ return Changed;
+ }
+
+public:
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ LIS = &getAnalysis<LiveIntervals>();
+
+ TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
+ TII = TM->getInstrInfo();
+
+ bool Changed = false;
+
+ if (DisableVSXFMAMutate)
+ return Changed;
+
+ for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+ MachineBasicBlock &B = *I++;
+ if (processBlock(B))
+ Changed = true;
+ }
+
+ return Changed;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE,
+ "PowerPC VSX FMA Mutation", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE,
+ "PowerPC VSX FMA Mutation", false, false)
+
+char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID;
+
+char PPCVSXFMAMutate::ID = 0;
+FunctionPass*
+llvm::createPPCVSXFMAMutatePass() { return new PPCVSXFMAMutate(); }
+
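
The mutation described in the comments above hinges on the difference between the A-type and M-type VSX FMA forms: the A-type reads the addend from the destination register (hence the copy feeding it), while the M-type reads one multiplicand from the destination and the addend from a source operand. A small arithmetic-only sketch of why the rewrite preserves the result; the exact operand routing of xsmaddadp/xsmaddmdp is stated here as an assumption, not taken from the patch:

    #include <cstdio>

    // A-form (assumed): the addend lives in the target register.
    double xsmaddadp(double xt, double xa, double xb) { return xa * xb + xt; }
    // M-form (assumed): a multiplicand lives in the target, the addend in xb.
    double xsmaddmdp(double xt, double xa, double xb) { return xa * xt + xb; }

    int main() {
      double addend = 1.0, m1 = 2.0, m2 = 3.0;

      // A-form needs the addend copied into the destination first
      // (the COPY the pass wants to eliminate).
      double destA = addend;
      destA = xsmaddadp(destA, m1, m2);      // destA = m1*m2 + addend

      // If m2 is dead after the FMA, the M-form can overwrite it instead,
      // reading the addend directly and skipping the copy.
      double destM = m2;
      destM = xsmaddmdp(destM, m1, addend);  // destM = m1*m2 + addend

      printf("%f %f\n", destA, destM);       // both print 7.000000
      return 0;
    }
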
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "ppc-vsx-copy"
+
+namespace llvm {
+ void initializePPCVSXCopyPass(PassRegistry&);
+}
+
+namespace {
+ // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers
+ // (Altivec and scalar floating-point registers), we need to transform the
+ // copies into subregister copies with other restrictions.
+ struct PPCVSXCopy : public MachineFunctionPass {
+ static char ID;
+ PPCVSXCopy() : MachineFunctionPass(ID) {
+ initializePPCVSXCopyPass(*PassRegistry::getPassRegistry());
+ }
+
+ const PPCTargetMachine *TM;
+ const PPCInstrInfo *TII;
+
+ bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC,
+ MachineRegisterInfo &MRI) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ return RC->hasSubClassEq(MRI.getRegClass(Reg));
+ } else if (RC->contains(Reg)) {
+ return true;
+ }
+
+ return false;
+ }
+
+ bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI);
+ }
+
+ bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI);
+ }
+
+ bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) {
+ return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
+ }
+
+protected:
+ bool processBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+ I != IE; ++I) {
+ MachineInstr *MI = I;
+ if (!MI->isFullCopy())
+ continue;
+
+ MachineOperand &DstMO = MI->getOperand(0);
+ MachineOperand &SrcMO = MI->getOperand(1);
+
+ if ( IsVSReg(DstMO.getReg(), MRI) &&
+ !IsVSReg(SrcMO.getReg(), MRI)) {
+ // This is a copy *to* a VSX register from a non-VSX register.
+ Changed = true;
+
+ const TargetRegisterClass *SrcRC =
+ IsVRReg(SrcMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
+ &PPC::VSLRCRegClass;
+ assert((IsF8Reg(SrcMO.getReg(), MRI) ||
+ IsVRReg(SrcMO.getReg(), MRI)) &&
+ "Unknown source for a VSX copy");
+
+ unsigned NewVReg = MRI.createVirtualRegister(SrcRC);
+ BuildMI(MBB, MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
+ .addImm(1) // add 1, not 0, because there is no implicit clearing
+ // of the high bits.
+ .addOperand(SrcMO)
+ .addImm(IsVRReg(SrcMO.getReg(), MRI) ? PPC::sub_128 :
+ PPC::sub_64);
+
+ // The source of the original copy is now the new virtual register.
+ SrcMO.setReg(NewVReg);
+ } else if (!IsVSReg(DstMO.getReg(), MRI) &&
+ IsVSReg(SrcMO.getReg(), MRI)) {
+ // This is a copy *from* a VSX register to a non-VSX register.
+ Changed = true;
+
+ const TargetRegisterClass *DstRC =
+ IsVRReg(DstMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
+ &PPC::VSLRCRegClass;
+ assert((IsF8Reg(DstMO.getReg(), MRI) ||
+ IsVRReg(DstMO.getReg(), MRI)) &&
+ "Unknown destination for a VSX copy");
+
+ // Copy the VSX value into a new VSX register of the correct subclass.
+ unsigned NewVReg = MRI.createVirtualRegister(DstRC);
+ BuildMI(MBB, MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewVReg)
+ .addOperand(SrcMO);
+
+ // Transform the original copy into a subregister extraction copy.
+ SrcMO.setReg(NewVReg);
+ SrcMO.setSubReg(IsVRReg(DstMO.getReg(), MRI) ? PPC::sub_128 :
+ PPC::sub_64);
+ }
+ }
+
+ return Changed;
+ }
+
+public:
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
+ TII = TM->getInstrInfo();
+
+ bool Changed = false;
+
+ for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+ MachineBasicBlock &B = *I++;
+ if (processBlock(B))
+ Changed = true;
+ }
+
+ return Changed;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE,
+ "PowerPC VSX Copy Legalization", false, false)
+
+char PPCVSXCopy::ID = 0;
+FunctionPass*
+llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); }
+
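
The SUBREG_TO_REG / subregister-copy rewriting above relies on a scalar floating-point value occupying only one lane of the wider VSX register, so a copy into the full register class cannot be assumed to define (or clear) the other doubleword. A toy model of that overlap, with the lane layout stated as an assumption rather than taken from the patch:

    #include <cstdint>
    #include <cstdio>

    // Assumed layout for illustration: a 128-bit VSX register whose
    // doubleword 0 is also visible as a scalar floating-point register.
    struct VSXReg {
      uint64_t dw[2];   // dw[0] ~ the sub_64 (FPR) view, dw[0..1] ~ the full vector
    };

    // A "copy" from an FPR into a VSX register only defines the sub_64 lane; the
    // other doubleword keeps whatever was there, which is why the pass emits
    // SUBREG_TO_REG with an operand of 1 (no implicit clearing of the high bits).
    void copyFprIntoVsx(VSXReg &dst, uint64_t fprBits) {
      dst.dw[0] = fprBits;   // only the subregister is written
      // dst.dw[1] is intentionally left untouched
    }

    int main() {
      VSXReg v = {{0, 0xdeadbeefdeadbeefULL}};
      copyFprIntoVsx(v, 0x3ff0000000000000ULL);   // bit pattern of 1.0
      printf("%016llx %016llx\n",
             (unsigned long long)v.dw[0], (unsigned long long)v.dw[1]);
      return 0;
    }
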
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "ppc-vsx-copy-cleanup"
+
+namespace llvm {
+ void initializePPCVSXCopyCleanupPass(PassRegistry&);
+}
+
+namespace {
+ // PPCVSXCopyCleanup pass - We sometimes end up generating self copies of VSX
+ // registers (mostly because the ABI code still places all values into the
+ // "traditional" floating-point and vector registers). Remove them here.
+ struct PPCVSXCopyCleanup : public MachineFunctionPass {
+ static char ID;
+ PPCVSXCopyCleanup() : MachineFunctionPass(ID) {
+ initializePPCVSXCopyCleanupPass(*PassRegistry::getPassRegistry());
+ }
+
+ const PPCTargetMachine *TM;
+ const PPCInstrInfo *TII;
+
+protected:
+ bool processBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+
+ SmallVector<MachineInstr *, 4> ToDelete;
+ for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+ I != IE; ++I) {
+ MachineInstr *MI = I;
+ if (MI->getOpcode() == PPC::XXLOR &&
+ MI->getOperand(0).getReg() == MI->getOperand(1).getReg() &&
+ MI->getOperand(0).getReg() == MI->getOperand(2).getReg())
+ ToDelete.push_back(MI);
+ }
+
+ if (!ToDelete.empty())
+ Changed = true;
+
+ for (unsigned i = 0, ie = ToDelete.size(); i != ie; ++i) {
+ DEBUG(dbgs() << "Removing VSX self-copy: " << *ToDelete[i]);
+ ToDelete[i]->eraseFromParent();
+ }
+
+ return Changed;
+ }
+
+public:
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
+ TII = TM->getInstrInfo();
+
+ bool Changed = false;
+
+ for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+ MachineBasicBlock &B = *I++;
+ if (processBlock(B))
+ Changed = true;
+ }
+
+ return Changed;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+INITIALIZE_PASS(PPCVSXCopyCleanup, DEBUG_TYPE,
+ "PowerPC VSX Copy Cleanup", false, false)
+
+char PPCVSXCopyCleanup::ID = 0;
+FunctionPass*
+llvm::createPPCVSXCopyCleanupPass() { return new PPCVSXCopyCleanup(); }
+
+#undef DEBUG_TYPE
#define DEBUG_TYPE "ppc-early-ret"
STATISTIC(NumBCLR, "Number of early conditional returns");
STATISTIC(NumBLR, "Number of early returns");
@@ -1424,7 +2106,7 @@ protected:
if (J->getOpcode() == PPC::B) {
if (J->getOperand(0).getMBB() == &ReturnMBB) {
// This is an unconditional branch to the return. Replace the
- // branch with a blr.
+ // branch with a blr.
BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BLR));
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
@@ -1436,7 +2118,7 @@ protected:
if (J->getOperand(2).getMBB() == &ReturnMBB) {
// This is a conditional branch to the return. Replace the branch
// with a bclr.
- BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCLR))
+ BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR))
.addImm(J->getOperand(0).getImm())
.addReg(J->getOperand(1).getReg());
MachineBasicBlock::iterator K = J--;
@@ -1445,6 +2127,20 @@ protected:
++NumBCLR;
continue;
}
+ } else if (J->getOpcode() == PPC::BC || J->getOpcode() == PPC::BCn) {
+ if (J->getOperand(1).getMBB() == &ReturnMBB) {
+ // This is a conditional branch to the return. Replace the branch
+ // with a bclr.
+ BuildMI(**PI, J, J->getDebugLoc(),
+ TII->get(J->getOpcode() == PPC::BC ?
+ PPC::BCLR : PPC::BCLRn))
+ .addReg(J->getOperand(0).getReg());
+ MachineBasicBlock::iterator K = J--;
+ K->eraseFromParent();
+ BlockChanged = true;
+ ++NumBCLR;
+ continue;
+ }
} else if (J->isBranch()) {
if (J->isIndirectBranch()) {
if (ReturnMBB.hasAddressTaken())
@@ -1466,7 +2162,7 @@ protected:
if ((*PI)->canFallThrough() && (*PI)->isLayoutSuccessor(&ReturnMBB))
OtherReference = true;
- // Predecessors are stored in a vector and can't be removed here.
+ // Predecessors are stored in a vector and can't be removed here.
if (!OtherReference && BlockChanged) {
PredToRemove.push_back(*PI);
}
@@ -1509,7 +2205,7 @@ public:
return Changed;
for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
- MachineBasicBlock &B = *I++;
+ MachineBasicBlock &B = *I++;
if (processBlock(B))
Changed = true;
}
@@ -1529,4 +2225,3 @@ INITIALIZE_PASS(PPCEarlyReturn, DEBUG_TYPE,
char PPCEarlyReturn::ID = 0;
FunctionPass*
llvm::createPPCEarlyReturnPass() { return new PPCEarlyReturn(); }
-
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index f140c41..3c8117c 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -95,6 +95,18 @@ public:
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const;
+ virtual
+ int getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *UseMI, unsigned UseIdx) const;
+ virtual
+ int getOperandLatency(const InstrItineraryData *ItinData,
+ SDNode *DefNode, unsigned DefIdx,
+ SDNode *UseNode, unsigned UseIdx) const {
+ return PPCGenInstrInfo::getOperandLatency(ItinData, DefNode, DefIdx,
+ UseNode, UseIdx);
+ }
+
bool isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const;
@@ -107,6 +119,9 @@ public:
// rotate amt is zero. We also have to munge the immediates a bit.
virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;
+ virtual bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const;
+
virtual void insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 2bd3aad..1d984ab 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -99,6 +99,8 @@ def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, [SDNPMayLoad]>;
def PPCvmaddfp : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>;
def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;
+def PPCppc32GOT : SDNode<"PPCISD::PPC32_GOT", SDTIntLeaf, []>;
+
def PPCaddisGotTprelHA : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>;
def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp,
[SDNPMayLoad]>;
@@ -288,6 +290,12 @@ def imm16ShiftedSExt : PatLeaf<(imm), [{
return N->getZExtValue() == (uint64_t)(int)N->getZExtValue();
}], HI16>;
+def imm64ZExt32 : Operand<i64>, ImmLeaf<i64, [{
+ // imm64ZExt32 predicate - True if the i64 immediate fits in a 32-bit
+ // zero extended field.
+ return isUInt<32>(Imm);
+}]>;
+
// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
// restricted memrix (4-aligned) constants are alignment sensitive. If these
// offsets are hidden behind TOC entries then the values of the lower-order
@@ -404,6 +412,14 @@ def crrc : RegisterOperand<CRRC> {
let ParserMatchClass = PPCRegCRRCAsmOperand;
}
+def PPCU2ImmAsmOperand : AsmOperandClass {
+ let Name = "U2Imm"; let PredicateMethod = "isU2Imm";
+ let RenderMethod = "addImmOperands";
+}
+def u2imm : Operand<i32> {
+ let PrintMethod = "printU2ImmOperand";
+ let ParserMatchClass = PPCU2ImmAsmOperand;
+}
def PPCS5ImmAsmOperand : AsmOperandClass {
let Name = "S5Imm"; let PredicateMethod = "isS5Imm";
let RenderMethod = "addImmOperands";
@@ -411,6 +427,7 @@ def PPCS5ImmAsmOperand : AsmOperandClass {
def s5imm : Operand<i32> {
let PrintMethod = "printS5ImmOperand";
let ParserMatchClass = PPCS5ImmAsmOperand;
+ let DecoderMethod = "decodeSImmOperand<5>";
}
def PPCU5ImmAsmOperand : AsmOperandClass {
let Name = "U5Imm"; let PredicateMethod = "isU5Imm";
@@ -419,6 +436,7 @@ def PPCU5ImmAsmOperand : AsmOperandClass {
def u5imm : Operand<i32> {
let PrintMethod = "printU5ImmOperand";
let ParserMatchClass = PPCU5ImmAsmOperand;
+ let DecoderMethod = "decodeUImmOperand<5>";
}
def PPCU6ImmAsmOperand : AsmOperandClass {
let Name = "U6Imm"; let PredicateMethod = "isU6Imm";
@@ -427,6 +445,7 @@ def PPCU6ImmAsmOperand : AsmOperandClass {
def u6imm : Operand<i32> {
let PrintMethod = "printU6ImmOperand";
let ParserMatchClass = PPCU6ImmAsmOperand;
+ let DecoderMethod = "decodeUImmOperand<6>";
}
def PPCS16ImmAsmOperand : AsmOperandClass {
let Name = "S16Imm"; let PredicateMethod = "isS16Imm";
@@ -436,6 +455,7 @@ def s16imm : Operand<i32> {
let PrintMethod = "printS16ImmOperand";
let EncoderMethod = "getImm16Encoding";
let ParserMatchClass = PPCS16ImmAsmOperand;
+ let DecoderMethod = "decodeSImmOperand<16>";
}
def PPCU16ImmAsmOperand : AsmOperandClass {
let Name = "U16Imm"; let PredicateMethod = "isU16Imm";
@@ -445,6 +465,7 @@ def u16imm : Operand<i32> {
let PrintMethod = "printU16ImmOperand";
let EncoderMethod = "getImm16Encoding";
let ParserMatchClass = PPCU16ImmAsmOperand;
+ let DecoderMethod = "decodeUImmOperand<16>";
}
def PPCS17ImmAsmOperand : AsmOperandClass {
let Name = "S17Imm"; let PredicateMethod = "isS17Imm";
@@ -457,6 +478,7 @@ def s17imm : Operand<i32> {
let PrintMethod = "printS16ImmOperand";
let EncoderMethod = "getImm16Encoding";
let ParserMatchClass = PPCS17ImmAsmOperand;
+ let DecoderMethod = "decodeSImmOperand<16>";
}
def PPCDirectBrAsmOperand : AsmOperandClass {
let Name = "DirectBr"; let PredicateMethod = "isDirectBr";
@@ -502,6 +524,7 @@ def PPCCRBitMaskOperand : AsmOperandClass {
def crbitm: Operand<i8> {
let PrintMethod = "printcrbitm";
let EncoderMethod = "get_crbitm_encoding";
+ let DecoderMethod = "decodeCRBitMOperand";
let ParserMatchClass = PPCCRBitMaskOperand;
}
// Address operands
@@ -539,6 +562,7 @@ def memri : Operand<iPTR> {
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getMemRIEncoding";
+ let DecoderMethod = "decodeMemRIOperands";
}
def memrr : Operand<iPTR> {
let PrintMethod = "printMemRegReg";
@@ -548,6 +572,7 @@ def memrix : Operand<iPTR> { // memri where the imm is 4-aligned.
let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getMemRIXEncoding";
+ let DecoderMethod = "decodeMemRIXOperands";
}
// A single-register address. This is used with the SjLj
@@ -555,6 +580,14 @@ def memrix : Operand<iPTR> { // memri where the imm is 4-aligned.
def memr : Operand<iPTR> {
let MIOperandInfo = (ops ptr_rc:$ptrreg);
}
+def PPCTLSRegOperand : AsmOperandClass {
+ let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
+ let RenderMethod = "addTLSRegOperands";
+}
+def tlsreg32 : Operand<i32> {
+ let EncoderMethod = "getTLSRegEncoding";
+ let ParserMatchClass = PPCTLSRegOperand;
+}
// PowerPC Predicate operand.
def pred : Operand<OtherVT> {
@@ -580,6 +613,7 @@ def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
def In32BitMode : Predicate<"!PPCSubTarget.isPPC64()">;
def In64BitMode : Predicate<"PPCSubTarget.isPPC64()">;
def IsBookE : Predicate<"PPCSubTarget.isBookE()">;
+def IsNotBookE : Predicate<"!PPCSubTarget.isBookE()">;
//===----------------------------------------------------------------------===//
// PowerPC Multiclass Definitions.
@@ -613,20 +647,6 @@ multiclass XForm_6rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
}
}
-multiclass XForm_10r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
- string asmbase, string asmstr, InstrItinClass itin,
- list<dag> pattern> {
- let BaseName = asmbase in {
- def NAME : XForm_10<opcode, xo, OOL, IOL,
- !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
- pattern>, RecFormRel;
- let Defs = [CR0] in
- def o : XForm_10<opcode, xo, OOL, IOL,
- !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
- []>, isDOT, RecFormRel;
- }
-}
-
multiclass XForm_10rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
string asmbase, string asmstr, InstrItinClass itin,
list<dag> pattern> {
@@ -887,30 +907,63 @@ let usesCustomInserter = 1, // Expanded after instruction selection.
def SELECT_CC_VRRC: Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
i32imm:$BROPC), "#SELECT_CC_VRRC",
[]>;
+
+ // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
+ // register bit directly.
+ def SELECT_I4 : Pseudo<(outs gprc:$dst), (ins crbitrc:$cond,
+ gprc_nor0:$T, gprc_nor0:$F), "#SELECT_I4",
+ [(set i32:$dst, (select i1:$cond, i32:$T, i32:$F))]>;
+ def SELECT_I8 : Pseudo<(outs g8rc:$dst), (ins crbitrc:$cond,
+ g8rc_nox0:$T, g8rc_nox0:$F), "#SELECT_I8",
+ [(set i64:$dst, (select i1:$cond, i64:$T, i64:$F))]>;
+ def SELECT_F4 : Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond,
+ f4rc:$T, f4rc:$F), "#SELECT_F4",
+ [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>;
+ def SELECT_F8 : Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond,
+ f8rc:$T, f8rc:$F), "#SELECT_F8",
+ [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>;
+ def SELECT_VRRC: Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
+ vrrc:$T, vrrc:$F), "#SELECT_VRRC",
+ [(set v4i32:$dst,
+ (select i1:$cond, v4i32:$T, v4i32:$F))]>;
}
// SPILL_CR - Indicate that we're dumping the CR register, so we'll need to
// scavenge a register for it.
-let mayStore = 1 in
+let mayStore = 1 in {
def SPILL_CR : Pseudo<(outs), (ins crrc:$cond, memri:$F),
"#SPILL_CR", []>;
+def SPILL_CRBIT : Pseudo<(outs), (ins crbitrc:$cond, memri:$F),
+ "#SPILL_CRBIT", []>;
+}
// RESTORE_CR - Indicate that we're restoring the CR register (previously
// spilled), so we'll need to scavenge a register for it.
-let mayLoad = 1 in
+let mayLoad = 1 in {
def RESTORE_CR : Pseudo<(outs crrc:$cond), (ins memri:$F),
"#RESTORE_CR", []>;
+def RESTORE_CRBIT : Pseudo<(outs crbitrc:$cond), (ins memri:$F),
+ "#RESTORE_CRBIT", []>;
+}
let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
let isReturn = 1, Uses = [LR, RM] in
- def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", BrB,
+ def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB,
[(retflag)]>;
let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in {
- def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>;
+ def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+ []>;
- let isCodeGenOnly = 1 in
- def BCCTR : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
- "b${cond:cc}ctr${cond:pm} ${cond:reg}", BrB, []>;
+ let isCodeGenOnly = 1 in {
+ def BCCCTR : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
+ "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB,
+ []>;
+
+ def BCCTR : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi),
+ "bcctr 12, $bi, 0", IIC_BrB, []>;
+ def BCCTRn : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi),
+ "bcctr 4, $bi, 0", IIC_BrB, []>;
+ }
}
}
@@ -921,10 +974,10 @@ let Defs = [LR] in
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
let isBarrier = 1 in {
def B : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst),
- "b $dst", BrB,
+ "b $dst", IIC_BrB,
[(br bb:$dst)]>;
def BA : IForm<18, 1, 0, (outs), (ins absdirectbrtarget:$dst),
- "ba $dst", BrB, []>;
+ "ba $dst", IIC_BrB, []>;
}
// BCC represents an arbitrary conditional branch on a predicate.
@@ -938,23 +991,39 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
"b${cond:cc}a${cond:pm} ${cond:reg}, $dst">;
let isReturn = 1, Uses = [LR, RM] in
- def BCLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$cond),
- "b${cond:cc}lr${cond:pm} ${cond:reg}", BrB, []>;
+ def BCCLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$cond),
+ "b${cond:cc}lr${cond:pm} ${cond:reg}", IIC_BrB, []>;
+ }
+
+ let isCodeGenOnly = 1 in {
+ let Pattern = [(brcond i1:$bi, bb:$dst)] in
+ def BC : BForm_4<16, 12, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
+ "bc 12, $bi, $dst">;
+
+ let Pattern = [(brcond (not i1:$bi), bb:$dst)] in
+ def BCn : BForm_4<16, 4, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
+ "bc 4, $bi, $dst">;
+
+ let isReturn = 1, Uses = [LR, RM] in
+ def BCLR : XLForm_2_br2<19, 16, 12, 0, (outs), (ins crbitrc:$bi),
+ "bclr 12, $bi, 0", IIC_BrB, []>;
+ def BCLRn : XLForm_2_br2<19, 16, 4, 0, (outs), (ins crbitrc:$bi),
+ "bclr 4, $bi, 0", IIC_BrB, []>;
}
let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in {
def BDZLR : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
- "bdzlr", BrB, []>;
+ "bdzlr", IIC_BrB, []>;
def BDNZLR : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
- "bdnzlr", BrB, []>;
+ "bdnzlr", IIC_BrB, []>;
def BDZLRp : XLForm_2_ext<19, 16, 27, 0, 0, (outs), (ins),
- "bdzlr+", BrB, []>;
+ "bdzlr+", IIC_BrB, []>;
def BDNZLRp: XLForm_2_ext<19, 16, 25, 0, 0, (outs), (ins),
- "bdnzlr+", BrB, []>;
+ "bdnzlr+", IIC_BrB, []>;
def BDZLRm : XLForm_2_ext<19, 16, 26, 0, 0, (outs), (ins),
- "bdzlr-", BrB, []>;
+ "bdzlr-", IIC_BrB, []>;
def BDNZLRm: XLForm_2_ext<19, 16, 24, 0, 0, (outs), (ins),
- "bdnzlr-", BrB, []>;
+ "bdnzlr-", IIC_BrB, []>;
}
let Defs = [CTR], Uses = [CTR] in {
@@ -997,33 +1066,54 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL : IForm<18, 0, 1, (outs), (ins calltarget:$func),
- "bl $func", BrB, []>; // See Pat patterns below.
+ "bl $func", IIC_BrB, []>; // See Pat patterns below.
def BLA : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
- "bla $func", BrB, [(PPCcall (i32 imm:$func))]>;
+ "bla $func", IIC_BrB, [(PPCcall (i32 imm:$func))]>;
let isCodeGenOnly = 1 in {
def BCCL : BForm<16, 0, 1, (outs), (ins pred:$cond, condbrtarget:$dst),
"b${cond:cc}l${cond:pm} ${cond:reg}, $dst">;
def BCCLA : BForm<16, 1, 1, (outs), (ins pred:$cond, abscondbrtarget:$dst),
"b${cond:cc}la${cond:pm} ${cond:reg}, $dst">;
+
+ def BCL : BForm_4<16, 12, 0, 1, (outs),
+ (ins crbitrc:$bi, condbrtarget:$dst),
+ "bcl 12, $bi, $dst">;
+ def BCLn : BForm_4<16, 4, 0, 1, (outs),
+ (ins crbitrc:$bi, condbrtarget:$dst),
+ "bcl 4, $bi, $dst">;
}
}
let Uses = [CTR, RM] in {
def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
- "bctrl", BrB, [(PPCbctrl)]>,
+ "bctrl", IIC_BrB, [(PPCbctrl)]>,
Requires<[In32BitMode]>;
- let isCodeGenOnly = 1 in
- def BCCTRL : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
- "b${cond:cc}ctrl${cond:pm} ${cond:reg}", BrB, []>;
+ let isCodeGenOnly = 1 in {
+ def BCCCTRL : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
+ "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB,
+ []>;
+
+ def BCCTRL : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi),
+ "bcctrl 12, $bi, 0", IIC_BrB, []>;
+ def BCCTRLn : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi),
+ "bcctrl 4, $bi, 0", IIC_BrB, []>;
+ }
}
let Uses = [LR, RM] in {
def BLRL : XLForm_2_ext<19, 16, 20, 0, 1, (outs), (ins),
- "blrl", BrB, []>;
+ "blrl", IIC_BrB, []>;
+
+ let isCodeGenOnly = 1 in {
+ def BCCLRL : XLForm_2_br<19, 16, 1, (outs), (ins pred:$cond),
+ "b${cond:cc}lrl${cond:pm} ${cond:reg}", IIC_BrB,
+ []>;
- let isCodeGenOnly = 1 in
- def BCLRL : XLForm_2_br<19, 16, 1, (outs), (ins pred:$cond),
- "b${cond:cc}lrl${cond:pm} ${cond:reg}", BrB, []>;
+ def BCLRL : XLForm_2_br2<19, 16, 12, 1, (outs), (ins crbitrc:$bi),
+ "bclrl 12, $bi, 0", IIC_BrB, []>;
+ def BCLRLn : XLForm_2_br2<19, 16, 4, 1, (outs), (ins crbitrc:$bi),
+ "bclrl 4, $bi, 0", IIC_BrB, []>;
+ }
}
let Defs = [CTR], Uses = [CTR, RM] in {
def BDZL : BForm_1<16, 18, 0, 1, (outs), (ins condbrtarget:$dst),
@@ -1053,17 +1143,17 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
}
let Defs = [CTR], Uses = [CTR, LR, RM] in {
def BDZLRL : XLForm_2_ext<19, 16, 18, 0, 1, (outs), (ins),
- "bdzlrl", BrB, []>;
+ "bdzlrl", IIC_BrB, []>;
def BDNZLRL : XLForm_2_ext<19, 16, 16, 0, 1, (outs), (ins),
- "bdnzlrl", BrB, []>;
+ "bdnzlrl", IIC_BrB, []>;
def BDZLRLp : XLForm_2_ext<19, 16, 27, 0, 1, (outs), (ins),
- "bdzlrl+", BrB, []>;
+ "bdzlrl+", IIC_BrB, []>;
def BDNZLRLp: XLForm_2_ext<19, 16, 25, 0, 1, (outs), (ins),
- "bdnzlrl+", BrB, []>;
+ "bdnzlrl+", IIC_BrB, []>;
def BDZLRLm : XLForm_2_ext<19, 16, 26, 0, 1, (outs), (ins),
- "bdzlrl-", BrB, []>;
+ "bdzlrl-", IIC_BrB, []>;
def BDNZLRLm: XLForm_2_ext<19, 16, 24, 0, 1, (outs), (ins),
- "bdnzlrl-", BrB, []>;
+ "bdnzlrl-", IIC_BrB, []>;
}
}
@@ -1089,19 +1179,19 @@ let isCodeGenOnly = 1 in {
let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in
-def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
- Requires<[In32BitMode]>;
+def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+ []>, Requires<[In32BitMode]>;
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
def TAILB : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
- "b $dst", BrB,
+ "b $dst", IIC_BrB,
[]>;
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
def TAILBA : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst),
- "ba $dst", BrB,
+ "ba $dst", IIC_BrB,
[]>;
}
@@ -1127,33 +1217,33 @@ let isBranch = 1, isTerminator = 1 in {
// System call.
let PPC970_Unit = 7 in {
def SC : SCForm<17, 1, (outs), (ins i32imm:$lev),
- "sc $lev", BrB, [(PPCsc (i32 imm:$lev))]>;
+ "sc $lev", IIC_BrB, [(PPCsc (i32 imm:$lev))]>;
}
// DCB* instructions.
-def DCBA : DCB_Form<758, 0, (outs), (ins memrr:$dst),
- "dcba $dst", LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
+def DCBA : DCB_Form<758, 0, (outs), (ins memrr:$dst), "dcba $dst",
+ IIC_LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
PPC970_DGroup_Single;
-def DCBF : DCB_Form<86, 0, (outs), (ins memrr:$dst),
- "dcbf $dst", LdStDCBF, [(int_ppc_dcbf xoaddr:$dst)]>,
+def DCBF : DCB_Form<86, 0, (outs), (ins memrr:$dst), "dcbf $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbf xoaddr:$dst)]>,
PPC970_DGroup_Single;
-def DCBI : DCB_Form<470, 0, (outs), (ins memrr:$dst),
- "dcbi $dst", LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>,
+def DCBI : DCB_Form<470, 0, (outs), (ins memrr:$dst), "dcbi $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>,
PPC970_DGroup_Single;
-def DCBST : DCB_Form<54, 0, (outs), (ins memrr:$dst),
- "dcbst $dst", LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>,
+def DCBST : DCB_Form<54, 0, (outs), (ins memrr:$dst), "dcbst $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>,
PPC970_DGroup_Single;
-def DCBT : DCB_Form<278, 0, (outs), (ins memrr:$dst),
- "dcbt $dst", LdStDCBF, [(int_ppc_dcbt xoaddr:$dst)]>,
+def DCBT : DCB_Form<278, 0, (outs), (ins memrr:$dst), "dcbt $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbt xoaddr:$dst)]>,
PPC970_DGroup_Single;
-def DCBTST : DCB_Form<246, 0, (outs), (ins memrr:$dst),
- "dcbtst $dst", LdStDCBF, [(int_ppc_dcbtst xoaddr:$dst)]>,
+def DCBTST : DCB_Form<246, 0, (outs), (ins memrr:$dst), "dcbtst $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbtst xoaddr:$dst)]>,
PPC970_DGroup_Single;
-def DCBZ : DCB_Form<1014, 0, (outs), (ins memrr:$dst),
- "dcbz $dst", LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>,
+def DCBZ : DCB_Form<1014, 0, (outs), (ins memrr:$dst), "dcbz $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>,
PPC970_DGroup_Single;
-def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst),
- "dcbzl $dst", LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
+def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst), "dcbzl $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
PPC970_DGroup_Single;
def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)),
@@ -1241,26 +1331,26 @@ let usesCustomInserter = 1 in {
// Instructions to support atomic operations
def LWARX : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
- "lwarx $rD, $src", LdStLWARX,
+ "lwarx $rD, $src", IIC_LdStLWARX,
[(set i32:$rD, (PPClarx xoaddr:$src))]>;
let Defs = [CR0] in
def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
- "stwcx. $rS, $dst", LdStSTWCX,
+ "stwcx. $rS, $dst", IIC_LdStSTWCX,
[(PPCstcx i32:$rS, xoaddr:$dst)]>,
isDOT;
let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
-def TRAP : XForm_24<31, 4, (outs), (ins), "trap", LdStLoad, [(trap)]>;
+def TRAP : XForm_24<31, 4, (outs), (ins), "trap", IIC_LdStLoad, [(trap)]>;
def TWI : DForm_base<3, (outs), (ins u5imm:$to, gprc:$rA, s16imm:$imm),
- "twi $to, $rA, $imm", IntTrapW, []>;
+ "twi $to, $rA, $imm", IIC_IntTrapW, []>;
def TW : XForm_1<31, 4, (outs), (ins u5imm:$to, gprc:$rA, gprc:$rB),
- "tw $to, $rA, $rB", IntTrapW, []>;
+ "tw $to, $rA, $rB", IIC_IntTrapW, []>;
def TDI : DForm_base<2, (outs), (ins u5imm:$to, g8rc:$rA, s16imm:$imm),
- "tdi $to, $rA, $imm", IntTrapD, []>;
+ "tdi $to, $rA, $imm", IIC_IntTrapD, []>;
def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB),
- "td $to, $rA, $rB", IntTrapD, []>;
+ "td $to, $rA, $rB", IIC_IntTrapD, []>;
//===----------------------------------------------------------------------===//
// PPC32 Load Instructions.
@@ -1269,56 +1359,56 @@ def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB),
// Unindexed (r+i) Loads.
let canFoldAsLoad = 1, PPC970_Unit = 2 in {
def LBZ : DForm_1<34, (outs gprc:$rD), (ins memri:$src),
- "lbz $rD, $src", LdStLoad,
+ "lbz $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (zextloadi8 iaddr:$src))]>;
def LHA : DForm_1<42, (outs gprc:$rD), (ins memri:$src),
- "lha $rD, $src", LdStLHA,
+ "lha $rD, $src", IIC_LdStLHA,
[(set i32:$rD, (sextloadi16 iaddr:$src))]>,
PPC970_DGroup_Cracked;
def LHZ : DForm_1<40, (outs gprc:$rD), (ins memri:$src),
- "lhz $rD, $src", LdStLoad,
+ "lhz $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (zextloadi16 iaddr:$src))]>;
def LWZ : DForm_1<32, (outs gprc:$rD), (ins memri:$src),
- "lwz $rD, $src", LdStLoad,
+ "lwz $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (load iaddr:$src))]>;
def LFS : DForm_1<48, (outs f4rc:$rD), (ins memri:$src),
- "lfs $rD, $src", LdStLFD,
+ "lfs $rD, $src", IIC_LdStLFD,
[(set f32:$rD, (load iaddr:$src))]>;
def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src),
- "lfd $rD, $src", LdStLFD,
+ "lfd $rD, $src", IIC_LdStLFD,
[(set f64:$rD, (load iaddr:$src))]>;
// Unindexed (r+i) Loads with Update (preinc).
let mayLoad = 1, neverHasSideEffects = 1 in {
def LBZU : DForm_1<35, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
- "lbzu $rD, $addr", LdStLoadUpd,
+ "lbzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LHAU : DForm_1<43, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
- "lhau $rD, $addr", LdStLHAU,
+ "lhau $rD, $addr", IIC_LdStLHAU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LHZU : DForm_1<41, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
- "lhzu $rD, $addr", LdStLoadUpd,
+ "lhzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LWZU : DForm_1<33, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
- "lwzu $rD, $addr", LdStLoadUpd,
+ "lwzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LFSU : DForm_1<49, (outs f4rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
- "lfsu $rD, $addr", LdStLFDU,
+ "lfsu $rD, $addr", IIC_LdStLFDU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LFDU : DForm_1<51, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
- "lfdu $rD, $addr", LdStLFDU,
+ "lfdu $rD, $addr", IIC_LdStLFDU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
@@ -1326,37 +1416,37 @@ def LFDU : DForm_1<51, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr
// Indexed (r+r) Loads with Update (preinc).
def LBZUX : XForm_1<31, 119, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
- "lbzux $rD, $addr", LdStLoadUpd,
+ "lbzux $rD, $addr", IIC_LdStLoadUpdX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
def LHAUX : XForm_1<31, 375, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
- "lhaux $rD, $addr", LdStLHAU,
+ "lhaux $rD, $addr", IIC_LdStLHAUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
def LHZUX : XForm_1<31, 311, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
- "lhzux $rD, $addr", LdStLoadUpd,
+ "lhzux $rD, $addr", IIC_LdStLoadUpdX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
def LWZUX : XForm_1<31, 55, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
- "lwzux $rD, $addr", LdStLoadUpd,
+ "lwzux $rD, $addr", IIC_LdStLoadUpdX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
def LFSUX : XForm_1<31, 567, (outs f4rc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
- "lfsux $rD, $addr", LdStLFDU,
+ "lfsux $rD, $addr", IIC_LdStLFDUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
(ins memrr:$addr),
- "lfdux $rD, $addr", LdStLFDU,
+ "lfdux $rD, $addr", IIC_LdStLFDUX,
[]>, RegConstraint<"$addr.ptrreg = $ea_result">,
NoEncode<"$ea_result">;
}
@@ -1366,45 +1456,45 @@ def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
//
let canFoldAsLoad = 1, PPC970_Unit = 2 in {
def LBZX : XForm_1<31, 87, (outs gprc:$rD), (ins memrr:$src),
- "lbzx $rD, $src", LdStLoad,
+ "lbzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (zextloadi8 xaddr:$src))]>;
def LHAX : XForm_1<31, 343, (outs gprc:$rD), (ins memrr:$src),
- "lhax $rD, $src", LdStLHA,
+ "lhax $rD, $src", IIC_LdStLHA,
[(set i32:$rD, (sextloadi16 xaddr:$src))]>,
PPC970_DGroup_Cracked;
def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src),
- "lhzx $rD, $src", LdStLoad,
+ "lhzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (zextloadi16 xaddr:$src))]>;
def LWZX : XForm_1<31, 23, (outs gprc:$rD), (ins memrr:$src),
- "lwzx $rD, $src", LdStLoad,
+ "lwzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (load xaddr:$src))]>;
def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src),
- "lhbrx $rD, $src", LdStLoad,
+ "lhbrx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>;
def LWBRX : XForm_1<31, 534, (outs gprc:$rD), (ins memrr:$src),
- "lwbrx $rD, $src", LdStLoad,
+ "lwbrx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (PPClbrx xoaddr:$src, i32))]>;
def LFSX : XForm_25<31, 535, (outs f4rc:$frD), (ins memrr:$src),
- "lfsx $frD, $src", LdStLFD,
+ "lfsx $frD, $src", IIC_LdStLFD,
[(set f32:$frD, (load xaddr:$src))]>;
def LFDX : XForm_25<31, 599, (outs f8rc:$frD), (ins memrr:$src),
- "lfdx $frD, $src", LdStLFD,
+ "lfdx $frD, $src", IIC_LdStLFD,
[(set f64:$frD, (load xaddr:$src))]>;
def LFIWAX : XForm_25<31, 855, (outs f8rc:$frD), (ins memrr:$src),
- "lfiwax $frD, $src", LdStLFD,
+ "lfiwax $frD, $src", IIC_LdStLFD,
[(set f64:$frD, (PPClfiwax xoaddr:$src))]>;
def LFIWZX : XForm_25<31, 887, (outs f8rc:$frD), (ins memrr:$src),
- "lfiwzx $frD, $src", LdStLFD,
+ "lfiwzx $frD, $src", IIC_LdStLFD,
[(set f64:$frD, (PPClfiwzx xoaddr:$src))]>;
}
// Load Multiple
def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
- "lmw $rD, $src", LdStLMW, []>;
+ "lmw $rD, $src", IIC_LdStLMW, []>;
//===----------------------------------------------------------------------===//
// PPC32 Store Instructions.
@@ -1413,38 +1503,38 @@ def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
// Unindexed (r+i) Stores.
let PPC970_Unit = 2 in {
def STB : DForm_1<38, (outs), (ins gprc:$rS, memri:$src),
- "stb $rS, $src", LdStStore,
+ "stb $rS, $src", IIC_LdStStore,
[(truncstorei8 i32:$rS, iaddr:$src)]>;
def STH : DForm_1<44, (outs), (ins gprc:$rS, memri:$src),
- "sth $rS, $src", LdStStore,
+ "sth $rS, $src", IIC_LdStStore,
[(truncstorei16 i32:$rS, iaddr:$src)]>;
def STW : DForm_1<36, (outs), (ins gprc:$rS, memri:$src),
- "stw $rS, $src", LdStStore,
+ "stw $rS, $src", IIC_LdStStore,
[(store i32:$rS, iaddr:$src)]>;
def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst),
- "stfs $rS, $dst", LdStSTFD,
+ "stfs $rS, $dst", IIC_LdStSTFD,
[(store f32:$rS, iaddr:$dst)]>;
def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst),
- "stfd $rS, $dst", LdStSTFD,
+ "stfd $rS, $dst", IIC_LdStSTFD,
[(store f64:$rS, iaddr:$dst)]>;
}
// Unindexed (r+i) Stores with Update (preinc).
let PPC970_Unit = 2, mayStore = 1 in {
def STBU : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
- "stbu $rS, $dst", LdStStoreUpd, []>,
+ "stbu $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
def STHU : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
- "sthu $rS, $dst", LdStStoreUpd, []>,
+ "sthu $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
def STWU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
- "stwu $rS, $dst", LdStStoreUpd, []>,
+ "stwu $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memri:$dst),
- "stfsu $rS, $dst", LdStSTFDU, []>,
+ "stfsu $rS, $dst", IIC_LdStSTFDU, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memri:$dst),
- "stfdu $rS, $dst", LdStSTFDU, []>,
+ "stfdu $rS, $dst", IIC_LdStSTFDU, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
}
@@ -1465,59 +1555,59 @@ def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
// Indexed (r+r) Stores.
let PPC970_Unit = 2 in {
def STBX : XForm_8<31, 215, (outs), (ins gprc:$rS, memrr:$dst),
- "stbx $rS, $dst", LdStStore,
+ "stbx $rS, $dst", IIC_LdStStore,
[(truncstorei8 i32:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
def STHX : XForm_8<31, 407, (outs), (ins gprc:$rS, memrr:$dst),
- "sthx $rS, $dst", LdStStore,
+ "sthx $rS, $dst", IIC_LdStStore,
[(truncstorei16 i32:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
def STWX : XForm_8<31, 151, (outs), (ins gprc:$rS, memrr:$dst),
- "stwx $rS, $dst", LdStStore,
+ "stwx $rS, $dst", IIC_LdStStore,
[(store i32:$rS, xaddr:$dst)]>,
PPC970_DGroup_Cracked;
def STHBRX: XForm_8<31, 918, (outs), (ins gprc:$rS, memrr:$dst),
- "sthbrx $rS, $dst", LdStStore,
+ "sthbrx $rS, $dst", IIC_LdStStore,
[(PPCstbrx i32:$rS, xoaddr:$dst, i16)]>,
PPC970_DGroup_Cracked;
def STWBRX: XForm_8<31, 662, (outs), (ins gprc:$rS, memrr:$dst),
- "stwbrx $rS, $dst", LdStStore,
+ "stwbrx $rS, $dst", IIC_LdStStore,
[(PPCstbrx i32:$rS, xoaddr:$dst, i32)]>,
PPC970_DGroup_Cracked;
def STFIWX: XForm_28<31, 983, (outs), (ins f8rc:$frS, memrr:$dst),
- "stfiwx $frS, $dst", LdStSTFD,
+ "stfiwx $frS, $dst", IIC_LdStSTFD,
[(PPCstfiwx f64:$frS, xoaddr:$dst)]>;
def STFSX : XForm_28<31, 663, (outs), (ins f4rc:$frS, memrr:$dst),
- "stfsx $frS, $dst", LdStSTFD,
+ "stfsx $frS, $dst", IIC_LdStSTFD,
[(store f32:$frS, xaddr:$dst)]>;
def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
- "stfdx $frS, $dst", LdStSTFD,
+ "stfdx $frS, $dst", IIC_LdStSTFD,
[(store f64:$frS, xaddr:$dst)]>;
}
// Indexed (r+r) Stores with Update (preinc).
let PPC970_Unit = 2, mayStore = 1 in {
def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
- "stbux $rS, $dst", LdStStoreUpd, []>,
+ "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
def STHUX : XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
- "sthux $rS, $dst", LdStStoreUpd, []>,
+ "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
def STWUX : XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
- "stwux $rS, $dst", LdStStoreUpd, []>,
+ "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
def STFSUX: XForm_8<31, 695, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memrr:$dst),
- "stfsux $rS, $dst", LdStSTFDU, []>,
+ "stfsux $rS, $dst", IIC_LdStSTFDU, []>,
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
def STFDUX: XForm_8<31, 759, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memrr:$dst),
- "stfdux $rS, $dst", LdStSTFDU, []>,
+ "stfdux $rS, $dst", IIC_LdStSTFDU, []>,
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
PPC970_DGroup_Cracked;
}
@@ -1538,11 +1628,20 @@ def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
// Store Multiple
def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst),
- "stmw $rS, $dst", LdStLMW, []>;
+ "stmw $rS, $dst", IIC_LdStLMW, []>;
def SYNC : XForm_24_sync<31, 598, (outs), (ins i32imm:$L),
- "sync $L", LdStSync, []>;
-def : Pat<(int_ppc_sync), (SYNC 0)>;
+ "sync $L", IIC_LdStSync, []>, Requires<[IsNotBookE]>;
+
+let isCodeGenOnly = 1 in {
+ def MSYNC : XForm_24_sync<31, 598, (outs), (ins),
+ "msync", IIC_LdStSync, []>, Requires<[IsBookE]> {
+ let L = 0;
+ }
+}
+
+def : Pat<(int_ppc_sync), (SYNC 0)>, Requires<[IsNotBookE]>;
+def : Pat<(int_ppc_sync), (MSYNC)>, Requires<[IsBookE]>;
//===----------------------------------------------------------------------===//
// PPC32 Arithmetic Instructions.
@@ -1550,41 +1649,41 @@ def : Pat<(int_ppc_sync), (SYNC 0)>;
let PPC970_Unit = 1 in { // FXU Operations.
def ADDI : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$imm),
- "addi $rD, $rA, $imm", IntSimple,
+ "addi $rD, $rA, $imm", IIC_IntSimple,
[(set i32:$rD, (add i32:$rA, imm32SExt16:$imm))]>;
let BaseName = "addic" in {
let Defs = [CARRY] in
def ADDIC : DForm_2<12, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
- "addic $rD, $rA, $imm", IntGeneral,
+ "addic $rD, $rA, $imm", IIC_IntGeneral,
[(set i32:$rD, (addc i32:$rA, imm32SExt16:$imm))]>,
RecFormRel, PPC970_DGroup_Cracked;
let Defs = [CARRY, CR0] in
def ADDICo : DForm_2<13, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
- "addic. $rD, $rA, $imm", IntGeneral,
+ "addic. $rD, $rA, $imm", IIC_IntGeneral,
[]>, isDOT, RecFormRel;
}
def ADDIS : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, s17imm:$imm),
- "addis $rD, $rA, $imm", IntSimple,
+ "addis $rD, $rA, $imm", IIC_IntSimple,
[(set i32:$rD, (add i32:$rA, imm16ShiftedSExt:$imm))]>;
let isCodeGenOnly = 1 in
def LA : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$sym),
- "la $rD, $sym($rA)", IntGeneral,
+ "la $rD, $sym($rA)", IIC_IntGeneral,
[(set i32:$rD, (add i32:$rA,
(PPClo tglobaladdr:$sym, 0)))]>;
def MULLI : DForm_2< 7, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
- "mulli $rD, $rA, $imm", IntMulLI,
+ "mulli $rD, $rA, $imm", IIC_IntMulLI,
[(set i32:$rD, (mul i32:$rA, imm32SExt16:$imm))]>;
let Defs = [CARRY] in
def SUBFIC : DForm_2< 8, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
- "subfic $rD, $rA, $imm", IntGeneral,
+ "subfic $rD, $rA, $imm", IIC_IntGeneral,
[(set i32:$rD, (subc imm32SExt16:$imm, i32:$rA))]>;
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
def LI : DForm_2_r0<14, (outs gprc:$rD), (ins s16imm:$imm),
- "li $rD, $imm", IntSimple,
+ "li $rD, $imm", IIC_IntSimple,
[(set i32:$rD, imm32SExt16:$imm)]>;
def LIS : DForm_2_r0<15, (outs gprc:$rD), (ins s17imm:$imm),
- "lis $rD, $imm", IntSimple,
+ "lis $rD, $imm", IIC_IntSimple,
[(set i32:$rD, imm16ShiftedSExt:$imm)]>;
}
}
@@ -1592,154 +1691,170 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
let PPC970_Unit = 1 in { // FXU Operations.
let Defs = [CR0] in {
def ANDIo : DForm_4<28, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
- "andi. $dst, $src1, $src2", IntGeneral,
+ "andi. $dst, $src1, $src2", IIC_IntGeneral,
[(set i32:$dst, (and i32:$src1, immZExt16:$src2))]>,
isDOT;
def ANDISo : DForm_4<29, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
- "andis. $dst, $src1, $src2", IntGeneral,
+ "andis. $dst, $src1, $src2", IIC_IntGeneral,
[(set i32:$dst, (and i32:$src1, imm16ShiftedZExt:$src2))]>,
isDOT;
}
def ORI : DForm_4<24, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
- "ori $dst, $src1, $src2", IntSimple,
+ "ori $dst, $src1, $src2", IIC_IntSimple,
[(set i32:$dst, (or i32:$src1, immZExt16:$src2))]>;
def ORIS : DForm_4<25, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
- "oris $dst, $src1, $src2", IntSimple,
+ "oris $dst, $src1, $src2", IIC_IntSimple,
[(set i32:$dst, (or i32:$src1, imm16ShiftedZExt:$src2))]>;
def XORI : DForm_4<26, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
- "xori $dst, $src1, $src2", IntSimple,
+ "xori $dst, $src1, $src2", IIC_IntSimple,
[(set i32:$dst, (xor i32:$src1, immZExt16:$src2))]>;
def XORIS : DForm_4<27, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
- "xoris $dst, $src1, $src2", IntSimple,
+ "xoris $dst, $src1, $src2", IIC_IntSimple,
[(set i32:$dst, (xor i32:$src1, imm16ShiftedZExt:$src2))]>;
-def NOP : DForm_4_zero<24, (outs), (ins), "nop", IntSimple,
+
+def NOP : DForm_4_zero<24, (outs), (ins), "nop", IIC_IntSimple,
[]>;
+let isCodeGenOnly = 1 in {
+// The POWER6 and POWER7 have special group-terminating nops.
+def NOP_GT_PWR6 : DForm_4_fixedreg_zero<24, 1, (outs), (ins),
+ "ori 1, 1, 0", IIC_IntSimple, []>;
+def NOP_GT_PWR7 : DForm_4_fixedreg_zero<24, 2, (outs), (ins),
+ "ori 2, 2, 0", IIC_IntSimple, []>;
+}
+
let isCompare = 1, neverHasSideEffects = 1 in {
def CMPWI : DForm_5_ext<11, (outs crrc:$crD), (ins gprc:$rA, s16imm:$imm),
- "cmpwi $crD, $rA, $imm", IntCompare>;
+ "cmpwi $crD, $rA, $imm", IIC_IntCompare>;
def CMPLWI : DForm_6_ext<10, (outs crrc:$dst), (ins gprc:$src1, u16imm:$src2),
- "cmplwi $dst, $src1, $src2", IntCompare>;
+ "cmplwi $dst, $src1, $src2", IIC_IntCompare>;
}
}
let PPC970_Unit = 1, neverHasSideEffects = 1 in { // FXU Operations.
+let isCommutable = 1 in {
defm NAND : XForm_6r<31, 476, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
- "nand", "$rA, $rS, $rB", IntSimple,
+ "nand", "$rA, $rS, $rB", IIC_IntSimple,
[(set i32:$rA, (not (and i32:$rS, i32:$rB)))]>;
defm AND : XForm_6r<31, 28, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
- "and", "$rA, $rS, $rB", IntSimple,
+ "and", "$rA, $rS, $rB", IIC_IntSimple,
[(set i32:$rA, (and i32:$rS, i32:$rB))]>;
+} // isCommutable
defm ANDC : XForm_6r<31, 60, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
- "andc", "$rA, $rS, $rB", IntSimple,
+ "andc", "$rA, $rS, $rB", IIC_IntSimple,
[(set i32:$rA, (and i32:$rS, (not i32:$rB)))]>;
+let isCommutable = 1 in {
defm OR : XForm_6r<31, 444, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
- "or", "$rA, $rS, $rB", IntSimple,
+ "or", "$rA, $rS, $rB", IIC_IntSimple,
[(set i32:$rA, (or i32:$rS, i32:$rB))]>;
defm NOR : XForm_6r<31, 124, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
- "nor", "$rA, $rS, $rB", IntSimple,
+ "nor", "$rA, $rS, $rB", IIC_IntSimple,
[(set i32:$rA, (not (or i32:$rS, i32:$rB)))]>;
+} // isCommutable
defm ORC : XForm_6r<31, 412, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
- "orc", "$rA, $rS, $rB", IntSimple,
+ "orc", "$rA, $rS, $rB", IIC_IntSimple,
[(set i32:$rA, (or i32:$rS, (not i32:$rB)))]>;
+let isCommutable = 1 in {
defm EQV : XForm_6r<31, 284, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
- "eqv", "$rA, $rS, $rB", IntSimple,
+ "eqv", "$rA, $rS, $rB", IIC_IntSimple,
[(set i32:$rA, (not (xor i32:$rS, i32:$rB)))]>;
defm XOR : XForm_6r<31, 316, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
- "xor", "$rA, $rS, $rB", IntSimple,
+ "xor", "$rA, $rS, $rB", IIC_IntSimple,
[(set i32:$rA, (xor i32:$rS, i32:$rB))]>;
+} // isCommutable
defm SLW : XForm_6r<31, 24, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
- "slw", "$rA, $rS, $rB", IntGeneral,
+ "slw", "$rA, $rS, $rB", IIC_IntGeneral,
[(set i32:$rA, (PPCshl i32:$rS, i32:$rB))]>;
defm SRW : XForm_6r<31, 536, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
- "srw", "$rA, $rS, $rB", IntGeneral,
+ "srw", "$rA, $rS, $rB", IIC_IntGeneral,
[(set i32:$rA, (PPCsrl i32:$rS, i32:$rB))]>;
defm SRAW : XForm_6rc<31, 792, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
- "sraw", "$rA, $rS, $rB", IntShift,
+ "sraw", "$rA, $rS, $rB", IIC_IntShift,
[(set i32:$rA, (PPCsra i32:$rS, i32:$rB))]>;
}
let PPC970_Unit = 1 in { // FXU Operations.
let neverHasSideEffects = 1 in {
defm SRAWI : XForm_10rc<31, 824, (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH),
- "srawi", "$rA, $rS, $SH", IntShift,
+ "srawi", "$rA, $rS, $SH", IIC_IntShift,
[(set i32:$rA, (sra i32:$rS, (i32 imm:$SH)))]>;
defm CNTLZW : XForm_11r<31, 26, (outs gprc:$rA), (ins gprc:$rS),
- "cntlzw", "$rA, $rS", IntGeneral,
+ "cntlzw", "$rA, $rS", IIC_IntGeneral,
[(set i32:$rA, (ctlz i32:$rS))]>;
defm EXTSB : XForm_11r<31, 954, (outs gprc:$rA), (ins gprc:$rS),
- "extsb", "$rA, $rS", IntSimple,
+ "extsb", "$rA, $rS", IIC_IntSimple,
[(set i32:$rA, (sext_inreg i32:$rS, i8))]>;
defm EXTSH : XForm_11r<31, 922, (outs gprc:$rA), (ins gprc:$rS),
- "extsh", "$rA, $rS", IntSimple,
+ "extsh", "$rA, $rS", IIC_IntSimple,
[(set i32:$rA, (sext_inreg i32:$rS, i16))]>;
}
let isCompare = 1, neverHasSideEffects = 1 in {
def CMPW : XForm_16_ext<31, 0, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
- "cmpw $crD, $rA, $rB", IntCompare>;
+ "cmpw $crD, $rA, $rB", IIC_IntCompare>;
def CMPLW : XForm_16_ext<31, 32, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
- "cmplw $crD, $rA, $rB", IntCompare>;
+ "cmplw $crD, $rA, $rB", IIC_IntCompare>;
}
}
let PPC970_Unit = 3 in { // FPU Operations.
//def FCMPO : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
-// "fcmpo $crD, $fA, $fB", FPCompare>;
+// "fcmpo $crD, $fA, $fB", IIC_FPCompare>;
let isCompare = 1, neverHasSideEffects = 1 in {
def FCMPUS : XForm_17<63, 0, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB),
- "fcmpu $crD, $fA, $fB", FPCompare>;
+ "fcmpu $crD, $fA, $fB", IIC_FPCompare>;
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
def FCMPUD : XForm_17<63, 0, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
- "fcmpu $crD, $fA, $fB", FPCompare>;
+ "fcmpu $crD, $fA, $fB", IIC_FPCompare>;
}
let Uses = [RM] in {
let neverHasSideEffects = 1 in {
defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
- "fctiw", "$frD, $frB", FPGeneral,
+ "fctiw", "$frD, $frB", IIC_FPGeneral,
[]>;
defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB),
- "fctiwz", "$frD, $frB", FPGeneral,
+ "fctiwz", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (PPCfctiwz f64:$frB))]>;
defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB),
- "frsp", "$frD, $frB", FPGeneral,
+ "frsp", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (fround f64:$frB))]>;
- let Interpretation64Bit = 1 in
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FRIND : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB),
- "frin", "$frD, $frB", FPGeneral,
+ "frin", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (frnd f64:$frB))]>;
defm FRINS : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB),
- "frin", "$frD, $frB", FPGeneral,
+ "frin", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (frnd f32:$frB))]>;
}
let neverHasSideEffects = 1 in {
- let Interpretation64Bit = 1 in
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FRIPD : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB),
- "frip", "$frD, $frB", FPGeneral,
+ "frip", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (fceil f64:$frB))]>;
defm FRIPS : XForm_26r<63, 456, (outs f4rc:$frD), (ins f4rc:$frB),
- "frip", "$frD, $frB", FPGeneral,
+ "frip", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (fceil f32:$frB))]>;
- let Interpretation64Bit = 1 in
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FRIZD : XForm_26r<63, 424, (outs f8rc:$frD), (ins f8rc:$frB),
- "friz", "$frD, $frB", FPGeneral,
+ "friz", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (ftrunc f64:$frB))]>;
defm FRIZS : XForm_26r<63, 424, (outs f4rc:$frD), (ins f4rc:$frB),
- "friz", "$frD, $frB", FPGeneral,
+ "friz", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (ftrunc f32:$frB))]>;
- let Interpretation64Bit = 1 in
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FRIMD : XForm_26r<63, 488, (outs f8rc:$frD), (ins f8rc:$frB),
- "frim", "$frD, $frB", FPGeneral,
+ "frim", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (ffloor f64:$frB))]>;
defm FRIMS : XForm_26r<63, 488, (outs f4rc:$frD), (ins f4rc:$frB),
- "frim", "$frD, $frB", FPGeneral,
+ "frim", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (ffloor f32:$frB))]>;
defm FSQRT : XForm_26r<63, 22, (outs f8rc:$frD), (ins f8rc:$frB),
- "fsqrt", "$frD, $frB", FPSqrt,
+ "fsqrt", "$frD, $frB", IIC_FPSqrtD,
[(set f64:$frD, (fsqrt f64:$frB))]>;
defm FSQRTS : XForm_26r<59, 22, (outs f4rc:$frD), (ins f4rc:$frB),
- "fsqrts", "$frD, $frB", FPSqrt,
+ "fsqrts", "$frD, $frB", IIC_FPSqrtS,
[(set f32:$frD, (fsqrt f32:$frB))]>;
}
}
@@ -1751,54 +1866,54 @@ let Uses = [RM] in {
/// sneak into a d-group with a store).
let neverHasSideEffects = 1 in
defm FMR : XForm_26r<63, 72, (outs f4rc:$frD), (ins f4rc:$frB),
- "fmr", "$frD, $frB", FPGeneral,
+ "fmr", "$frD, $frB", IIC_FPGeneral,
[]>, // (set f32:$frD, f32:$frB)
PPC970_Unit_Pseudo;
let PPC970_Unit = 3, neverHasSideEffects = 1 in { // FPU Operations.
// These are artificially split into two different forms, for 4/8 byte FP.
defm FABSS : XForm_26r<63, 264, (outs f4rc:$frD), (ins f4rc:$frB),
- "fabs", "$frD, $frB", FPGeneral,
+ "fabs", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (fabs f32:$frB))]>;
-let Interpretation64Bit = 1 in
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FABSD : XForm_26r<63, 264, (outs f8rc:$frD), (ins f8rc:$frB),
- "fabs", "$frD, $frB", FPGeneral,
+ "fabs", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (fabs f64:$frB))]>;
defm FNABSS : XForm_26r<63, 136, (outs f4rc:$frD), (ins f4rc:$frB),
- "fnabs", "$frD, $frB", FPGeneral,
+ "fnabs", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (fneg (fabs f32:$frB)))]>;
-let Interpretation64Bit = 1 in
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FNABSD : XForm_26r<63, 136, (outs f8rc:$frD), (ins f8rc:$frB),
- "fnabs", "$frD, $frB", FPGeneral,
+ "fnabs", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (fneg (fabs f64:$frB)))]>;
defm FNEGS : XForm_26r<63, 40, (outs f4rc:$frD), (ins f4rc:$frB),
- "fneg", "$frD, $frB", FPGeneral,
+ "fneg", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (fneg f32:$frB))]>;
-let Interpretation64Bit = 1 in
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FNEGD : XForm_26r<63, 40, (outs f8rc:$frD), (ins f8rc:$frB),
- "fneg", "$frD, $frB", FPGeneral,
+ "fneg", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (fneg f64:$frB))]>;
defm FCPSGNS : XForm_28r<63, 8, (outs f4rc:$frD), (ins f4rc:$frA, f4rc:$frB),
- "fcpsgn", "$frD, $frA, $frB", FPGeneral,
+ "fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
[(set f32:$frD, (fcopysign f32:$frB, f32:$frA))]>;
-let Interpretation64Bit = 1 in
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB),
- "fcpsgn", "$frD, $frA, $frB", FPGeneral,
+ "fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
[(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>;
// Reciprocal estimates.
defm FRE : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB),
- "fre", "$frD, $frB", FPGeneral,
+ "fre", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (PPCfre f64:$frB))]>;
defm FRES : XForm_26r<59, 24, (outs f4rc:$frD), (ins f4rc:$frB),
- "fres", "$frD, $frB", FPGeneral,
+ "fres", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (PPCfre f32:$frB))]>;
defm FRSQRTE : XForm_26r<63, 26, (outs f8rc:$frD), (ins f8rc:$frB),
- "frsqrte", "$frD, $frB", FPGeneral,
+ "frsqrte", "$frD, $frB", IIC_FPGeneral,
[(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
defm FRSQRTES : XForm_26r<59, 26, (outs f4rc:$frD), (ins f4rc:$frB),
- "frsqrtes", "$frD, $frB", FPGeneral,
+ "frsqrtes", "$frD, $frB", IIC_FPGeneral,
[(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
}
@@ -1806,57 +1921,67 @@ defm FRSQRTES : XForm_26r<59, 26, (outs f4rc:$frD), (ins f4rc:$frB),
//
let neverHasSideEffects = 1 in
def MCRF : XLForm_3<19, 0, (outs crrc:$BF), (ins crrc:$BFA),
- "mcrf $BF, $BFA", BrMCR>,
+ "mcrf $BF, $BFA", IIC_BrMCR>,
PPC970_DGroup_First, PPC970_Unit_CRU;
+let isCommutable = 1 in {
def CRAND : XLForm_1<19, 257, (outs crbitrc:$CRD),
(ins crbitrc:$CRA, crbitrc:$CRB),
- "crand $CRD, $CRA, $CRB", BrCR, []>;
+ "crand $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (and i1:$CRA, i1:$CRB))]>;
def CRNAND : XLForm_1<19, 225, (outs crbitrc:$CRD),
(ins crbitrc:$CRA, crbitrc:$CRB),
- "crnand $CRD, $CRA, $CRB", BrCR, []>;
+ "crnand $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (not (and i1:$CRA, i1:$CRB)))]>;
def CROR : XLForm_1<19, 449, (outs crbitrc:$CRD),
(ins crbitrc:$CRA, crbitrc:$CRB),
- "cror $CRD, $CRA, $CRB", BrCR, []>;
+ "cror $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (or i1:$CRA, i1:$CRB))]>;
def CRXOR : XLForm_1<19, 193, (outs crbitrc:$CRD),
(ins crbitrc:$CRA, crbitrc:$CRB),
- "crxor $CRD, $CRA, $CRB", BrCR, []>;
+ "crxor $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (xor i1:$CRA, i1:$CRB))]>;
def CRNOR : XLForm_1<19, 33, (outs crbitrc:$CRD),
(ins crbitrc:$CRA, crbitrc:$CRB),
- "crnor $CRD, $CRA, $CRB", BrCR, []>;
+ "crnor $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (not (or i1:$CRA, i1:$CRB)))]>;
def CREQV : XLForm_1<19, 289, (outs crbitrc:$CRD),
(ins crbitrc:$CRA, crbitrc:$CRB),
- "creqv $CRD, $CRA, $CRB", BrCR, []>;
+ "creqv $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (not (xor i1:$CRA, i1:$CRB)))]>;
+} // isCommutable
def CRANDC : XLForm_1<19, 129, (outs crbitrc:$CRD),
(ins crbitrc:$CRA, crbitrc:$CRB),
- "crandc $CRD, $CRA, $CRB", BrCR, []>;
+ "crandc $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (and i1:$CRA, (not i1:$CRB)))]>;
def CRORC : XLForm_1<19, 417, (outs crbitrc:$CRD),
(ins crbitrc:$CRA, crbitrc:$CRB),
- "crorc $CRD, $CRA, $CRB", BrCR, []>;
+ "crorc $CRD, $CRA, $CRB", IIC_BrCR,
+ [(set i1:$CRD, (or i1:$CRA, (not i1:$CRB)))]>;
let isCodeGenOnly = 1 in {
def CRSET : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins),
- "creqv $dst, $dst, $dst", BrCR,
- []>;
+ "creqv $dst, $dst, $dst", IIC_BrCR,
+ [(set i1:$dst, 1)]>;
def CRUNSET: XLForm_1_ext<19, 193, (outs crbitrc:$dst), (ins),
- "crxor $dst, $dst, $dst", BrCR,
- []>;
+ "crxor $dst, $dst, $dst", IIC_BrCR,
+ [(set i1:$dst, 0)]>;
let Defs = [CR1EQ], CRD = 6 in {
def CR6SET : XLForm_1_ext<19, 289, (outs), (ins),
- "creqv 6, 6, 6", BrCR,
+ "creqv 6, 6, 6", IIC_BrCR,
[(PPCcr6set)]>;
def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
- "crxor 6, 6, 6", BrCR,
+ "crxor 6, 6, 6", IIC_BrCR,
[(PPCcr6unset)]>;
}
}
@@ -1865,38 +1990,38 @@ def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
//
def MFSPR : XFXForm_1<31, 339, (outs gprc:$RT), (ins i32imm:$SPR),
- "mfspr $RT, $SPR", SprMFSPR>;
+ "mfspr $RT, $SPR", IIC_SprMFSPR>;
def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT),
- "mtspr $SPR, $RT", SprMTSPR>;
+ "mtspr $SPR, $RT", IIC_SprMTSPR>;
def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR),
- "mftb $RT, $SPR", SprMFTB>, Deprecated<DeprecatedMFTB>;
+ "mftb $RT, $SPR", IIC_SprMFTB>, Deprecated<DeprecatedMFTB>;
let Uses = [CTR] in {
def MFCTR : XFXForm_1_ext<31, 339, 9, (outs gprc:$rT), (ins),
- "mfctr $rT", SprMFSPR>,
+ "mfctr $rT", IIC_SprMFSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
let Defs = [CTR], Pattern = [(PPCmtctr i32:$rS)] in {
def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
- "mtctr $rS", SprMTSPR>,
+ "mtctr $rS", IIC_SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
let Pattern = [(int_ppc_mtctr i32:$rS)] in
def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
- "mtctr $rS", SprMTSPR>,
+ "mtctr $rS", IIC_SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
let Defs = [LR] in {
def MTLR : XFXForm_7_ext<31, 467, 8, (outs), (ins gprc:$rS),
- "mtlr $rS", SprMTSPR>,
+ "mtlr $rS", IIC_SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
let Uses = [LR] in {
def MFLR : XFXForm_1_ext<31, 339, 8, (outs gprc:$rT), (ins),
- "mflr $rT", SprMFSPR>,
+ "mflr $rT", IIC_SprMFSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
@@ -1905,19 +2030,19 @@ let isCodeGenOnly = 1 in {
// like a GPR on the PPC970. As such, copies in and out have the same
// performance characteristics as an OR instruction.
def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins gprc:$rS),
- "mtspr 256, $rS", IntGeneral>,
+ "mtspr 256, $rS", IIC_IntGeneral>,
PPC970_DGroup_Single, PPC970_Unit_FXU;
def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT), (ins),
- "mfspr $rT, 256", IntGeneral>,
+ "mfspr $rT, 256", IIC_IntGeneral>,
PPC970_DGroup_First, PPC970_Unit_FXU;
def MTVRSAVEv : XFXForm_7_ext<31, 467, 256,
(outs VRSAVERC:$reg), (ins gprc:$rS),
- "mtspr 256, $rS", IntGeneral>,
+ "mtspr 256, $rS", IIC_IntGeneral>,
PPC970_DGroup_Single, PPC970_Unit_FXU;
def MFVRSAVEv : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT),
(ins VRSAVERC:$reg),
- "mfspr $rT, 256", IntGeneral>,
+ "mfspr $rT, 256", IIC_IntGeneral>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
@@ -1935,20 +2060,20 @@ def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F),
let neverHasSideEffects = 1 in {
def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST),
- "mtocrf $FXM, $ST", BrMCRX>,
+ "mtocrf $FXM, $ST", IIC_BrMCRX>,
PPC970_DGroup_First, PPC970_Unit_CRU;
def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS),
- "mtcrf $FXM, $rS", BrMCRX>,
+ "mtcrf $FXM, $rS", IIC_BrMCRX>,
PPC970_MicroCode, PPC970_Unit_CRU;
let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking.
def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM),
- "mfocrf $rT, $FXM", SprMFCR>,
+ "mfocrf $rT, $FXM", IIC_SprMFCRF>,
PPC970_DGroup_First, PPC970_Unit_CRU;
def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins),
- "mfcr $rT", SprMFCR>,
+ "mfcr $rT", IIC_SprMFCR>,
PPC970_MicroCode, PPC970_Unit_CRU;
} // neverHasSideEffects = 1
@@ -1962,18 +2087,18 @@ let usesCustomInserter = 1, Uses = [RM] in {
// to manipulate FPSCR. Note that FPSCR is not modeled at the DAG level.
let Uses = [RM], Defs = [RM] in {
def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
- "mtfsb0 $FM", IntMTFSB0, []>,
+ "mtfsb0 $FM", IIC_IntMTFSB0, []>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
- "mtfsb1 $FM", IntMTFSB0, []>,
+ "mtfsb1 $FM", IIC_IntMTFSB0, []>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
def MTFSF : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT),
- "mtfsf $FM, $rT", IntMTFSB0, []>,
+ "mtfsf $FM, $rT", IIC_IntMTFSB0, []>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
}
let Uses = [RM] in {
def MFFS : XForm_42<63, 583, (outs f8rc:$rT), (ins),
- "mffs $rT", IntMFFS,
+ "mffs $rT", IIC_IntMFFS,
[(set f64:$rT, (PPCmffs))]>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
}
@@ -1981,59 +2106,68 @@ let Uses = [RM] in {
let PPC970_Unit = 1, neverHasSideEffects = 1 in { // FXU Operations.
// XO-Form instructions. Arithmetic instructions that can set overflow bit
-//
+let isCommutable = 1 in
defm ADD4 : XOForm_1r<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
- "add", "$rT, $rA, $rB", IntSimple,
+ "add", "$rT, $rA, $rB", IIC_IntSimple,
[(set i32:$rT, (add i32:$rA, i32:$rB))]>;
+let isCodeGenOnly = 1 in
+def ADD4TLS : XOForm_1<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, tlsreg32:$rB),
+ "add $rT, $rA, $rB", IIC_IntSimple,
+ [(set i32:$rT, (add i32:$rA, tglobaltlsaddr:$rB))]>;
+let isCommutable = 1 in
defm ADDC : XOForm_1rc<31, 10, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
- "addc", "$rT, $rA, $rB", IntGeneral,
+ "addc", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i32:$rT, (addc i32:$rA, i32:$rB))]>,
PPC970_DGroup_Cracked;
+
defm DIVW : XOForm_1r<31, 491, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
- "divw", "$rT, $rA, $rB", IntDivW,
+ "divw", "$rT, $rA, $rB", IIC_IntDivW,
[(set i32:$rT, (sdiv i32:$rA, i32:$rB))]>,
PPC970_DGroup_First, PPC970_DGroup_Cracked;
defm DIVWU : XOForm_1r<31, 459, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
- "divwu", "$rT, $rA, $rB", IntDivW,
+ "divwu", "$rT, $rA, $rB", IIC_IntDivW,
[(set i32:$rT, (udiv i32:$rA, i32:$rB))]>,
PPC970_DGroup_First, PPC970_DGroup_Cracked;
+let isCommutable = 1 in {
defm MULHW : XOForm_1r<31, 75, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
- "mulhw", "$rT, $rA, $rB", IntMulHW,
+ "mulhw", "$rT, $rA, $rB", IIC_IntMulHW,
[(set i32:$rT, (mulhs i32:$rA, i32:$rB))]>;
defm MULHWU : XOForm_1r<31, 11, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
- "mulhwu", "$rT, $rA, $rB", IntMulHWU,
+ "mulhwu", "$rT, $rA, $rB", IIC_IntMulHWU,
[(set i32:$rT, (mulhu i32:$rA, i32:$rB))]>;
defm MULLW : XOForm_1r<31, 235, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
- "mullw", "$rT, $rA, $rB", IntMulHW,
+ "mullw", "$rT, $rA, $rB", IIC_IntMulHW,
[(set i32:$rT, (mul i32:$rA, i32:$rB))]>;
+} // isCommutable
defm SUBF : XOForm_1r<31, 40, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
- "subf", "$rT, $rA, $rB", IntGeneral,
+ "subf", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i32:$rT, (sub i32:$rB, i32:$rA))]>;
defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
- "subfc", "$rT, $rA, $rB", IntGeneral,
+ "subfc", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i32:$rT, (subc i32:$rB, i32:$rA))]>,
PPC970_DGroup_Cracked;
defm NEG : XOForm_3r<31, 104, 0, (outs gprc:$rT), (ins gprc:$rA),
- "neg", "$rT, $rA", IntSimple,
+ "neg", "$rT, $rA", IIC_IntSimple,
[(set i32:$rT, (ineg i32:$rA))]>;
let Uses = [CARRY] in {
+let isCommutable = 1 in
defm ADDE : XOForm_1rc<31, 138, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
- "adde", "$rT, $rA, $rB", IntGeneral,
+ "adde", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i32:$rT, (adde i32:$rA, i32:$rB))]>;
defm ADDME : XOForm_3rc<31, 234, 0, (outs gprc:$rT), (ins gprc:$rA),
- "addme", "$rT, $rA", IntGeneral,
+ "addme", "$rT, $rA", IIC_IntGeneral,
[(set i32:$rT, (adde i32:$rA, -1))]>;
defm ADDZE : XOForm_3rc<31, 202, 0, (outs gprc:$rT), (ins gprc:$rA),
- "addze", "$rT, $rA", IntGeneral,
+ "addze", "$rT, $rA", IIC_IntGeneral,
[(set i32:$rT, (adde i32:$rA, 0))]>;
defm SUBFE : XOForm_1rc<31, 136, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
- "subfe", "$rT, $rA, $rB", IntGeneral,
+ "subfe", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i32:$rT, (sube i32:$rB, i32:$rA))]>;
defm SUBFME : XOForm_3rc<31, 232, 0, (outs gprc:$rT), (ins gprc:$rA),
- "subfme", "$rT, $rA", IntGeneral,
+ "subfme", "$rT, $rA", IIC_IntGeneral,
[(set i32:$rT, (sube -1, i32:$rA))]>;
defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA),
- "subfze", "$rT, $rA", IntGeneral,
+ "subfze", "$rT, $rA", IIC_IntGeneral,
[(set i32:$rT, (sube 0, i32:$rA))]>;
}
}
@@ -2043,90 +2177,96 @@ defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA),
//
let PPC970_Unit = 3, neverHasSideEffects = 1 in { // FPU Operations.
let Uses = [RM] in {
+let isCommutable = 1 in {
defm FMADD : AForm_1r<63, 29,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
- "fmadd", "$FRT, $FRA, $FRC, $FRB", FPFused,
+ "fmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set f64:$FRT, (fma f64:$FRA, f64:$FRC, f64:$FRB))]>;
defm FMADDS : AForm_1r<59, 29,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
- "fmadds", "$FRT, $FRA, $FRC, $FRB", FPGeneral,
+ "fmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
[(set f32:$FRT, (fma f32:$FRA, f32:$FRC, f32:$FRB))]>;
defm FMSUB : AForm_1r<63, 28,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
- "fmsub", "$FRT, $FRA, $FRC, $FRB", FPFused,
+ "fmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set f64:$FRT,
(fma f64:$FRA, f64:$FRC, (fneg f64:$FRB)))]>;
defm FMSUBS : AForm_1r<59, 28,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
- "fmsubs", "$FRT, $FRA, $FRC, $FRB", FPGeneral,
+ "fmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
[(set f32:$FRT,
(fma f32:$FRA, f32:$FRC, (fneg f32:$FRB)))]>;
defm FNMADD : AForm_1r<63, 31,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
- "fnmadd", "$FRT, $FRA, $FRC, $FRB", FPFused,
+ "fnmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set f64:$FRT,
(fneg (fma f64:$FRA, f64:$FRC, f64:$FRB)))]>;
defm FNMADDS : AForm_1r<59, 31,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
- "fnmadds", "$FRT, $FRA, $FRC, $FRB", FPGeneral,
+ "fnmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
[(set f32:$FRT,
(fneg (fma f32:$FRA, f32:$FRC, f32:$FRB)))]>;
defm FNMSUB : AForm_1r<63, 30,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
- "fnmsub", "$FRT, $FRA, $FRC, $FRB", FPFused,
+ "fnmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set f64:$FRT, (fneg (fma f64:$FRA, f64:$FRC,
(fneg f64:$FRB))))]>;
defm FNMSUBS : AForm_1r<59, 30,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
- "fnmsubs", "$FRT, $FRA, $FRC, $FRB", FPGeneral,
+ "fnmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
[(set f32:$FRT, (fneg (fma f32:$FRA, f32:$FRC,
(fneg f32:$FRB))))]>;
+} // isCommutable
}
// FSEL is artificially split into 4 and 8-byte forms for the result. To avoid
// having 4 of these, force the comparison to always be an 8-byte double (code
// should use an FMRSD if the input comparison value really wants to be a float)
// and 4/8 byte forms for the result and operand type.
-let Interpretation64Bit = 1 in
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FSELD : AForm_1r<63, 23,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
- "fsel", "$FRT, $FRA, $FRC, $FRB", FPGeneral,
+ "fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
[(set f64:$FRT, (PPCfsel f64:$FRA, f64:$FRC, f64:$FRB))]>;
defm FSELS : AForm_1r<63, 23,
(outs f4rc:$FRT), (ins f8rc:$FRA, f4rc:$FRC, f4rc:$FRB),
- "fsel", "$FRT, $FRA, $FRC, $FRB", FPGeneral,
+ "fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
[(set f32:$FRT, (PPCfsel f64:$FRA, f32:$FRC, f32:$FRB))]>;
let Uses = [RM] in {
+ let isCommutable = 1 in {
defm FADD : AForm_2r<63, 21,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
- "fadd", "$FRT, $FRA, $FRB", FPAddSub,
+ "fadd", "$FRT, $FRA, $FRB", IIC_FPAddSub,
[(set f64:$FRT, (fadd f64:$FRA, f64:$FRB))]>;
defm FADDS : AForm_2r<59, 21,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
- "fadds", "$FRT, $FRA, $FRB", FPGeneral,
+ "fadds", "$FRT, $FRA, $FRB", IIC_FPGeneral,
[(set f32:$FRT, (fadd f32:$FRA, f32:$FRB))]>;
+ } // isCommutable
defm FDIV : AForm_2r<63, 18,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
- "fdiv", "$FRT, $FRA, $FRB", FPDivD,
+ "fdiv", "$FRT, $FRA, $FRB", IIC_FPDivD,
[(set f64:$FRT, (fdiv f64:$FRA, f64:$FRB))]>;
defm FDIVS : AForm_2r<59, 18,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
- "fdivs", "$FRT, $FRA, $FRB", FPDivS,
+ "fdivs", "$FRT, $FRA, $FRB", IIC_FPDivS,
[(set f32:$FRT, (fdiv f32:$FRA, f32:$FRB))]>;
+ let isCommutable = 1 in {
defm FMUL : AForm_3r<63, 25,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC),
- "fmul", "$FRT, $FRA, $FRC", FPFused,
+ "fmul", "$FRT, $FRA, $FRC", IIC_FPFused,
[(set f64:$FRT, (fmul f64:$FRA, f64:$FRC))]>;
defm FMULS : AForm_3r<59, 25,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC),
- "fmuls", "$FRT, $FRA, $FRC", FPGeneral,
+ "fmuls", "$FRT, $FRA, $FRC", IIC_FPGeneral,
[(set f32:$FRT, (fmul f32:$FRA, f32:$FRC))]>;
+ } // isCommutable
defm FSUB : AForm_2r<63, 20,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
- "fsub", "$FRT, $FRA, $FRB", FPAddSub,
+ "fsub", "$FRT, $FRA, $FRB", IIC_FPAddSub,
[(set f64:$FRT, (fsub f64:$FRA, f64:$FRB))]>;
defm FSUBS : AForm_2r<59, 20,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
- "fsubs", "$FRT, $FRA, $FRB", FPGeneral,
+ "fsubs", "$FRT, $FRA, $FRB", IIC_FPGeneral,
[(set f32:$FRT, (fsub f32:$FRA, f32:$FRB))]>;
}
}
@@ -2136,7 +2276,7 @@ let PPC970_Unit = 1 in { // FXU Operations.
let isSelect = 1 in
def ISEL : AForm_4<31, 15,
(outs gprc:$rT), (ins gprc_nor0:$rA, gprc:$rB, crbitrc:$cond),
- "isel $rT, $rA, $rB, $cond", IntGeneral,
+ "isel $rT, $rA, $rB, $cond", IIC_IntGeneral,
[]>;
}
@@ -2147,24 +2287,24 @@ let isCommutable = 1 in {
// RLWIMI can be commuted if the rotate amount is zero.
defm RLWIMI : MForm_2r<20, (outs gprc:$rA),
(ins gprc:$rSi, gprc:$rS, u5imm:$SH, u5imm:$MB,
- u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME", IntRotate,
- []>, PPC970_DGroup_Cracked, RegConstraint<"$rSi = $rA">,
- NoEncode<"$rSi">;
+ u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
+ IIC_IntRotate, []>, PPC970_DGroup_Cracked,
+ RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
}
let BaseName = "rlwinm" in {
def RLWINM : MForm_2<21,
(outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
- "rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral,
+ "rlwinm $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
[]>, RecFormRel;
let Defs = [CR0] in
def RLWINMo : MForm_2<21,
(outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
- "rlwinm. $rA, $rS, $SH, $MB, $ME", IntGeneral,
+ "rlwinm. $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
[]>, isDOT, RecFormRel, PPC970_DGroup_Cracked;
}
defm RLWNM : MForm_2r<23, (outs gprc:$rA),
(ins gprc:$rS, gprc:$rB, u5imm:$MB, u5imm:$ME),
- "rlwnm", "$rA, $rS, $rB, $MB, $ME", IntGeneral,
+ "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
[]>;
}
} // neverHasSideEffects = 1
@@ -2178,8 +2318,10 @@ def : Pat<(i32 imm:$imm),
(ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>;
// Implement the 'not' operation with the NOR instruction.
-def NOT : Pat<(not i32:$in),
- (NOR $in, $in)>;
+def i32not : OutPatFrag<(ops node:$in),
+ (NOR $in, $in)>;
+def : Pat<(not i32:$in),
+ (i32not $in)>;
// ADD an arbitrary immediate.
def : Pat<(add i32:$in, imm:$imm),
@@ -2250,6 +2392,17 @@ def : Pat<(add i32:$in, (PPChi tjumptable:$g, 0)),
def : Pat<(add i32:$in, (PPChi tblockaddress:$g, 0)),
(ADDIS $in, tblockaddress:$g)>;
+// Support for thread-local storage.
+def PPC32GOT: Pseudo<(outs gprc:$rD), (ins), "#PPC32GOT",
+ [(set i32:$rD, (PPCppc32GOT))]>;
+
+def LDgotTprelL32: Pseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
+ "#LDgotTprelL32",
+ [(set i32:$rD,
+ (PPCldGotTprelL tglobaltlsaddr:$disp, i32:$reg))]>;
+def : Pat<(PPCaddTls i32:$in, tglobaltlsaddr:$g),
+ (ADD4TLS $in, tglobaltlsaddr:$g)>;
+
// Standard shifts. These are represented separately from the real shifts above
// so that we can distinguish between shifts that allow 5-bit and 6-bit shift
// amounts.
@@ -2284,7 +2437,8 @@ def : Pat<(f64 (extloadf32 xaddr:$src)),
def : Pat<(f64 (fextend f32:$src)),
(COPY_TO_REGCLASS $src, F8RC)>;
-def : Pat<(atomic_fence (imm), (imm)), (SYNC 0)>;
+def : Pat<(atomic_fence (imm), (imm)), (SYNC 0)>, Requires<[IsNotBookE]>;
+def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[IsBookE]>;
// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
@@ -2304,52 +2458,561 @@ def : Pat<(fcopysign f32:$frB, f64:$frA),
include "PPCInstrAltivec.td"
include "PPCInstr64Bit.td"
+include "PPCInstrVSX.td"
+
+def crnot : OutPatFrag<(ops node:$in),
+ (CRNOR $in, $in)>;
+def : Pat<(not i1:$in),
+ (crnot $in)>;
+
+// Patterns for arithmetic i1 operations.
+def : Pat<(add i1:$a, i1:$b),
+ (CRXOR $a, $b)>;
+def : Pat<(sub i1:$a, i1:$b),
+ (CRXOR $a, $b)>;
+def : Pat<(mul i1:$a, i1:$b),
+ (CRAND $a, $b)>;
+
+// We're sometimes asked to materialize i1 -1, which is just 1 in this case
+// (-1 is used to mean all bits set).
+def : Pat<(i1 -1), (CRSET)>;
+
+// i1 extensions, implemented in terms of isel.
+def : Pat<(i32 (zext i1:$in)),
+ (SELECT_I4 $in, (LI 1), (LI 0))>;
+def : Pat<(i32 (sext i1:$in)),
+ (SELECT_I4 $in, (LI -1), (LI 0))>;
+
+def : Pat<(i64 (zext i1:$in)),
+ (SELECT_I8 $in, (LI8 1), (LI8 0))>;
+def : Pat<(i64 (sext i1:$in)),
+ (SELECT_I8 $in, (LI8 -1), (LI8 0))>;
+
+// FIXME: We should choose either a zext or a sext based on other constants
+// already around.
+def : Pat<(i32 (anyext i1:$in)),
+ (SELECT_I4 $in, (LI 1), (LI 0))>;
+def : Pat<(i64 (anyext i1:$in)),
+ (SELECT_I8 $in, (LI8 1), (LI8 0))>;
+
+// match setcc on i1 variables.
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLT)),
+ (CRANDC $s2, $s1)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULT)),
+ (CRANDC $s2, $s1)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLE)),
+ (CRORC $s2, $s1)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULE)),
+ (CRORC $s2, $s1)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETEQ)),
+ (CREQV $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGE)),
+ (CRORC $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGE)),
+ (CRORC $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGT)),
+ (CRANDC $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGT)),
+ (CRANDC $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETNE)),
+ (CRXOR $s1, $s2)>;
+
+// match setcc on non-i1 (non-vector) variables. Note that SETUEQ, SETOGE,
+// SETOLE, SETONE, SETULT and SETUGT should be expanded by legalize for
+// floating-point types.
+
+multiclass CRNotPat<dag pattern, dag result> {
+ def : Pat<pattern, (crnot result)>;
+ def : Pat<(not pattern), result>;
+
+ // We can also fold the crnot into an extension:
+ def : Pat<(i32 (zext pattern)),
+ (SELECT_I4 result, (LI 0), (LI 1))>;
+ def : Pat<(i32 (sext pattern)),
+ (SELECT_I4 result, (LI 0), (LI -1))>;
+
+  // Likewise, we can fold the crnot into the 64-bit extensions:
+ def : Pat<(i64 (zext pattern)),
+ (SELECT_I8 result, (LI8 0), (LI8 1))>;
+ def : Pat<(i64 (sext pattern)),
+ (SELECT_I8 result, (LI8 0), (LI8 -1))>;
+
+ // FIXME: We should choose either a zext or a sext based on other constants
+ // already around.
+ def : Pat<(i32 (anyext pattern)),
+ (SELECT_I4 result, (LI 0), (LI 1))>;
+
+ def : Pat<(i64 (anyext pattern)),
+ (SELECT_I8 result, (LI8 0), (LI8 1))>;
+}
+
+// FIXME: Because of what seems like a bug in TableGen's type-inference code,
+// we need to write imm:$imm in the output patterns below, not just $imm, or
+// else the resulting matcher will not correctly add the immediate operand
+// (making it a register operand instead).
+
+// extended SETCC.
+multiclass ExtSetCCPat<CondCode cc, PatFrag pfrag,
+ OutPatFrag rfrag, OutPatFrag rfrag8> {
+ def : Pat<(i32 (zext (i1 (pfrag i32:$s1, cc)))),
+ (rfrag $s1)>;
+ def : Pat<(i64 (zext (i1 (pfrag i64:$s1, cc)))),
+ (rfrag8 $s1)>;
+ def : Pat<(i64 (zext (i1 (pfrag i32:$s1, cc)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>;
+ def : Pat<(i32 (zext (i1 (pfrag i64:$s1, cc)))),
+ (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>;
+
+ def : Pat<(i32 (anyext (i1 (pfrag i32:$s1, cc)))),
+ (rfrag $s1)>;
+ def : Pat<(i64 (anyext (i1 (pfrag i64:$s1, cc)))),
+ (rfrag8 $s1)>;
+ def : Pat<(i64 (anyext (i1 (pfrag i32:$s1, cc)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>;
+ def : Pat<(i32 (anyext (i1 (pfrag i64:$s1, cc)))),
+ (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>;
+}
+
+// Note that we do all inversions below with i(32|64)not, instead of using
+// (xori x, 1), because on the A2 nor has single-cycle latency while xori
+// has 2-cycle latency.
+
+defm : ExtSetCCPat<SETEQ,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, 0, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (CNTLZW $in), 27, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (CNTLZD $in), 58, 63)> >;
+
+defm : ExtSetCCPat<SETNE,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, 0, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (i32not (CNTLZW $in)), 27, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (i64not (CNTLZD $in)), 58, 63)> >;
+
+defm : ExtSetCCPat<SETLT,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, 0, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM $in, 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL $in, 1, 63)> >;
+
+defm : ExtSetCCPat<SETGE,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, 0, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (i32not $in), 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (i64not $in), 1, 63)> >;
+
+defm : ExtSetCCPat<SETGT,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, 0, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (ANDC (NEG $in), $in), 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (ANDC8 (NEG8 $in), $in), 1, 63)> >;
+
+defm : ExtSetCCPat<SETLE,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, 0, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (ORC $in, (NEG $in)), 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (ORC8 $in, (NEG8 $in)), 1, 63)> >;
+
+defm : ExtSetCCPat<SETLT,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, -1, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (AND $in, (ADDI $in, 1)), 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (AND8 $in, (ADDI8 $in, 1)), 1, 63)> >;
+
+defm : ExtSetCCPat<SETGE,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, -1, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (NAND $in, (ADDI $in, 1)), 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (NAND8 $in, (ADDI8 $in, 1)), 1, 63)> >;
+
+defm : ExtSetCCPat<SETGT,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, -1, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM (i32not $in), 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL (i64not $in), 1, 63)> >;
+
+defm : ExtSetCCPat<SETLE,
+ PatFrag<(ops node:$in, node:$cc),
+ (setcc $in, -1, $cc)>,
+ OutPatFrag<(ops node:$in),
+ (RLWINM $in, 1, 31, 31)>,
+ OutPatFrag<(ops node:$in),
+ (RLDICL $in, 1, 63)> >;
+
+// SETCC for i32.
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULT)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLT)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGT)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGT)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETEQ)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETEQ)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+// For non-equality comparisons, the default code would materialize the
+// constant, then compare against it, like this:
+// lis r2, 4660
+// ori r2, r2, 22136
+// cmpw cr0, r3, r2
+// beq cr0,L6
+// Since we are just comparing for equality, we can emit this instead:
+// xoris r0,r3,0x1234
+// cmplwi cr0,r0,0x5678
+// beq cr0,L6
+
+def : Pat<(i1 (setcc i32:$s1, imm:$imm, SETEQ)),
+ (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+ (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+ (LO16 imm:$imm)), sub_eq)>;
+
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETULT)),
+ (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETLT)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETUGT)),
+ (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETGT)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETEQ)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)),
+ (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)),
+ (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+// SETCC for i64.
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULT)),
+ (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLT)),
+ (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGT)),
+ (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGT)),
+ (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETEQ)),
+ (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETEQ)),
+ (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
+
+// For non-equality comparisons, the default code would materialize the
+// constant, then compare against it, like this:
+// lis r2, 4660
+// ori r2, r2, 22136
+// cmpd cr0, r3, r2
+// beq cr0,L6
+// Since we are just comparing for equality, we can emit this instead:
+// xoris r0,r3,0x1234
+// cmpldi cr0,r0,0x5678
+// beq cr0,L6
+
+def : Pat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETEQ)),
+ (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+ (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGE)),
+ (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGE)),
+ (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULE)),
+ (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLE)),
+ (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+ (LO16 imm:$imm)), sub_eq)>;
+
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)),
+ (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)),
+ (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETUGE)),
+ (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETGE)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETULE)),
+ (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+// SETCC for f32.
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
+
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
+
+// SETCC for f64.
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
+
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
+
+// match select on i1 variables:
+def : Pat<(i1 (select i1:$cond, i1:$tval, i1:$fval)),
+ (CROR (CRAND $cond , $tval),
+ (CRAND (crnot $cond), $fval))>;
+
+// match selectcc on i1 variables:
+// select (lhs == rhs), tval, fval is:
+// ((lhs == rhs) & tval) | (!(lhs == rhs) & fval)
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLT)),
+ (CROR (CRAND (CRANDC $rhs, $lhs), $tval),
+ (CRAND (CRORC $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLE)),
+ (CROR (CRAND (CRORC $rhs, $lhs), $tval),
+ (CRAND (CRANDC $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETEQ)),
+ (CROR (CRAND (CREQV $lhs, $rhs), $tval),
+ (CRAND (CRXOR $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGE)),
+ (CROR (CRAND (CRORC $lhs, $rhs), $tval),
+ (CRAND (CRANDC $rhs, $lhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGT)),
+ (CROR (CRAND (CRANDC $lhs, $rhs), $tval),
+ (CRAND (CRORC $rhs, $lhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETNE)),
+ (CROR (CRAND (CREQV $lhs, $rhs), $fval),
+ (CRAND (CRXOR $lhs, $rhs), $tval))>;
+
+// match selectcc on i1 variables with non-i1 output.
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLT)),
+ (SELECT_I4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLE)),
+ (SELECT_I4 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETEQ)),
+ (SELECT_I4 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGE)),
+ (SELECT_I4 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGT)),
+ (SELECT_I4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETNE)),
+ (SELECT_I4 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLT)),
+ (SELECT_I8 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLE)),
+ (SELECT_I8 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETEQ)),
+ (SELECT_I8 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGE)),
+ (SELECT_I8 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGT)),
+ (SELECT_I8 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETNE)),
+ (SELECT_I8 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
+ (SELECT_F4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)),
+ (SELECT_F4 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)),
+ (SELECT_F4 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)),
+ (SELECT_F4 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)),
+ (SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
+ (SELECT_F4 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
+ (SELECT_F8 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
+ (SELECT_F8 (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
+ (SELECT_F8 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
+ (SELECT_F8 (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
+ (SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
+ (SELECT_F8 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLT)),
+ (SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLE)),
+ (SELECT_VRRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETEQ)),
+ (SELECT_VRRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGE)),
+ (SELECT_VRRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGT)),
+ (SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETNE)),
+ (SELECT_VRRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+let usesCustomInserter = 1 in {
+def ANDIo_1_EQ_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
+ "#ANDIo_1_EQ_BIT",
+ [(set i1:$dst, (trunc (not i32:$in)))]>;
+def ANDIo_1_GT_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
+ "#ANDIo_1_GT_BIT",
+ [(set i1:$dst, (trunc i32:$in))]>;
+
+def ANDIo_1_EQ_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
+ "#ANDIo_1_EQ_BIT8",
+ [(set i1:$dst, (trunc (not i64:$in)))]>;
+def ANDIo_1_GT_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
+ "#ANDIo_1_GT_BIT8",
+ [(set i1:$dst, (trunc i64:$in))]>;
+}
+
+def : Pat<(i1 (not (trunc i32:$in))),
+ (ANDIo_1_EQ_BIT $in)>;
+def : Pat<(i1 (not (trunc i64:$in))),
+ (ANDIo_1_EQ_BIT8 $in)>;
//===----------------------------------------------------------------------===//
// PowerPC Instructions used for assembler/disassembler only
//
def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
- "isync", SprISYNC, []>;
+ "isync", IIC_SprISYNC, []>;
def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
- "icbi $src", LdStICBI, []>;
+ "icbi $src", IIC_LdStICBI, []>;
def EIEIO : XForm_24_eieio<31, 854, (outs), (ins),
- "eieio", LdStLoad, []>;
+ "eieio", IIC_LdStLoad, []>;
def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
- "wait $L", LdStLoad, []>;
+ "wait $L", IIC_LdStLoad, []>;
def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L),
- "mtmsr $RS, $L", SprMTMSR>;
+ "mtmsr $RS, $L", IIC_SprMTMSR>;
def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins),
- "mfmsr $RT", SprMFMSR, []>;
+ "mfmsr $RT", IIC_SprMFMSR, []>;
def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L),
- "mtmsrd $RS, $L", SprMTMSRD>;
+ "mtmsrd $RS, $L", IIC_SprMTMSRD>;
def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB),
- "slbie $RB", SprSLBIE, []>;
+ "slbie $RB", IIC_SprSLBIE, []>;
def SLBMTE : XForm_26<31, 402, (outs), (ins gprc:$RS, gprc:$RB),
- "slbmte $RS, $RB", SprSLBMTE, []>;
+ "slbmte $RS, $RB", IIC_SprSLBMTE, []>;
def SLBMFEE : XForm_26<31, 915, (outs gprc:$RT), (ins gprc:$RB),
- "slbmfee $RT, $RB", SprSLBMFEE, []>;
+ "slbmfee $RT, $RB", IIC_SprSLBMFEE, []>;
-def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", SprSLBIA, []>;
+def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", IIC_SprSLBIA, []>;
def TLBSYNC : XForm_0<31, 566, (outs), (ins),
- "tlbsync", SprTLBSYNC, []>;
+ "tlbsync", IIC_SprTLBSYNC, []>;
def TLBIEL : XForm_16b<31, 274, (outs), (ins gprc:$RB),
- "tlbiel $RB", SprTLBIEL, []>;
+ "tlbiel $RB", IIC_SprTLBIEL, []>;
def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RS, gprc:$RB),
- "tlbie $RB,$RS", SprTLBIE, []>;
+ "tlbie $RB,$RS", IIC_SprTLBIE, []>;
//===----------------------------------------------------------------------===//
// PowerPC Assembler Instruction Aliases
@@ -2373,10 +3036,10 @@ class PPCAsmPseudo<string asm, dag iops>
def : InstAlias<"sc", (SC 0)>;
-def : InstAlias<"sync", (SYNC 0)>;
-def : InstAlias<"msync", (SYNC 0)>;
-def : InstAlias<"lwsync", (SYNC 1)>;
-def : InstAlias<"ptesync", (SYNC 2)>;
+def : InstAlias<"sync", (SYNC 0)>, Requires<[IsNotBookE]>;
+def : InstAlias<"msync", (SYNC 0)>, Requires<[IsNotBookE]>;
+def : InstAlias<"lwsync", (SYNC 1)>, Requires<[IsNotBookE]>;
+def : InstAlias<"ptesync", (SYNC 2)>, Requires<[IsNotBookE]>;
def : InstAlias<"wait", (WAIT 0)>;
def : InstAlias<"waitrsv", (WAIT 1)>;
@@ -2565,19 +3228,19 @@ let PPC970_Unit = 7 in {
let Defs = [CTR], Uses = [CTR, LR, RM] in
def gBCLR : XLForm_2<19, 16, 0, (outs),
(ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
- "bclr $bo, $bi, $bh", BrB, []>;
+ "bclr $bo, $bi, $bh", IIC_BrB, []>;
let Defs = [LR, CTR], Uses = [CTR, LR, RM] in
def gBCLRL : XLForm_2<19, 16, 1, (outs),
(ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
- "bclrl $bo, $bi, $bh", BrB, []>;
+ "bclrl $bo, $bi, $bh", IIC_BrB, []>;
let Defs = [CTR], Uses = [CTR, LR, RM] in
def gBCCTR : XLForm_2<19, 528, 0, (outs),
(ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
- "bcctr $bo, $bi, $bh", BrB, []>;
+ "bcctr $bo, $bi, $bh", IIC_BrB, []>;
let Defs = [LR, CTR], Uses = [CTR, LR, RM] in
def gBCCTRL : XLForm_2<19, 528, 1, (outs),
(ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
- "bcctrl $bo, $bi, $bh", BrB, []>;
+ "bcctrl $bo, $bi, $bh", IIC_BrB, []>;
}
def : InstAlias<"bclr $bo, $bi", (gBCLR u5imm:$bo, crbitrc:$bi, 0)>;
def : InstAlias<"bclrl $bo, $bi", (gBCLRL u5imm:$bo, crbitrc:$bi, 0)>;
@@ -2620,14 +3283,14 @@ multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> {
(BCCA bibo, CR0, abscondbrtarget:$dst)>;
def : InstAlias<"b"#name#"lr"#pm#" $cc",
- (BCLR bibo, crrc:$cc)>;
+ (BCCLR bibo, crrc:$cc)>;
def : InstAlias<"b"#name#"lr"#pm,
- (BCLR bibo, CR0)>;
+ (BCCLR bibo, CR0)>;
def : InstAlias<"b"#name#"ctr"#pm#" $cc",
- (BCCTR bibo, crrc:$cc)>;
+ (BCCCTR bibo, crrc:$cc)>;
def : InstAlias<"b"#name#"ctr"#pm,
- (BCCTR bibo, CR0)>;
+ (BCCCTR bibo, CR0)>;
def : InstAlias<"b"#name#"l"#pm#" $cc, $dst",
(BCCL bibo, crrc:$cc, condbrtarget:$dst)>;
@@ -2640,14 +3303,14 @@ multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> {
(BCCLA bibo, CR0, abscondbrtarget:$dst)>;
def : InstAlias<"b"#name#"lrl"#pm#" $cc",
- (BCLRL bibo, crrc:$cc)>;
+ (BCCLRL bibo, crrc:$cc)>;
def : InstAlias<"b"#name#"lrl"#pm,
- (BCLRL bibo, CR0)>;
+ (BCCLRL bibo, CR0)>;
def : InstAlias<"b"#name#"ctrl"#pm#" $cc",
- (BCCTRL bibo, crrc:$cc)>;
+ (BCCCTRL bibo, crrc:$cc)>;
def : InstAlias<"b"#name#"ctrl"#pm,
- (BCCTRL bibo, CR0)>;
+ (BCCCTRL bibo, CR0)>;
}
multiclass BranchExtendedMnemonic<string name, int bibo> {
defm : BranchExtendedMnemonicPM<name, "", bibo>;
@@ -2671,18 +3334,18 @@ def : InstAlias<"cmpwi $rA, $imm", (CMPWI CR0, gprc:$rA, s16imm:$imm)>;
def : InstAlias<"cmpw $rA, $rB", (CMPW CR0, gprc:$rA, gprc:$rB)>;
def : InstAlias<"cmplwi $rA, $imm", (CMPLWI CR0, gprc:$rA, u16imm:$imm)>;
def : InstAlias<"cmplw $rA, $rB", (CMPLW CR0, gprc:$rA, gprc:$rB)>;
-def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm64:$imm)>;
def : InstAlias<"cmpd $rA, $rB", (CMPD CR0, g8rc:$rA, g8rc:$rB)>;
-def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm64:$imm)>;
def : InstAlias<"cmpld $rA, $rB", (CMPLD CR0, g8rc:$rA, g8rc:$rB)>;
def : InstAlias<"cmpi $bf, 0, $rA, $imm", (CMPWI crrc:$bf, gprc:$rA, s16imm:$imm)>;
def : InstAlias<"cmp $bf, 0, $rA, $rB", (CMPW crrc:$bf, gprc:$rA, gprc:$rB)>;
def : InstAlias<"cmpli $bf, 0, $rA, $imm", (CMPLWI crrc:$bf, gprc:$rA, u16imm:$imm)>;
def : InstAlias<"cmpl $bf, 0, $rA, $rB", (CMPLW crrc:$bf, gprc:$rA, gprc:$rB)>;
-def : InstAlias<"cmpi $bf, 1, $rA, $imm", (CMPDI crrc:$bf, g8rc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpi $bf, 1, $rA, $imm", (CMPDI crrc:$bf, g8rc:$rA, s16imm64:$imm)>;
def : InstAlias<"cmp $bf, 1, $rA, $rB", (CMPD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
-def : InstAlias<"cmpli $bf, 1, $rA, $imm", (CMPLDI crrc:$bf, g8rc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmpli $bf, 1, $rA, $imm", (CMPLDI crrc:$bf, g8rc:$rA, u16imm64:$imm)>;
def : InstAlias<"cmpl $bf, 1, $rA, $rB", (CMPLD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
multiclass TrapExtendedMnemonic<string name, int to> {
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
new file mode 100644
index 0000000..9cc919e
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -0,0 +1,816 @@
+//===- PPCInstrVSX.td - The PowerPC VSX Extension --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the VSX extension to the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+def PPCRegVSRCAsmOperand : AsmOperandClass {
+ let Name = "RegVSRC"; let PredicateMethod = "isVSRegNumber";
+}
+def vsrc : RegisterOperand<VSRC> {
+ let ParserMatchClass = PPCRegVSRCAsmOperand;
+}
+
+def PPCRegVSFRCAsmOperand : AsmOperandClass {
+ let Name = "RegVSFRC"; let PredicateMethod = "isVSRegNumber";
+}
+def vsfrc : RegisterOperand<VSFRC> {
+ let ParserMatchClass = PPCRegVSFRCAsmOperand;
+}
+
+multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL,
+ string asmbase, string asmstr, InstrItinClass itin,
+ list<dag> pattern> {
+ let BaseName = asmbase in {
+ def NAME : XX3Form_Rc<opcode, xo, OOL, IOL,
+ !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+ pattern>;
+ let Defs = [CR6] in
+ def o : XX3Form_Rc<opcode, xo, OOL, IOL,
+ !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+ []>, isDOT;
+ }
+}
+
+def HasVSX : Predicate<"PPCSubTarget.hasVSX()">;
+let Predicates = [HasVSX] in {
+let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+let neverHasSideEffects = 1 in { // VSX instructions don't have side effects.
+let Uses = [RM] in {
+
+ // Load indexed instructions
+ let mayLoad = 1, canFoldAsLoad = 1 in {
+ def LXSDX : XForm_1<31, 588,
+ (outs vsfrc:$XT), (ins memrr:$src),
+ "lxsdx $XT, $src", IIC_LdStLFD,
+ [(set f64:$XT, (load xoaddr:$src))]>;
+
+ def LXVD2X : XForm_1<31, 844,
+ (outs vsrc:$XT), (ins memrr:$src),
+ "lxvd2x $XT, $src", IIC_LdStLFD,
+ [(set v2f64:$XT, (load xoaddr:$src))]>;
+
+ def LXVDSX : XForm_1<31, 332,
+ (outs vsrc:$XT), (ins memrr:$src),
+ "lxvdsx $XT, $src", IIC_LdStLFD, []>;
+
+ def LXVW4X : XForm_1<31, 780,
+ (outs vsrc:$XT), (ins memrr:$src),
+ "lxvw4x $XT, $src", IIC_LdStLFD, []>;
+ }
+
+ // Store indexed instructions
+ let mayStore = 1 in {
+ def STXSDX : XX1Form<31, 716,
+ (outs), (ins vsfrc:$XT, memrr:$dst),
+ "stxsdx $XT, $dst", IIC_LdStSTFD,
+ [(store f64:$XT, xoaddr:$dst)]>;
+
+ def STXVD2X : XX1Form<31, 972,
+ (outs), (ins vsrc:$XT, memrr:$dst),
+ "stxvd2x $XT, $dst", IIC_LdStSTFD,
+ [(store v2f64:$XT, xoaddr:$dst)]>;
+
+ def STXVW4X : XX1Form<31, 908,
+ (outs), (ins vsrc:$XT, memrr:$dst),
+ "stxvw4x $XT, $dst", IIC_LdStSTFD, []>;
+ }
+
+ // Add/Mul Instructions
+ let isCommutable = 1 in {
+ def XSADDDP : XX3Form<60, 32,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xsadddp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fadd f64:$XA, f64:$XB))]>;
+ def XSMULDP : XX3Form<60, 48,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xsmuldp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fmul f64:$XA, f64:$XB))]>;
+
+ def XVADDDP : XX3Form<60, 96,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvadddp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fadd v2f64:$XA, v2f64:$XB))]>;
+
+ def XVADDSP : XX3Form<60, 64,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvaddsp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fadd v4f32:$XA, v4f32:$XB))]>;
+
+ def XVMULDP : XX3Form<60, 112,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmuldp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fmul v2f64:$XA, v2f64:$XB))]>;
+
+ def XVMULSP : XX3Form<60, 80,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmulsp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fmul v4f32:$XA, v4f32:$XB))]>;
+ }
+
+ // Subtract Instructions
+ def XSSUBDP : XX3Form<60, 40,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xssubdp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fsub f64:$XA, f64:$XB))]>;
+
+ def XVSUBDP : XX3Form<60, 104,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvsubdp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fsub v2f64:$XA, v2f64:$XB))]>;
+ def XVSUBSP : XX3Form<60, 72,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvsubsp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fsub v4f32:$XA, v4f32:$XB))]>;
+
+ // FMA Instructions
+ let BaseName = "XSMADDADP" in {
+ let isCommutable = 1 in
+ def XSMADDADP : XX3Form<60, 33,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsmaddadp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fma f64:$XA, f64:$XB, f64:$XTi))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XSMADDMDP : XX3Form<60, 41,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XSMSUBADP" in {
+ let isCommutable = 1 in
+ def XSMSUBADP : XX3Form<60, 49,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsmsubadp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fma f64:$XA, f64:$XB, (fneg f64:$XTi)))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XSMSUBMDP : XX3Form<60, 57,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XSNMADDADP" in {
+ let isCommutable = 1 in
+ def XSNMADDADP : XX3Form<60, 161,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsnmaddadp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, f64:$XTi)))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XSNMADDMDP : XX3Form<60, 169,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XSNMSUBADP" in {
+ let isCommutable = 1 in
+ def XSNMSUBADP : XX3Form<60, 177,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsnmsubadp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XSNMSUBMDP : XX3Form<60, 185,
+ (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+ "xsnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVMADDADP" in {
+ let isCommutable = 1 in
+ def XVMADDADP : XX3Form<60, 97,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmaddadp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVMADDMDP : XX3Form<60, 105,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVMADDASP" in {
+ let isCommutable = 1 in
+ def XVMADDASP : XX3Form<60, 65,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmaddasp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVMADDMSP : XX3Form<60, 73,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVMSUBADP" in {
+ let isCommutable = 1 in
+ def XVMSUBADP : XX3Form<60, 113,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmsubadp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVMSUBMDP : XX3Form<60, 121,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVMSUBASP" in {
+ let isCommutable = 1 in
+ def XVMSUBASP : XX3Form<60, 81,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmsubasp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVMSUBMSP : XX3Form<60, 89,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVNMADDADP" in {
+ let isCommutable = 1 in
+ def XVNMADDADP : XX3Form<60, 225,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmaddadp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVNMADDMDP : XX3Form<60, 233,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVNMADDASP" in {
+ let isCommutable = 1 in
+ def XVNMADDASP : XX3Form<60, 193,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmaddasp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi)))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVNMADDMSP : XX3Form<60, 201,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVNMSUBADP" in {
+ let isCommutable = 1 in
+ def XVNMSUBADP : XX3Form<60, 241,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmsubadp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVNMSUBMDP : XX3Form<60, 249,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ let BaseName = "XVNMSUBASP" in {
+ let isCommutable = 1 in
+ def XVNMSUBASP : XX3Form<60, 209,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmsubasp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ let IsVSXFMAAlt = 1 in
+ def XVNMSUBMSP : XX3Form<60, 217,
+ (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+ "xvnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+ AltVSXFMARel;
+ }
+
+ // Division Instructions
+ def XSDIVDP : XX3Form<60, 56,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xsdivdp $XT, $XA, $XB", IIC_FPDivD,
+ [(set f64:$XT, (fdiv f64:$XA, f64:$XB))]>;
+ def XSSQRTDP : XX2Form<60, 75,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xssqrtdp $XT, $XB", IIC_FPSqrtD,
+ [(set f64:$XT, (fsqrt f64:$XB))]>;
+
+ def XSREDP : XX2Form<60, 90,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsredp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfre f64:$XB))]>;
+ def XSRSQRTEDP : XX2Form<60, 74,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrsqrtedp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfrsqrte f64:$XB))]>;
+
+ def XSTDIVDP : XX3Form_1<60, 61,
+ (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+ "xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
+ def XSTSQRTDP : XX2Form_1<60, 106,
+ (outs crrc:$crD), (ins vsfrc:$XB),
+ "xstsqrtdp $crD, $XB", IIC_FPCompare, []>;
+
+ def XVDIVDP : XX3Form<60, 120,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvdivdp $XT, $XA, $XB", IIC_FPDivD,
+ [(set v2f64:$XT, (fdiv v2f64:$XA, v2f64:$XB))]>;
+ def XVDIVSP : XX3Form<60, 88,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvdivsp $XT, $XA, $XB", IIC_FPDivS,
+ [(set v4f32:$XT, (fdiv v4f32:$XA, v4f32:$XB))]>;
+
+ def XVSQRTDP : XX2Form<60, 203,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvsqrtdp $XT, $XB", IIC_FPSqrtD,
+ [(set v2f64:$XT, (fsqrt v2f64:$XB))]>;
+ def XVSQRTSP : XX2Form<60, 139,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvsqrtsp $XT, $XB", IIC_FPSqrtS,
+ [(set v4f32:$XT, (fsqrt v4f32:$XB))]>;
+
+ def XVTDIVDP : XX3Form_1<60, 125,
+ (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
+ "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
+ def XVTDIVSP : XX3Form_1<60, 93,
+ (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
+ "xvtdivsp $crD, $XA, $XB", IIC_FPCompare, []>;
+
+ def XVTSQRTDP : XX2Form_1<60, 234,
+ (outs crrc:$crD), (ins vsrc:$XB),
+ "xvtsqrtdp $crD, $XB", IIC_FPCompare, []>;
+ def XVTSQRTSP : XX2Form_1<60, 170,
+ (outs crrc:$crD), (ins vsrc:$XB),
+ "xvtsqrtsp $crD, $XB", IIC_FPCompare, []>;
+
+ def XVREDP : XX2Form<60, 218,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvredp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (PPCfre v2f64:$XB))]>;
+ def XVRESP : XX2Form<60, 154,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvresp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (PPCfre v4f32:$XB))]>;
+
+ def XVRSQRTEDP : XX2Form<60, 202,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrsqrtedp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (PPCfrsqrte v2f64:$XB))]>;
+ def XVRSQRTESP : XX2Form<60, 138,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrsqrtesp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (PPCfrsqrte v4f32:$XB))]>;
+
+ // Compare Instructions
+ def XSCMPODP : XX3Form_1<60, 43,
+ (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+ "xscmpodp $crD, $XA, $XB", IIC_FPCompare, []>;
+ def XSCMPUDP : XX3Form_1<60, 35,
+ (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+ "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>;
+
+ defm XVCMPEQDP : XX3Form_Rcr<60, 99,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare, []>;
+ defm XVCMPEQSP : XX3Form_Rcr<60, 67,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpeqsp", "$XT, $XA, $XB", IIC_VecFPCompare, []>;
+ defm XVCMPGEDP : XX3Form_Rcr<60, 115,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpgedp", "$XT, $XA, $XB", IIC_VecFPCompare, []>;
+ defm XVCMPGESP : XX3Form_Rcr<60, 83,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpgesp", "$XT, $XA, $XB", IIC_VecFPCompare, []>;
+ defm XVCMPGTDP : XX3Form_Rcr<60, 107,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpgtdp", "$XT, $XA, $XB", IIC_VecFPCompare, []>;
+ defm XVCMPGTSP : XX3Form_Rcr<60, 75,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcmpgtsp", "$XT, $XA, $XB", IIC_VecFPCompare, []>;
+
+ // Move Instructions
+ def XSABSDP : XX2Form<60, 345,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsabsdp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (fabs f64:$XB))]>;
+ def XSNABSDP : XX2Form<60, 361,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsnabsdp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (fneg (fabs f64:$XB)))]>;
+ def XSNEGDP : XX2Form<60, 377,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsnegdp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (fneg f64:$XB))]>;
+ def XSCPSGNDP : XX3Form<60, 176,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xscpsgndp $XT, $XA, $XB", IIC_VecFP,
+ [(set f64:$XT, (fcopysign f64:$XB, f64:$XA))]>;
+
+ def XVABSDP : XX2Form<60, 473,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvabsdp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fabs v2f64:$XB))]>;
+
+ def XVABSSP : XX2Form<60, 409,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvabssp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fabs v4f32:$XB))]>;
+
+ def XVCPSGNDP : XX3Form<60, 240,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcpsgndp $XT, $XA, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fcopysign v2f64:$XB, v2f64:$XA))]>;
+ def XVCPSGNSP : XX3Form<60, 208,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvcpsgnsp $XT, $XA, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fcopysign v4f32:$XB, v4f32:$XA))]>;
+
+ def XVNABSDP : XX2Form<60, 489,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvnabsdp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fneg (fabs v2f64:$XB)))]>;
+ def XVNABSSP : XX2Form<60, 425,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvnabssp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fneg (fabs v4f32:$XB)))]>;
+
+ def XVNEGDP : XX2Form<60, 505,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvnegdp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fneg v2f64:$XB))]>;
+ def XVNEGSP : XX2Form<60, 441,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvnegsp $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fneg v4f32:$XB))]>;
+
+ // Conversion Instructions
+ def XSCVDPSP : XX2Form<60, 265,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvdpsp $XT, $XB", IIC_VecFP, []>;
+ def XSCVDPSXDS : XX2Form<60, 344,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvdpsxds $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfctidz f64:$XB))]>;
+ def XSCVDPSXWS : XX2Form<60, 88,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvdpsxws $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfctiwz f64:$XB))]>;
+ def XSCVDPUXDS : XX2Form<60, 328,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvdpuxds $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfctiduz f64:$XB))]>;
+ def XSCVDPUXWS : XX2Form<60, 72,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvdpuxws $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfctiwuz f64:$XB))]>;
+ def XSCVSPDP : XX2Form<60, 329,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvspdp $XT, $XB", IIC_VecFP, []>;
+ def XSCVSXDDP : XX2Form<60, 376,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvsxddp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfcfid f64:$XB))]>;
+ def XSCVUXDDP : XX2Form<60, 360,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xscvuxddp $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (PPCfcfidu f64:$XB))]>;
+
+ def XVCVDPSP : XX2Form<60, 393,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvdpsp $XT, $XB", IIC_VecFP, []>;
+ def XVCVDPSXDS : XX2Form<60, 472,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvdpsxds $XT, $XB", IIC_VecFP,
+ [(set v2i64:$XT, (fp_to_sint v2f64:$XB))]>;
+ def XVCVDPSXWS : XX2Form<60, 216,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvdpsxws $XT, $XB", IIC_VecFP, []>;
+ def XVCVDPUXDS : XX2Form<60, 456,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvdpuxds $XT, $XB", IIC_VecFP,
+ [(set v2i64:$XT, (fp_to_uint v2f64:$XB))]>;
+ def XVCVDPUXWS : XX2Form<60, 200,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvdpuxws $XT, $XB", IIC_VecFP, []>;
+
+ def XVCVSPDP : XX2Form<60, 457,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvspdp $XT, $XB", IIC_VecFP, []>;
+ def XVCVSPSXDS : XX2Form<60, 408,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvspsxds $XT, $XB", IIC_VecFP, []>;
+ def XVCVSPSXWS : XX2Form<60, 152,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvspsxws $XT, $XB", IIC_VecFP, []>;
+ def XVCVSPUXDS : XX2Form<60, 392,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvspuxds $XT, $XB", IIC_VecFP, []>;
+ def XVCVSPUXWS : XX2Form<60, 136,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvspuxws $XT, $XB", IIC_VecFP, []>;
+ def XVCVSXDDP : XX2Form<60, 504,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvsxddp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (sint_to_fp v2i64:$XB))]>;
+ def XVCVSXDSP : XX2Form<60, 440,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvsxdsp $XT, $XB", IIC_VecFP, []>;
+ def XVCVSXWDP : XX2Form<60, 248,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvsxwdp $XT, $XB", IIC_VecFP, []>;
+ def XVCVSXWSP : XX2Form<60, 184,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvsxwsp $XT, $XB", IIC_VecFP, []>;
+ def XVCVUXDDP : XX2Form<60, 488,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvuxddp $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (uint_to_fp v2i64:$XB))]>;
+ def XVCVUXDSP : XX2Form<60, 424,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvuxdsp $XT, $XB", IIC_VecFP, []>;
+ def XVCVUXWDP : XX2Form<60, 232,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvuxwdp $XT, $XB", IIC_VecFP, []>;
+ def XVCVUXWSP : XX2Form<60, 168,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvcvuxwsp $XT, $XB", IIC_VecFP, []>;
+
+ // Rounding Instructions
+ def XSRDPI : XX2Form<60, 73,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrdpi $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (frnd f64:$XB))]>;
+ def XSRDPIC : XX2Form<60, 107,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrdpic $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (fnearbyint f64:$XB))]>;
+ def XSRDPIM : XX2Form<60, 121,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrdpim $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (ffloor f64:$XB))]>;
+ def XSRDPIP : XX2Form<60, 105,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrdpip $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (fceil f64:$XB))]>;
+ def XSRDPIZ : XX2Form<60, 89,
+ (outs vsfrc:$XT), (ins vsfrc:$XB),
+ "xsrdpiz $XT, $XB", IIC_VecFP,
+ [(set f64:$XT, (ftrunc f64:$XB))]>;
+
+ def XVRDPI : XX2Form<60, 201,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrdpi $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (frnd v2f64:$XB))]>;
+ def XVRDPIC : XX2Form<60, 235,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrdpic $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>;
+ def XVRDPIM : XX2Form<60, 249,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrdpim $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (ffloor v2f64:$XB))]>;
+ def XVRDPIP : XX2Form<60, 233,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrdpip $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (fceil v2f64:$XB))]>;
+ def XVRDPIZ : XX2Form<60, 217,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrdpiz $XT, $XB", IIC_VecFP,
+ [(set v2f64:$XT, (ftrunc v2f64:$XB))]>;
+
+ def XVRSPI : XX2Form<60, 137,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrspi $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (frnd v4f32:$XB))]>;
+ def XVRSPIC : XX2Form<60, 171,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrspic $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>;
+ def XVRSPIM : XX2Form<60, 185,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrspim $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (ffloor v4f32:$XB))]>;
+ def XVRSPIP : XX2Form<60, 169,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrspip $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (fceil v4f32:$XB))]>;
+ def XVRSPIZ : XX2Form<60, 153,
+ (outs vsrc:$XT), (ins vsrc:$XB),
+ "xvrspiz $XT, $XB", IIC_VecFP,
+ [(set v4f32:$XT, (ftrunc v4f32:$XB))]>;
+
+ // Max/Min Instructions
+ let isCommutable = 1 in {
+ def XSMAXDP : XX3Form<60, 160,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xsmaxdp $XT, $XA, $XB", IIC_VecFP, []>;
+ def XSMINDP : XX3Form<60, 168,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xsmindp $XT, $XA, $XB", IIC_VecFP, []>;
+
+ def XVMAXDP : XX3Form<60, 224,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmaxdp $XT, $XA, $XB", IIC_VecFP, []>;
+ def XVMINDP : XX3Form<60, 232,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmindp $XT, $XA, $XB", IIC_VecFP, []>;
+
+ def XVMAXSP : XX3Form<60, 192,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvmaxsp $XT, $XA, $XB", IIC_VecFP, []>;
+ def XVMINSP : XX3Form<60, 200,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xvminsp $XT, $XA, $XB", IIC_VecFP, []>;
+ } // isCommutable
+} // Uses = [RM]
+
+ // Logical Instructions
+ let isCommutable = 1 in
+ def XXLAND : XX3Form<60, 130,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxland $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (and v4i32:$XA, v4i32:$XB))]>;
+ def XXLANDC : XX3Form<60, 138,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxlandc $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (and v4i32:$XA,
+ (vnot_ppc v4i32:$XB)))]>;
+ let isCommutable = 1 in {
+ def XXLNOR : XX3Form<60, 162,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxlnor $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (vnot_ppc (or v4i32:$XA,
+ v4i32:$XB)))]>;
+ def XXLOR : XX3Form<60, 146,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxlor $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (or v4i32:$XA, v4i32:$XB))]>;
+ let isCodeGenOnly = 1 in
+ def XXLORf: XX3Form<60, 146,
+ (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+ "xxlor $XT, $XA, $XB", IIC_VecGeneral, []>;
+ def XXLXOR : XX3Form<60, 154,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxlxor $XT, $XA, $XB", IIC_VecGeneral,
+ [(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>;
+ } // isCommutable
+
+ // Permutation Instructions
+ def XXMRGHW : XX3Form<60, 18,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxmrghw $XT, $XA, $XB", IIC_VecPerm, []>;
+ def XXMRGLW : XX3Form<60, 50,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+ "xxmrglw $XT, $XA, $XB", IIC_VecPerm, []>;
+
+ def XXPERMDI : XX3Form_2<60, 10,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM),
+ "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>;
+ def XXSEL : XX4Form<60, 3,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC),
+ "xxsel $XT, $XA, $XB, $XC", IIC_VecPerm, []>;
+
+ def XXSLDWI : XX3Form_2<60, 2,
+ (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$SHW),
+ "xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm, []>;
+ def XXSPLTW : XX2Form_2<60, 164,
+ (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
+ "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
+} // neverHasSideEffects
+} // AddedComplexity
+
+def : InstAlias<"xvmovdp $XT, $XB",
+ (XVCPSGNDP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
+def : InstAlias<"xvmovsp $XT, $XB",
+ (XVCPSGNSP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
+
+def : InstAlias<"xxspltd $XT, $XB, 0",
+ (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>;
+def : InstAlias<"xxspltd $XT, $XB, 1",
+ (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>;
+def : InstAlias<"xxmrghd $XT, $XA, $XB",
+ (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 0)>;
+def : InstAlias<"xxmrgld $XT, $XA, $XB",
+ (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>;
+def : InstAlias<"xxswapd $XT, $XB",
+ (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>;
+
+let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+def : Pat<(v2f64 (scalar_to_vector f64:$A)),
+ (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>;
+
+def : Pat<(f64 (vector_extract v2f64:$S, 0)),
+ (f64 (EXTRACT_SUBREG $S, sub_64))>;
+def : Pat<(f64 (vector_extract v2f64:$S, 1)),
+ (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
+
+// Additional fnmsub patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
+ (XSNMSUBADP $B, $C, $A)>;
+def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
+ (XSNMSUBADP $B, $C, $A)>;
+
+def : Pat<(fma (fneg v2f64:$A), v2f64:$C, v2f64:$B),
+ (XVNMSUBADP $B, $C, $A)>;
+def : Pat<(fma v2f64:$A, (fneg v2f64:$C), v2f64:$B),
+ (XVNMSUBADP $B, $C, $A)>;
+
+def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B),
+ (XVNMSUBASP $B, $C, $A)>;
+def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B),
+ (XVNMSUBASP $B, $C, $A)>;
+
+def : Pat<(v2f64 (bitconvert v4f32:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v4i32:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v8i16:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v16i8:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+
+def : Pat<(v4f32 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+
+def : Pat<(v2i64 (bitconvert v4f32:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v4i32:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v8i16:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v16i8:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+
+def : Pat<(v4f32 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+
+def : Pat<(v2f64 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v2i64 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+
+// sign extension patterns
+// To extend "in place" from v2i32 to v2i64, we have input data like:
+// | undef | i32 | undef | i32 |
+// but xvcvsxwdp expects the input in big-endian format:
+// | i32 | undef | i32 | undef |
+// so we need to shift everything to the left by one i32 (word) before
+// the conversion.
+def : Pat<(sext_inreg v2i64:$C, v2i32),
+ (XVCVDPSXDS (XVCVSXWDP (XXSLDWI $C, $C, 1)))>;
+def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
+ (XVCVSXWDP (XXSLDWI $C, $C, 1))>;
+
+} // AddedComplexity
+} // HasVSX
+
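The fnmsub patterns above rely on the identity -a*c + b == -(a*c - b), which is the value the xsnmsub/xvnmsub instructions produce in a single fused step. The following stand-alone C++ sketch is illustrative only (it is not part of this commit) and uses std::fma as a stand-in for the hardware fused multiply-add:

#include <cassert>
#include <cmath>
#include <cstdio>

// What an xsnmsubadp-style instruction computes: -(a*c - b), fused.
static double nmsub(double a, double c, double b) {
  return -std::fma(a, c, -b);
}

int main() {
  const double samples[][3] = {
      {1.5, -2.0, 3.25}, {0.0, 7.0, -1.0}, {1e8, 1e-8, 2.0}};
  for (const auto &s : samples) {
    const double a = s[0], c = s[1], b = s[2];
    // Both sides are a single fused operation, so they round identically:
    // fma(-a, c, b) == -(a*c - b).
    assert(std::fma(-a, c, b) == nmsub(a, c, b));
  }
  std::puts("-a*c + b == -(a*c - b) held for the sampled inputs");
  return 0;
}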
diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp
index 5e3a48d..227919c 100644
--- a/lib/Target/PowerPC/PPCJITInfo.cpp
+++ b/lib/Target/PowerPC/PPCJITInfo.cpp
@@ -214,6 +214,10 @@ asm(
".text\n"
".align 2\n"
".globl PPC64CompilationCallback\n"
+#if _CALL_ELF == 2
+ ".type PPC64CompilationCallback,@function\n"
+"PPC64CompilationCallback:\n"
+#else
".section \".opd\",\"aw\",@progbits\n"
".align 3\n"
"PPC64CompilationCallback:\n"
@@ -223,6 +227,7 @@ asm(
".align 4\n"
".type PPC64CompilationCallback,@function\n"
".L.PPC64CompilationCallback:\n"
+#endif
# else
asm(
".text\n"
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index f61c8bf..029bb8a 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -19,11 +19,14 @@
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
@@ -32,35 +35,40 @@ static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
+ const TargetMachine &TM = AP.TM;
+ Mangler *Mang = AP.Mang;
+ const DataLayout *DL = TM.getDataLayout();
MCContext &Ctx = AP.OutContext;
SmallString<128> Name;
+ StringRef Suffix;
+ if (MO.getTargetFlags() == PPCII::MO_DARWIN_STUB)
+ Suffix = "$stub";
+ else if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG)
+ Suffix = "$non_lazy_ptr";
+
+ if (!Suffix.empty())
+ Name += DL->getPrivateGlobalPrefix();
+
+ unsigned PrefixLen = Name.size();
+
if (!MO.isGlobal()) {
assert(MO.isSymbol() && "Isn't a symbol reference");
- Name += AP.MAI->getGlobalPrefix();
- Name += MO.getSymbolName();
- } else {
+ Mang->getNameWithPrefix(Name, MO.getSymbolName());
+ } else {
const GlobalValue *GV = MO.getGlobal();
- bool isImplicitlyPrivate = false;
- if (MO.getTargetFlags() == PPCII::MO_DARWIN_STUB ||
- (MO.getTargetFlags() & PPCII::MO_NLP_FLAG))
- isImplicitlyPrivate = true;
-
- AP.Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+ TM.getNameWithPrefix(Name, GV, *Mang);
}
-
+
+ unsigned OrigLen = Name.size() - PrefixLen;
+
+ Name += Suffix;
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+ StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen);
+
// If the target flags on the operand changes the name of the symbol, do that
// before we return the symbol.
if (MO.getTargetFlags() == PPCII::MO_DARWIN_STUB) {
- Name += "$stub";
- const char *PGP = AP.MAI->getPrivateGlobalPrefix();
- const char *Prefix = "";
- if (!Name.startswith(PGP)) {
- // http://llvm.org/bugs/show_bug.cgi?id=15763
- // all stubs and lazy_ptrs should be local symbols, which need leading 'L'
- Prefix = PGP;
- }
- MCSymbol *Sym = Ctx.GetOrCreateSymbol(Twine(Prefix) + Twine(Name));
MachineModuleInfoImpl::StubValueTy &StubSym =
getMachOMMI(AP).getFnStubEntry(Sym);
if (StubSym.getPointer())
@@ -72,10 +80,9 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
StubValueTy(AP.getSymbol(MO.getGlobal()),
!MO.getGlobal()->hasInternalLinkage());
} else {
- Name.erase(Name.end()-5, Name.end());
StubSym =
MachineModuleInfoImpl::
- StubValueTy(Ctx.GetOrCreateSymbol(Name.str()), false);
+ StubValueTy(Ctx.GetOrCreateSymbol(OrigName), false);
}
return Sym;
}
@@ -83,9 +90,6 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
// If the symbol reference is actually to a non_lazy_ptr, not to the symbol,
// then add the suffix.
if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG) {
- Name += "$non_lazy_ptr";
- MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
-
MachineModuleInfoMachO &MachO = getMachOMMI(AP);
MachineModuleInfoImpl::StubValueTy &StubSym =
@@ -101,7 +105,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
return Sym;
}
- return Ctx.GetOrCreateSymbol(Name.str());
+ return Sym;
}
static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
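GetSymbolFromOperand now assembles the symbol name in one buffer: an optional private-global prefix, the mangled base name, and then the $stub or $non_lazy_ptr suffix, while recording the offsets needed to recover the un-suffixed name for the stub table. A rough stand-alone C++ sketch of that string layout follows; the helper and the literal "L" prefix are illustrative only and are not the LLVM API:

#include <cassert>
#include <string>

struct LoweredSymbol {
  std::string full; // e.g. "L_foo$stub", the symbol that gets created
  std::string orig; // e.g. "_foo", the un-suffixed name kept for the stub map
};

// "L" stands in for the private-global prefix; the real code asks the
// DataLayout for it and only prepends it when a suffix is present.
static LoweredSymbol lowerSymbolName(const std::string &mangledBase,
                                     const std::string &suffix) {
  std::string name;
  if (!suffix.empty())
    name += "L";                       // keep stubs and lazy pointers local
  const auto prefixLen = name.size();
  name += mangledBase;                 // mangled global or external symbol
  const auto origLen = name.size() - prefixLen;
  std::string orig = name.substr(prefixLen, origLen);
  name += suffix;                      // "$stub", "$non_lazy_ptr", or empty
  return {name, orig};
}

int main() {
  LoweredSymbol s = lowerSymbolName("_foo", "$stub");
  assert(s.full == "L_foo$stub" && s.orig == "_foo");
  LoweredSymbol p = lowerSymbolName("_bar", "");
  assert(p.full == "_bar" && p.orig == "_bar");
  return 0;
}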
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 19ccbfc..4ff282e 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -230,12 +229,33 @@ PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
case PPC::F8RCRegClassID:
case PPC::F4RCRegClassID:
case PPC::VRRCRegClassID:
+ case PPC::VFRCRegClassID:
+ case PPC::VSLRCRegClassID:
+ case PPC::VSHRCRegClassID:
return 32 - DefaultSafety;
+ case PPC::VSRCRegClassID:
+ case PPC::VSFRCRegClassID:
+ return 64 - DefaultSafety;
case PPC::CRRCRegClassID:
return 8 - DefaultSafety;
}
}
+const TargetRegisterClass*
+PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)const {
+ if (Subtarget.hasVSX()) {
+ // With VSX, we can inflate various sub-register classes to the full VSX
+ // register set.
+
+ if (RC == &PPC::F8RCRegClass)
+ return &PPC::VSFRCRegClass;
+ else if (RC == &PPC::VRRCRegClass)
+ return &PPC::VSRCRegClass;
+ }
+
+ return TargetRegisterInfo::getLargestLegalSuperClass(RC);
+}
+
//===----------------------------------------------------------------------===//
// Stack Frame Processing methods
//===----------------------------------------------------------------------===//
@@ -452,6 +472,127 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
MBB.erase(II);
}
+static unsigned getCRFromCRBit(unsigned SrcReg) {
+ unsigned Reg = 0;
+ if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR0GT ||
+ SrcReg == PPC::CR0EQ || SrcReg == PPC::CR0UN)
+ Reg = PPC::CR0;
+ else if (SrcReg == PPC::CR1LT || SrcReg == PPC::CR1GT ||
+ SrcReg == PPC::CR1EQ || SrcReg == PPC::CR1UN)
+ Reg = PPC::CR1;
+ else if (SrcReg == PPC::CR2LT || SrcReg == PPC::CR2GT ||
+ SrcReg == PPC::CR2EQ || SrcReg == PPC::CR2UN)
+ Reg = PPC::CR2;
+ else if (SrcReg == PPC::CR3LT || SrcReg == PPC::CR3GT ||
+ SrcReg == PPC::CR3EQ || SrcReg == PPC::CR3UN)
+ Reg = PPC::CR3;
+ else if (SrcReg == PPC::CR4LT || SrcReg == PPC::CR4GT ||
+ SrcReg == PPC::CR4EQ || SrcReg == PPC::CR4UN)
+ Reg = PPC::CR4;
+ else if (SrcReg == PPC::CR5LT || SrcReg == PPC::CR5GT ||
+ SrcReg == PPC::CR5EQ || SrcReg == PPC::CR5UN)
+ Reg = PPC::CR5;
+ else if (SrcReg == PPC::CR6LT || SrcReg == PPC::CR6GT ||
+ SrcReg == PPC::CR6EQ || SrcReg == PPC::CR6UN)
+ Reg = PPC::CR6;
+ else if (SrcReg == PPC::CR7LT || SrcReg == PPC::CR7GT ||
+ SrcReg == PPC::CR7EQ || SrcReg == PPC::CR7UN)
+ Reg = PPC::CR7;
+
+ assert(Reg != 0 && "Invalid CR bit register");
+ return Reg;
+}
+
+void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const {
+ // Get the instruction.
+ MachineInstr &MI = *II; // ; SPILL_CRBIT <SrcReg>, <offset>
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+
+ bool LP64 = Subtarget.isPPC64();
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+ unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+ unsigned SrcReg = MI.getOperand(0).getReg();
+
+ BuildMI(MBB, II, dl, TII.get(TargetOpcode::KILL),
+ getCRFromCRBit(SrcReg))
+ .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
+ .addReg(getCRFromCRBit(SrcReg));
+
+ // If the saved register wasn't CR0LT, shift the bits left so that the bit to
+ // store is the first one. Mask all but that bit.
+ unsigned Reg1 = Reg;
+ Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+ // rlwinm rA, rA, ShiftBits, 0, 0.
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+ .addReg(Reg1, RegState::Kill)
+ .addImm(getEncodingValue(SrcReg))
+ .addImm(0).addImm(0);
+
+ addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW))
+ .addReg(Reg, RegState::Kill),
+ FrameIndex);
+
+ // Discard the pseudo instruction.
+ MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const {
+ // Get the instruction.
+ MachineInstr &MI = *II; // ; <DestReg> = RESTORE_CRBIT <offset>
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ DebugLoc dl = MI.getDebugLoc();
+
+ bool LP64 = Subtarget.isPPC64();
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+ unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+ unsigned DestReg = MI.getOperand(0).getReg();
+ assert(MI.definesRegister(DestReg) &&
+ "RESTORE_CRBIT does not define its destination");
+
+ addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ),
+ Reg), FrameIndex);
+
+ BuildMI(MBB, II, dl, TII.get(TargetOpcode::IMPLICIT_DEF), DestReg);
+
+ unsigned RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), RegO)
+ .addReg(getCRFromCRBit(DestReg));
+
+ unsigned ShiftBits = getEncodingValue(DestReg);
+ // rlwimi r11, r10, 32-ShiftBits, ..., ...
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWIMI8 : PPC::RLWIMI), RegO)
+ .addReg(RegO, RegState::Kill).addReg(Reg, RegState::Kill)
+ .addImm(ShiftBits ? 32-ShiftBits : 0)
+ .addImm(ShiftBits).addImm(ShiftBits);
+
+ BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF),
+ getCRFromCRBit(DestReg))
+ .addReg(RegO, RegState::Kill)
+ // Make sure we have a use dependency all the way through this
+ // sequence of instructions. We can't have the other bits in the CR
+ // modified in between the mfocrf and the mtocrf.
+ .addReg(getCRFromCRBit(DestReg), RegState::Implicit);
+
+ // Discard the pseudo instruction.
+ MBB.erase(II);
+}
+
void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II,
unsigned FrameIndex) const {
// Get the instruction.
@@ -595,6 +736,12 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
} else if (OpC == PPC::RESTORE_CR) {
lowerCRRestore(II, FrameIndex);
return;
+ } else if (OpC == PPC::SPILL_CRBIT) {
+ lowerCRBitSpilling(II, FrameIndex);
+ return;
+ } else if (OpC == PPC::RESTORE_CRBIT) {
+ lowerCRBitRestore(II, FrameIndex);
+ return;
} else if (OpC == PPC::SPILL_VRSAVE) {
lowerVRSAVESpilling(II, FrameIndex);
return;
@@ -812,11 +959,8 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
.addFrameIndex(FrameIdx).addImm(Offset);
}
-void
-PPCRegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
- unsigned BaseReg, int64_t Offset) const {
- MachineInstr &MI = *I;
-
+void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const {
unsigned FIOperandNum = 0;
while (!MI.getOperand(FIOperandNum).isFI()) {
++FIOperandNum;
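lowerCRBitSpilling above isolates a single CR bit in a GPR with "rlwinm rA, rA, ShiftBits, 0, 0": the 32-bit CR image read by mfocrf is rotated left until the wanted bit sits in PowerPC bit 0 (the most significant bit) and everything else is masked off; lowerCRBitRestore reverses the transformation with rlwimi and mtocrf. A small C++ model of the rlwinm semantics, with illustrative helper names rather than LLVM code:

#include <cassert>
#include <cstdint>

// Rotate a 32-bit word left by sh bits.
static uint32_t rotl32(uint32_t x, unsigned sh) {
  sh &= 31;
  return sh ? (x << sh) | (x >> (32 - sh)) : x;
}

// Mask with ones from PowerPC bit MB through bit ME (inclusive), where bit 0
// is the most significant bit and MB <= ME.
static uint32_t ppcMask(unsigned mb, unsigned me) {
  uint32_t mask = 0;
  for (unsigned i = mb; i <= me; ++i)
    mask |= UINT32_C(1) << (31 - i);
  return mask;
}

// rlwinm rA, rS, SH, MB, ME: rotate left, then AND with the MB..ME mask.
static uint32_t rlwinm(uint32_t rs, unsigned sh, unsigned mb, unsigned me) {
  return rotl32(rs, sh) & ppcMask(mb, me);
}

int main() {
  // Suppose the spilled CR bit has encoding value 5, i.e. it occupies
  // PowerPC bit 5 of the CR image. Rotating left by 5 moves it into bit 0,
  // and the 0,0 mask keeps only that bit, which is what gets stored.
  const uint32_t crImage = UINT32_C(1) << (31 - 5); // only bit 5 set
  assert(rlwinm(crImage, 5, 0, 0) == UINT32_C(0x80000000));
  assert(rlwinm(0, 5, 0, 0) == 0);
  return 0;
}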
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index dd3bb40..c3e54b4 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -15,8 +15,8 @@
#ifndef POWERPC32_REGISTERINFO_H
#define POWERPC32_REGISTERINFO_H
-#include "llvm/ADT/DenseMap.h"
#include "PPC.h"
+#include "llvm/ADT/DenseMap.h"
#define GET_REGINFO_HEADER
#include "PPCGenRegisterInfo.inc"
@@ -40,6 +40,9 @@ public:
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const;
+ const TargetRegisterClass*
+ getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+
/// Code Generation virtual methods...
const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
const uint32_t *getCallPreservedMask(CallingConv::ID CC) const;
@@ -69,6 +72,10 @@ public:
unsigned FrameIndex) const;
void lowerCRRestore(MachineBasicBlock::iterator II,
unsigned FrameIndex) const;
+ void lowerCRBitSpilling(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const;
+ void lowerCRBitRestore(MachineBasicBlock::iterator II,
+ unsigned FrameIndex) const;
void lowerVRSAVESpilling(MachineBasicBlock::iterator II,
unsigned FrameIndex) const;
void lowerVRSAVERestore(MachineBasicBlock::iterator II,
@@ -85,8 +92,8 @@ public:
void materializeFrameBaseRegister(MachineBasicBlock *MBB,
unsigned BaseReg, int FrameIdx,
int64_t Offset) const;
- void resolveFrameIndex(MachineBasicBlock::iterator I,
- unsigned BaseReg, int64_t Offset) const;
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const;
bool isFrameOffsetLegal(const MachineInstr *MI, int64_t Offset) const;
// Debug information queries.
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index d566e2c..e11f7d4 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -16,6 +16,8 @@ def sub_gt : SubRegIndex<1, 1>;
def sub_eq : SubRegIndex<1, 2>;
def sub_un : SubRegIndex<1, 3>;
def sub_32 : SubRegIndex<32>;
+def sub_64 : SubRegIndex<64>;
+def sub_128 : SubRegIndex<128>;
}
@@ -47,9 +49,36 @@ class FPR<bits<5> num, string n> : PPCReg<n> {
let HWEncoding{4-0} = num;
}
-// VR - One of the 32 128-bit vector registers
-class VR<bits<5> num, string n> : PPCReg<n> {
+// VF - One of the 32 64-bit floating-point subregisters of the vector
+// registers (used by VSX).
+class VF<bits<5> num, string n> : PPCReg<n> {
let HWEncoding{4-0} = num;
+ let HWEncoding{5} = 1;
+}
+
+// VR - One of the 32 128-bit vector registers
+class VR<VF SubReg, string n> : PPCReg<n> {
+ let HWEncoding{4-0} = SubReg.HWEncoding{4-0};
+ let HWEncoding{5} = 0;
+ let SubRegs = [SubReg];
+ let SubRegIndices = [sub_64];
+}
+
+// VSRL - One of the 32 128-bit VSX registers that overlap with the scalar
+// floating-point registers.
+class VSRL<FPR SubReg, string n> : PPCReg<n> {
+ let HWEncoding = SubReg.HWEncoding;
+ let SubRegs = [SubReg];
+ let SubRegIndices = [sub_64];
+}
+
+// VSRH - One of the 32 128-bit VSX registers that overlap with the vector
+// registers.
+class VSRH<VR SubReg, string n> : PPCReg<n> {
+ let HWEncoding{4-0} = SubReg.HWEncoding{4-0};
+ let HWEncoding{5} = 1;
+ let SubRegs = [SubReg];
+ let SubRegIndices = [sub_128];
}
// CR - One of the 8 4-bit condition registers
@@ -80,12 +109,27 @@ foreach Index = 0-31 in {
DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
}
+// Floating-point vector subregisters (for VSX)
+foreach Index = 0-31 in {
+ def VF#Index : VF<Index, "vs" # !add(Index, 32)>;
+}
+
// Vector registers
foreach Index = 0-31 in {
- def V#Index : VR<Index, "v"#Index>,
+ def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>,
DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
}
+// VSX registers
+foreach Index = 0-31 in {
+ def VSL#Index : VSRL<!cast<FPR>("F"#Index), "vs"#Index>,
+ DwarfRegAlias<!cast<FPR>("F"#Index)>;
+}
+foreach Index = 0-31 in {
+ def VSH#Index : VSRH<!cast<VR>("V"#Index), "vs" # !add(Index, 32)>,
+ DwarfRegAlias<!cast<VR>("V"#Index)>;
+}
+
// The representation of r0 when treated as the constant 0.
def ZERO : GPR<0, "0">;
def ZERO8 : GP8<ZERO, "0">;
@@ -204,17 +248,39 @@ def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v4f32], 128,
V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;
-def CRBITRC : RegisterClass<"PPC", [i32], 32,
- (add CR0LT, CR0GT, CR0EQ, CR0UN,
- CR1LT, CR1GT, CR1EQ, CR1UN,
- CR2LT, CR2GT, CR2EQ, CR2UN,
+// VSX register classes (the allocation order mirrors that of the corresponding
+// subregister classes).
+def VSLRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
+ (add (sequence "VSL%u", 0, 13),
+ (sequence "VSL%u", 31, 14))>;
+def VSHRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
+ (add VSH2, VSH3, VSH4, VSH5, VSH0, VSH1, VSH6, VSH7,
+ VSH8, VSH9, VSH10, VSH11, VSH12, VSH13, VSH14,
+ VSH15, VSH16, VSH17, VSH18, VSH19, VSH31, VSH30,
+ VSH29, VSH28, VSH27, VSH26, VSH25, VSH24, VSH23,
+ VSH22, VSH21, VSH20)>;
+def VSRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
+ (add VSLRC, VSHRC)>;
+
+// Register classes for the 64-bit "scalar" VSX subregisters.
+def VFRC : RegisterClass<"PPC", [f64], 64,
+ (add VF2, VF3, VF4, VF5, VF0, VF1, VF6, VF7,
+ VF8, VF9, VF10, VF11, VF12, VF13, VF14,
+ VF15, VF16, VF17, VF18, VF19, VF31, VF30,
+ VF29, VF28, VF27, VF26, VF25, VF24, VF23,
+ VF22, VF21, VF20)>;
+def VSFRC : RegisterClass<"PPC", [f64], 64, (add F8RC, VFRC)>;
+
+def CRBITRC : RegisterClass<"PPC", [i1], 32,
+ (add CR2LT, CR2GT, CR2EQ, CR2UN,
CR3LT, CR3GT, CR3EQ, CR3UN,
CR4LT, CR4GT, CR4EQ, CR4UN,
CR5LT, CR5GT, CR5EQ, CR5UN,
CR6LT, CR6GT, CR6EQ, CR6UN,
- CR7LT, CR7GT, CR7EQ, CR7UN)>
-{
- let CopyCost = -1;
+ CR7LT, CR7GT, CR7EQ, CR7UN,
+ CR1LT, CR1GT, CR1EQ, CR1UN,
+ CR0LT, CR0GT, CR0EQ, CR0UN)> {
+ let Size = 32;
}
def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6,
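The VSX register definitions above pack the 6-bit register number into HWEncoding: bits 4-0 carry the index of the underlying FPR or Altivec register, and bit 5 selects the upper half of the 64-entry VSX file, so vs0-vs31 overlay F0-F31 and vs32-vs63 overlay V0-V31. A tiny illustrative C++ sketch of that numbering (not generated TableGen output):

#include <cassert>
#include <cstdio>

// Reconstruct the VSX register number from the encoding laid out in the .td
// file: bit 5 set means the register overlays an Altivec register (vs32-63),
// bits 4-0 are the index of the overlaid FPR or Altivec register.
static unsigned vsxNumber(bool overlaysAltivecReg, unsigned subIndex) {
  return (overlaysAltivecReg ? 32u : 0u) | (subIndex & 31u);
}

int main() {
  assert(vsxNumber(false, 7) == 7);  // F7 aliases vs7
  assert(vsxNumber(true, 7) == 39);  // V7 aliases vs39
  std::printf("F13 -> vs%u, V13 -> vs%u\n", vsxNumber(false, 13),
              vsxNumber(true, 13));
  return 0;
}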
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index 92ba69c..1221d41 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -8,114 +8,106 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Functional units across PowerPC chips sets
-//
-def BPU : FuncUnit; // Branch unit
-def SLU : FuncUnit; // Store/load unit
-def SRU : FuncUnit; // special register unit
-def IU1 : FuncUnit; // integer unit 1 (simple)
-def IU2 : FuncUnit; // integer unit 2 (complex)
-def FPU1 : FuncUnit; // floating point unit 1
-def FPU2 : FuncUnit; // floating point unit 2
-def VPU : FuncUnit; // vector permutation unit
-def VIU1 : FuncUnit; // vector integer unit 1 (simple)
-def VIU2 : FuncUnit; // vector integer unit 2 (complex)
-def VFPU : FuncUnit; // vector floating point unit
-
-//===----------------------------------------------------------------------===//
// Instruction Itinerary classes used for PowerPC
//
-def IntSimple : InstrItinClass;
-def IntGeneral : InstrItinClass;
-def IntCompare : InstrItinClass;
-def IntDivD : InstrItinClass;
-def IntDivW : InstrItinClass;
-def IntMFFS : InstrItinClass;
-def IntMFVSCR : InstrItinClass;
-def IntMTFSB0 : InstrItinClass;
-def IntMTSRD : InstrItinClass;
-def IntMulHD : InstrItinClass;
-def IntMulHW : InstrItinClass;
-def IntMulHWU : InstrItinClass;
-def IntMulLI : InstrItinClass;
-def IntRFID : InstrItinClass;
-def IntRotateD : InstrItinClass;
-def IntRotateDI : InstrItinClass;
-def IntRotate : InstrItinClass;
-def IntShift : InstrItinClass;
-def IntTrapD : InstrItinClass;
-def IntTrapW : InstrItinClass;
-def BrB : InstrItinClass;
-def BrCR : InstrItinClass;
-def BrMCR : InstrItinClass;
-def BrMCRX : InstrItinClass;
-def LdStDCBA : InstrItinClass;
-def LdStDCBF : InstrItinClass;
-def LdStDCBI : InstrItinClass;
-def LdStLoad : InstrItinClass;
-def LdStLoadUpd : InstrItinClass;
-def LdStStore : InstrItinClass;
-def LdStStoreUpd : InstrItinClass;
-def LdStDSS : InstrItinClass;
-def LdStICBI : InstrItinClass;
-def LdStLD : InstrItinClass;
-def LdStLDU : InstrItinClass;
-def LdStLDARX : InstrItinClass;
-def LdStLFD : InstrItinClass;
-def LdStLFDU : InstrItinClass;
-def LdStLHA : InstrItinClass;
-def LdStLHAU : InstrItinClass;
-def LdStLMW : InstrItinClass;
-def LdStLVecX : InstrItinClass;
-def LdStLWA : InstrItinClass;
-def LdStLWARX : InstrItinClass;
-def LdStSLBIA : InstrItinClass;
-def LdStSLBIE : InstrItinClass;
-def LdStSTD : InstrItinClass;
-def LdStSTDCX : InstrItinClass;
-def LdStSTDU : InstrItinClass;
-def LdStSTFD : InstrItinClass;
-def LdStSTFDU : InstrItinClass;
-def LdStSTVEBX : InstrItinClass;
-def LdStSTWCX : InstrItinClass;
-def LdStSync : InstrItinClass;
-def SprISYNC : InstrItinClass;
-def SprMFSR : InstrItinClass;
-def SprMTMSR : InstrItinClass;
-def SprMTSR : InstrItinClass;
-def SprTLBSYNC : InstrItinClass;
-def SprMFCR : InstrItinClass;
-def SprMFMSR : InstrItinClass;
-def SprMFSPR : InstrItinClass;
-def SprMFTB : InstrItinClass;
-def SprMTSPR : InstrItinClass;
-def SprMTSRIN : InstrItinClass;
-def SprRFI : InstrItinClass;
-def SprSC : InstrItinClass;
-def FPGeneral : InstrItinClass;
-def FPAddSub : InstrItinClass;
-def FPCompare : InstrItinClass;
-def FPDivD : InstrItinClass;
-def FPDivS : InstrItinClass;
-def FPFused : InstrItinClass;
-def FPRes : InstrItinClass;
-def FPSqrt : InstrItinClass;
-def VecGeneral : InstrItinClass;
-def VecFP : InstrItinClass;
-def VecFPCompare : InstrItinClass;
-def VecComplex : InstrItinClass;
-def VecPerm : InstrItinClass;
-def VecFPRound : InstrItinClass;
-def VecVSL : InstrItinClass;
-def VecVSR : InstrItinClass;
-def SprMTMSRD : InstrItinClass;
-def SprSLIE : InstrItinClass;
-def SprSLBIE : InstrItinClass;
-def SprSLBMTE : InstrItinClass;
-def SprSLBMFEE : InstrItinClass;
-def SprSLBIA : InstrItinClass;
-def SprTLBIEL : InstrItinClass;
-def SprTLBIE : InstrItinClass;
+def IIC_IntSimple : InstrItinClass;
+def IIC_IntGeneral : InstrItinClass;
+def IIC_IntCompare : InstrItinClass;
+def IIC_IntDivD : InstrItinClass;
+def IIC_IntDivW : InstrItinClass;
+def IIC_IntMFFS : InstrItinClass;
+def IIC_IntMFVSCR : InstrItinClass;
+def IIC_IntMTFSB0 : InstrItinClass;
+def IIC_IntMTSRD : InstrItinClass;
+def IIC_IntMulHD : InstrItinClass;
+def IIC_IntMulHW : InstrItinClass;
+def IIC_IntMulHWU : InstrItinClass;
+def IIC_IntMulLI : InstrItinClass;
+def IIC_IntRFID : InstrItinClass;
+def IIC_IntRotateD : InstrItinClass;
+def IIC_IntRotateDI : InstrItinClass;
+def IIC_IntRotate : InstrItinClass;
+def IIC_IntShift : InstrItinClass;
+def IIC_IntTrapD : InstrItinClass;
+def IIC_IntTrapW : InstrItinClass;
+def IIC_BrB : InstrItinClass;
+def IIC_BrCR : InstrItinClass;
+def IIC_BrMCR : InstrItinClass;
+def IIC_BrMCRX : InstrItinClass;
+def IIC_LdStDCBA : InstrItinClass;
+def IIC_LdStDCBF : InstrItinClass;
+def IIC_LdStDCBI : InstrItinClass;
+def IIC_LdStLoad : InstrItinClass;
+def IIC_LdStLoadUpd : InstrItinClass;
+def IIC_LdStLoadUpdX : InstrItinClass;
+def IIC_LdStStore : InstrItinClass;
+def IIC_LdStStoreUpd : InstrItinClass;
+def IIC_LdStDSS : InstrItinClass;
+def IIC_LdStICBI : InstrItinClass;
+def IIC_LdStLD : InstrItinClass;
+def IIC_LdStLDU : InstrItinClass;
+def IIC_LdStLDUX : InstrItinClass;
+def IIC_LdStLDARX : InstrItinClass;
+def IIC_LdStLFD : InstrItinClass;
+def IIC_LdStLFDU : InstrItinClass;
+def IIC_LdStLFDUX : InstrItinClass;
+def IIC_LdStLHA : InstrItinClass;
+def IIC_LdStLHAU : InstrItinClass;
+def IIC_LdStLHAUX : InstrItinClass;
+def IIC_LdStLMW : InstrItinClass;
+def IIC_LdStLVecX : InstrItinClass;
+def IIC_LdStLWA : InstrItinClass;
+def IIC_LdStLWARX : InstrItinClass;
+def IIC_LdStSLBIA : InstrItinClass;
+def IIC_LdStSLBIE : InstrItinClass;
+def IIC_LdStSTD : InstrItinClass;
+def IIC_LdStSTDCX : InstrItinClass;
+def IIC_LdStSTDU : InstrItinClass;
+def IIC_LdStSTDUX : InstrItinClass;
+def IIC_LdStSTFD : InstrItinClass;
+def IIC_LdStSTFDU : InstrItinClass;
+def IIC_LdStSTVEBX : InstrItinClass;
+def IIC_LdStSTWCX : InstrItinClass;
+def IIC_LdStSync : InstrItinClass;
+def IIC_SprISYNC : InstrItinClass;
+def IIC_SprMFSR : InstrItinClass;
+def IIC_SprMTMSR : InstrItinClass;
+def IIC_SprMTSR : InstrItinClass;
+def IIC_SprTLBSYNC : InstrItinClass;
+def IIC_SprMFCR : InstrItinClass;
+def IIC_SprMFCRF : InstrItinClass;
+def IIC_SprMFMSR : InstrItinClass;
+def IIC_SprMFSPR : InstrItinClass;
+def IIC_SprMFTB : InstrItinClass;
+def IIC_SprMTSPR : InstrItinClass;
+def IIC_SprMTSRIN : InstrItinClass;
+def IIC_SprRFI : InstrItinClass;
+def IIC_SprSC : InstrItinClass;
+def IIC_FPGeneral : InstrItinClass;
+def IIC_FPAddSub : InstrItinClass;
+def IIC_FPCompare : InstrItinClass;
+def IIC_FPDivD : InstrItinClass;
+def IIC_FPDivS : InstrItinClass;
+def IIC_FPFused : InstrItinClass;
+def IIC_FPRes : InstrItinClass;
+def IIC_FPSqrtD : InstrItinClass;
+def IIC_FPSqrtS : InstrItinClass;
+def IIC_VecGeneral : InstrItinClass;
+def IIC_VecFP : InstrItinClass;
+def IIC_VecFPCompare : InstrItinClass;
+def IIC_VecComplex : InstrItinClass;
+def IIC_VecPerm : InstrItinClass;
+def IIC_VecFPRound : InstrItinClass;
+def IIC_VecVSL : InstrItinClass;
+def IIC_VecVSR : InstrItinClass;
+def IIC_SprMTMSRD : InstrItinClass;
+def IIC_SprSLIE : InstrItinClass;
+def IIC_SprSLBIE : InstrItinClass;
+def IIC_SprSLBMTE : InstrItinClass;
+def IIC_SprSLBMFEE : InstrItinClass;
+def IIC_SprSLBIA : InstrItinClass;
+def IIC_SprTLBIEL : InstrItinClass;
+def IIC_SprTLBIE : InstrItinClass;
//===----------------------------------------------------------------------===//
// Processor instruction itineraries.
@@ -125,6 +117,7 @@ include "PPCSchedule440.td"
include "PPCScheduleG4.td"
include "PPCScheduleG4Plus.td"
include "PPCScheduleG5.td"
+include "PPCScheduleP7.td"
include "PPCScheduleA2.td"
include "PPCScheduleE500mc.td"
include "PPCScheduleE5500.td"
@@ -136,392 +129,392 @@ include "PPCScheduleE5500.td"
//
// opcode itinerary class
// ====== ===============
-// add IntSimple
-// addc IntGeneral
-// adde IntGeneral
-// addi IntSimple
-// addic IntGeneral
-// addic. IntGeneral
-// addis IntSimple
-// addme IntGeneral
-// addze IntGeneral
-// and IntSimple
-// andc IntSimple
-// andi. IntGeneral
-// andis. IntGeneral
-// b BrB
-// bc BrB
-// bcctr BrB
-// bclr BrB
-// cmp IntCompare
-// cmpi IntCompare
-// cmpl IntCompare
-// cmpli IntCompare
-// cntlzd IntRotateD
-// cntlzw IntGeneral
-// crand BrCR
-// crandc BrCR
-// creqv BrCR
-// crnand BrCR
-// crnor BrCR
-// cror BrCR
-// crorc BrCR
-// crxor BrCR
-// dcba LdStDCBA
-// dcbf LdStDCBF
-// dcbi LdStDCBI
-// dcbst LdStDCBF
-// dcbt LdStLoad
-// dcbtst LdStLoad
-// dcbz LdStDCBF
-// divd IntDivD
-// divdu IntDivD
-// divw IntDivW
-// divwu IntDivW
-// dss LdStDSS
-// dst LdStDSS
-// dstst LdStDSS
-// eciwx LdStLoad
-// ecowx LdStLoad
-// eieio LdStLoad
-// eqv IntSimple
-// extsb IntSimple
-// extsh IntSimple
-// extsw IntSimple
-// fabs FPGeneral
-// fadd FPAddSub
-// fadds FPGeneral
-// fcfid FPGeneral
-// fcmpo FPCompare
-// fcmpu FPCompare
-// fctid FPGeneral
-// fctidz FPGeneral
-// fctiw FPGeneral
-// fctiwz FPGeneral
-// fdiv FPDivD
-// fdivs FPDivS
-// fmadd FPFused
-// fmadds FPGeneral
-// fmr FPGeneral
-// fmsub FPFused
-// fmsubs FPGeneral
-// fmul FPFused
-// fmuls FPGeneral
-// fnabs FPGeneral
-// fneg FPGeneral
-// fnmadd FPFused
-// fnmadds FPGeneral
-// fnmsub FPFused
-// fnmsubs FPGeneral
-// fres FPRes
-// frsp FPGeneral
-// frsqrte FPGeneral
-// fsel FPGeneral
-// fsqrt FPSqrt
-// fsqrts FPSqrt
-// fsub FPAddSub
-// fsubs FPGeneral
-// icbi LdStICBI
-// isync SprISYNC
-// lbz LdStLoad
-// lbzu LdStLoadUpd
-// lbzux LdStLoadUpd
-// lbzx LdStLoad
-// ld LdStLD
-// ldarx LdStLDARX
-// ldu LdStLDU
-// ldux LdStLDU
-// ldx LdStLD
-// lfd LdStLFD
-// lfdu LdStLFDU
-// lfdux LdStLFDU
-// lfdx LdStLFD
-// lfs LdStLFD
-// lfsu LdStLFDU
-// lfsux LdStLFDU
-// lfsx LdStLFD
-// lha LdStLHA
-// lhau LdStLHAU
-// lhaux LdStLHAU
-// lhax LdStLHA
-// lhbrx LdStLoad
-// lhz LdStLoad
-// lhzu LdStLoadUpd
-// lhzux LdStLoadUpd
-// lhzx LdStLoad
-// lmw LdStLMW
-// lswi LdStLMW
-// lswx LdStLMW
-// lvebx LdStLVecX
-// lvehx LdStLVecX
-// lvewx LdStLVecX
-// lvsl LdStLVecX
-// lvsr LdStLVecX
-// lvx LdStLVecX
-// lvxl LdStLVecX
-// lwa LdStLWA
-// lwarx LdStLWARX
-// lwaux LdStLHAU
-// lwax LdStLHA
-// lwbrx LdStLoad
-// lwz LdStLoad
-// lwzu LdStLoadUpd
-// lwzux LdStLoadUpd
-// lwzx LdStLoad
-// mcrf BrMCR
-// mcrfs FPGeneral
-// mcrxr BrMCRX
-// mfcr SprMFCR
-// mffs IntMFFS
-// mfmsr SprMFMSR
-// mfspr SprMFSPR
-// mfsr SprMFSR
-// mfsrin SprMFSR
-// mftb SprMFTB
-// mfvscr IntMFVSCR
-// mtcrf BrMCRX
-// mtfsb0 IntMTFSB0
-// mtfsb1 IntMTFSB0
-// mtfsf IntMTFSB0
-// mtfsfi IntMTFSB0
-// mtmsr SprMTMSR
-// mtmsrd LdStLD
-// mtspr SprMTSPR
-// mtsr SprMTSR
-// mtsrd IntMTSRD
-// mtsrdin IntMTSRD
-// mtsrin SprMTSRIN
-// mtvscr IntMFVSCR
-// mulhd IntMulHD
-// mulhdu IntMulHD
-// mulhw IntMulHW
-// mulhwu IntMulHWU
-// mulld IntMulHD
-// mulli IntMulLI
-// mullw IntMulHW
-// nand IntSimple
-// neg IntSimple
-// nor IntSimple
-// or IntSimple
-// orc IntSimple
-// ori IntSimple
-// oris IntSimple
-// rfi SprRFI
-// rfid IntRFID
-// rldcl IntRotateD
-// rldcr IntRotateD
-// rldic IntRotateDI
-// rldicl IntRotateDI
-// rldicr IntRotateDI
-// rldimi IntRotateDI
-// rlwimi IntRotate
-// rlwinm IntGeneral
-// rlwnm IntGeneral
-// sc SprSC
-// slbia LdStSLBIA
-// slbie LdStSLBIE
-// sld IntRotateD
-// slw IntGeneral
-// srad IntRotateD
-// sradi IntRotateDI
-// sraw IntShift
-// srawi IntShift
-// srd IntRotateD
-// srw IntGeneral
-// stb LdStStore
-// stbu LdStStoreUpd
-// stbux LdStStoreUpd
-// stbx LdStStore
-// std LdStSTD
-// stdcx. LdStSTDCX
-// stdu LdStSTDU
-// stdux LdStSTDU
-// stdx LdStSTD
-// stfd LdStSTFD
-// stfdu LdStSTFDU
-// stfdux LdStSTFDU
-// stfdx LdStSTFD
-// stfiwx LdStSTFD
-// stfs LdStSTFD
-// stfsu LdStSTFDU
-// stfsux LdStSTFDU
-// stfsx LdStSTFD
-// sth LdStStore
-// sthbrx LdStStore
-// sthu LdStStoreUpd
-// sthux LdStStoreUpd
-// sthx LdStStore
-// stmw LdStLMW
-// stswi LdStLMW
-// stswx LdStLMW
-// stvebx LdStSTVEBX
-// stvehx LdStSTVEBX
-// stvewx LdStSTVEBX
-// stvx LdStSTVEBX
-// stvxl LdStSTVEBX
-// stw LdStStore
-// stwbrx LdStStore
-// stwcx. LdStSTWCX
-// stwu LdStStoreUpd
-// stwux LdStStoreUpd
-// stwx LdStStore
-// subf IntGeneral
-// subfc IntGeneral
-// subfe IntGeneral
-// subfic IntGeneral
-// subfme IntGeneral
-// subfze IntGeneral
-// sync LdStSync
-// td IntTrapD
-// tdi IntTrapD
-// tlbia LdStSLBIA
-// tlbie LdStDCBF
-// tlbsync SprTLBSYNC
-// tw IntTrapW
-// twi IntTrapW
-// vaddcuw VecGeneral
-// vaddfp VecFP
-// vaddsbs VecGeneral
-// vaddshs VecGeneral
-// vaddsws VecGeneral
-// vaddubm VecGeneral
-// vaddubs VecGeneral
-// vadduhm VecGeneral
-// vadduhs VecGeneral
-// vadduwm VecGeneral
-// vadduws VecGeneral
-// vand VecGeneral
-// vandc VecGeneral
-// vavgsb VecGeneral
-// vavgsh VecGeneral
-// vavgsw VecGeneral
-// vavgub VecGeneral
-// vavguh VecGeneral
-// vavguw VecGeneral
-// vcfsx VecFP
-// vcfux VecFP
-// vcmpbfp VecFPCompare
-// vcmpeqfp VecFPCompare
-// vcmpequb VecGeneral
-// vcmpequh VecGeneral
-// vcmpequw VecGeneral
-// vcmpgefp VecFPCompare
-// vcmpgtfp VecFPCompare
-// vcmpgtsb VecGeneral
-// vcmpgtsh VecGeneral
-// vcmpgtsw VecGeneral
-// vcmpgtub VecGeneral
-// vcmpgtuh VecGeneral
-// vcmpgtuw VecGeneral
-// vctsxs VecFP
-// vctuxs VecFP
-// vexptefp VecFP
-// vlogefp VecFP
-// vmaddfp VecFP
-// vmaxfp VecFPCompare
-// vmaxsb VecGeneral
-// vmaxsh VecGeneral
-// vmaxsw VecGeneral
-// vmaxub VecGeneral
-// vmaxuh VecGeneral
-// vmaxuw VecGeneral
-// vmhaddshs VecComplex
-// vmhraddshs VecComplex
-// vminfp VecFPCompare
-// vminsb VecGeneral
-// vminsh VecGeneral
-// vminsw VecGeneral
-// vminub VecGeneral
-// vminuh VecGeneral
-// vminuw VecGeneral
-// vmladduhm VecComplex
-// vmrghb VecPerm
-// vmrghh VecPerm
-// vmrghw VecPerm
-// vmrglb VecPerm
-// vmrglh VecPerm
-// vmrglw VecPerm
-// vmsubfp VecFP
-// vmsummbm VecComplex
-// vmsumshm VecComplex
-// vmsumshs VecComplex
-// vmsumubm VecComplex
-// vmsumuhm VecComplex
-// vmsumuhs VecComplex
-// vmulesb VecComplex
-// vmulesh VecComplex
-// vmuleub VecComplex
-// vmuleuh VecComplex
-// vmulosb VecComplex
-// vmulosh VecComplex
-// vmuloub VecComplex
-// vmulouh VecComplex
-// vnor VecGeneral
-// vor VecGeneral
-// vperm VecPerm
-// vpkpx VecPerm
-// vpkshss VecPerm
-// vpkshus VecPerm
-// vpkswss VecPerm
-// vpkswus VecPerm
-// vpkuhum VecPerm
-// vpkuhus VecPerm
-// vpkuwum VecPerm
-// vpkuwus VecPerm
-// vrefp VecFPRound
-// vrfim VecFPRound
-// vrfin VecFPRound
-// vrfip VecFPRound
-// vrfiz VecFPRound
-// vrlb VecGeneral
-// vrlh VecGeneral
-// vrlw VecGeneral
-// vrsqrtefp VecFP
-// vsel VecGeneral
-// vsl VecVSL
-// vslb VecGeneral
-// vsldoi VecPerm
-// vslh VecGeneral
-// vslo VecPerm
-// vslw VecGeneral
-// vspltb VecPerm
-// vsplth VecPerm
-// vspltisb VecPerm
-// vspltish VecPerm
-// vspltisw VecPerm
-// vspltw VecPerm
-// vsr VecVSR
-// vsrab VecGeneral
-// vsrah VecGeneral
-// vsraw VecGeneral
-// vsrb VecGeneral
-// vsrh VecGeneral
-// vsro VecPerm
-// vsrw VecGeneral
-// vsubcuw VecGeneral
-// vsubfp VecFP
-// vsubsbs VecGeneral
-// vsubshs VecGeneral
-// vsubsws VecGeneral
-// vsububm VecGeneral
-// vsububs VecGeneral
-// vsubuhm VecGeneral
-// vsubuhs VecGeneral
-// vsubuwm VecGeneral
-// vsubuws VecGeneral
-// vsum2sws VecComplex
-// vsum4sbs VecComplex
-// vsum4shs VecComplex
-// vsum4ubs VecComplex
-// vsumsws VecComplex
-// vupkhpx VecPerm
-// vupkhsb VecPerm
-// vupkhsh VecPerm
-// vupklpx VecPerm
-// vupklsb VecPerm
-// vupklsh VecPerm
-// vxor VecGeneral
-// xor IntSimple
-// xori IntSimple
-// xoris IntSimple
+// add IIC_IntSimple
+// addc IIC_IntGeneral
+// adde IIC_IntGeneral
+// addi IIC_IntSimple
+// addic IIC_IntGeneral
+// addic. IIC_IntGeneral
+// addis IIC_IntSimple
+// addme IIC_IntGeneral
+// addze IIC_IntGeneral
+// and IIC_IntSimple
+// andc IIC_IntSimple
+// andi. IIC_IntGeneral
+// andis. IIC_IntGeneral
+// b IIC_BrB
+// bc IIC_BrB
+// bcctr IIC_BrB
+// bclr IIC_BrB
+// cmp IIC_IntCompare
+// cmpi IIC_IntCompare
+// cmpl IIC_IntCompare
+// cmpli IIC_IntCompare
+// cntlzd IIC_IntRotateD
+// cntlzw IIC_IntGeneral
+// crand IIC_BrCR
+// crandc IIC_BrCR
+// creqv IIC_BrCR
+// crnand IIC_BrCR
+// crnor IIC_BrCR
+// cror IIC_BrCR
+// crorc IIC_BrCR
+// crxor IIC_BrCR
+// dcba IIC_LdStDCBA
+// dcbf IIC_LdStDCBF
+// dcbi IIC_LdStDCBI
+// dcbst IIC_LdStDCBF
+// dcbt IIC_LdStLoad
+// dcbtst IIC_LdStLoad
+// dcbz IIC_LdStDCBF
+// divd IIC_IntDivD
+// divdu IIC_IntDivD
+// divw IIC_IntDivW
+// divwu IIC_IntDivW
+// dss IIC_LdStDSS
+// dst IIC_LdStDSS
+// dstst IIC_LdStDSS
+// eciwx IIC_LdStLoad
+// ecowx IIC_LdStLoad
+// eieio IIC_LdStLoad
+// eqv IIC_IntSimple
+// extsb IIC_IntSimple
+// extsh IIC_IntSimple
+// extsw IIC_IntSimple
+// fabs IIC_FPGeneral
+// fadd IIC_FPAddSub
+// fadds IIC_FPGeneral
+// fcfid IIC_FPGeneral
+// fcmpo IIC_FPCompare
+// fcmpu IIC_FPCompare
+// fctid IIC_FPGeneral
+// fctidz IIC_FPGeneral
+// fctiw IIC_FPGeneral
+// fctiwz IIC_FPGeneral
+// fdiv IIC_FPDivD
+// fdivs IIC_FPDivS
+// fmadd IIC_FPFused
+// fmadds IIC_FPGeneral
+// fmr IIC_FPGeneral
+// fmsub IIC_FPFused
+// fmsubs IIC_FPGeneral
+// fmul IIC_FPFused
+// fmuls IIC_FPGeneral
+// fnabs IIC_FPGeneral
+// fneg IIC_FPGeneral
+// fnmadd IIC_FPFused
+// fnmadds IIC_FPGeneral
+// fnmsub IIC_FPFused
+// fnmsubs IIC_FPGeneral
+// fres IIC_FPRes
+// frsp IIC_FPGeneral
+// frsqrte IIC_FPGeneral
+// fsel IIC_FPGeneral
+// fsqrt IIC_FPSqrtD
+// fsqrts IIC_FPSqrtS
+// fsub IIC_FPAddSub
+// fsubs IIC_FPGeneral
+// icbi IIC_LdStICBI
+// isync IIC_SprISYNC
+// lbz IIC_LdStLoad
+// lbzu IIC_LdStLoadUpd
+// lbzux IIC_LdStLoadUpdX
+// lbzx IIC_LdStLoad
+// ld IIC_LdStLD
+// ldarx IIC_LdStLDARX
+// ldu IIC_LdStLDU
+// ldux IIC_LdStLDUX
+// ldx IIC_LdStLD
+// lfd IIC_LdStLFD
+// lfdu IIC_LdStLFDU
+// lfdux IIC_LdStLFDUX
+// lfdx IIC_LdStLFD
+// lfs IIC_LdStLFD
+// lfsu IIC_LdStLFDU
+// lfsux IIC_LdStLFDUX
+// lfsx IIC_LdStLFD
+// lha IIC_LdStLHA
+// lhau IIC_LdStLHAU
+// lhaux IIC_LdStLHAUX
+// lhax IIC_LdStLHA
+// lhbrx IIC_LdStLoad
+// lhz IIC_LdStLoad
+// lhzu IIC_LdStLoadUpd
+// lhzux IIC_LdStLoadUpdX
+// lhzx IIC_LdStLoad
+// lmw IIC_LdStLMW
+// lswi IIC_LdStLMW
+// lswx IIC_LdStLMW
+// lvebx IIC_LdStLVecX
+// lvehx IIC_LdStLVecX
+// lvewx IIC_LdStLVecX
+// lvsl IIC_LdStLVecX
+// lvsr IIC_LdStLVecX
+// lvx IIC_LdStLVecX
+// lvxl IIC_LdStLVecX
+// lwa IIC_LdStLWA
+// lwarx IIC_LdStLWARX
+// lwaux IIC_LdStLHAUX
+// lwax IIC_LdStLHA
+// lwbrx IIC_LdStLoad
+// lwz IIC_LdStLoad
+// lwzu IIC_LdStLoadUpd
+// lwzux IIC_LdStLoadUpdX
+// lwzx IIC_LdStLoad
+// mcrf IIC_BrMCR
+// mcrfs IIC_FPGeneral
+// mcrxr IIC_BrMCRX
+// mfcr IIC_SprMFCR
+// mffs IIC_IntMFFS
+// mfmsr IIC_SprMFMSR
+// mfspr IIC_SprMFSPR
+// mfsr IIC_SprMFSR
+// mfsrin IIC_SprMFSR
+// mftb IIC_SprMFTB
+// mfvscr IIC_IntMFVSCR
+// mtcrf IIC_BrMCRX
+// mtfsb0 IIC_IntMTFSB0
+// mtfsb1 IIC_IntMTFSB0
+// mtfsf IIC_IntMTFSB0
+// mtfsfi IIC_IntMTFSB0
+// mtmsr IIC_SprMTMSR
+// mtmsrd IIC_LdStLD
+// mtspr IIC_SprMTSPR
+// mtsr IIC_SprMTSR
+// mtsrd IIC_IntMTSRD
+// mtsrdin IIC_IntMTSRD
+// mtsrin IIC_SprMTSRIN
+// mtvscr IIC_IntMFVSCR
+// mulhd IIC_IntMulHD
+// mulhdu IIC_IntMulHD
+// mulhw IIC_IntMulHW
+// mulhwu IIC_IntMulHWU
+// mulld IIC_IntMulHD
+// mulli IIC_IntMulLI
+// mullw IIC_IntMulHW
+// nand IIC_IntSimple
+// neg IIC_IntSimple
+// nor IIC_IntSimple
+// or IIC_IntSimple
+// orc IIC_IntSimple
+// ori IIC_IntSimple
+// oris IIC_IntSimple
+// rfi IIC_SprRFI
+// rfid IIC_IntRFID
+// rldcl IIC_IntRotateD
+// rldcr IIC_IntRotateD
+// rldic IIC_IntRotateDI
+// rldicl IIC_IntRotateDI
+// rldicr IIC_IntRotateDI
+// rldimi IIC_IntRotateDI
+// rlwimi IIC_IntRotate
+// rlwinm IIC_IntGeneral
+// rlwnm IIC_IntGeneral
+// sc IIC_SprSC
+// slbia IIC_LdStSLBIA
+// slbie IIC_LdStSLBIE
+// sld IIC_IntRotateD
+// slw IIC_IntGeneral
+// srad IIC_IntRotateD
+// sradi IIC_IntRotateDI
+// sraw IIC_IntShift
+// srawi IIC_IntShift
+// srd IIC_IntRotateD
+// srw IIC_IntGeneral
+// stb IIC_LdStStore
+// stbu IIC_LdStStoreUpd
+// stbux IIC_LdStStoreUpd
+// stbx IIC_LdStStore
+// std IIC_LdStSTD
+// stdcx. IIC_LdStSTDCX
+// stdu IIC_LdStSTDU
+// stdux IIC_LdStSTDUX
+// stdx IIC_LdStSTD
+// stfd IIC_LdStSTFD
+// stfdu IIC_LdStSTFDU
+// stfdux IIC_LdStSTFDU
+// stfdx IIC_LdStSTFD
+// stfiwx IIC_LdStSTFD
+// stfs IIC_LdStSTFD
+// stfsu IIC_LdStSTFDU
+// stfsux IIC_LdStSTFDU
+// stfsx IIC_LdStSTFD
+// sth IIC_LdStStore
+// sthbrx IIC_LdStStore
+// sthu IIC_LdStStoreUpd
+// sthux IIC_LdStStoreUpd
+// sthx IIC_LdStStore
+// stmw IIC_LdStLMW
+// stswi IIC_LdStLMW
+// stswx IIC_LdStLMW
+// stvebx IIC_LdStSTVEBX
+// stvehx IIC_LdStSTVEBX
+// stvewx IIC_LdStSTVEBX
+// stvx IIC_LdStSTVEBX
+// stvxl IIC_LdStSTVEBX
+// stw IIC_LdStStore
+// stwbrx IIC_LdStStore
+// stwcx. IIC_LdStSTWCX
+// stwu IIC_LdStStoreUpd
+// stwux IIC_LdStStoreUpd
+// stwx IIC_LdStStore
+// subf IIC_IntGeneral
+// subfc IIC_IntGeneral
+// subfe IIC_IntGeneral
+// subfic IIC_IntGeneral
+// subfme IIC_IntGeneral
+// subfze IIC_IntGeneral
+// sync IIC_LdStSync
+// td IIC_IntTrapD
+// tdi IIC_IntTrapD
+// tlbia IIC_LdStSLBIA
+// tlbie IIC_LdStDCBF
+// tlbsync IIC_SprTLBSYNC
+// tw IIC_IntTrapW
+// twi IIC_IntTrapW
+// vaddcuw IIC_VecGeneral
+// vaddfp IIC_VecFP
+// vaddsbs IIC_VecGeneral
+// vaddshs IIC_VecGeneral
+// vaddsws IIC_VecGeneral
+// vaddubm IIC_VecGeneral
+// vaddubs IIC_VecGeneral
+// vadduhm IIC_VecGeneral
+// vadduhs IIC_VecGeneral
+// vadduwm IIC_VecGeneral
+// vadduws IIC_VecGeneral
+// vand IIC_VecGeneral
+// vandc IIC_VecGeneral
+// vavgsb IIC_VecGeneral
+// vavgsh IIC_VecGeneral
+// vavgsw IIC_VecGeneral
+// vavgub IIC_VecGeneral
+// vavguh IIC_VecGeneral
+// vavguw IIC_VecGeneral
+// vcfsx IIC_VecFP
+// vcfux IIC_VecFP
+// vcmpbfp IIC_VecFPCompare
+// vcmpeqfp IIC_VecFPCompare
+// vcmpequb IIC_VecGeneral
+// vcmpequh IIC_VecGeneral
+// vcmpequw IIC_VecGeneral
+// vcmpgefp IIC_VecFPCompare
+// vcmpgtfp IIC_VecFPCompare
+// vcmpgtsb IIC_VecGeneral
+// vcmpgtsh IIC_VecGeneral
+// vcmpgtsw IIC_VecGeneral
+// vcmpgtub IIC_VecGeneral
+// vcmpgtuh IIC_VecGeneral
+// vcmpgtuw IIC_VecGeneral
+// vctsxs IIC_VecFP
+// vctuxs IIC_VecFP
+// vexptefp IIC_VecFP
+// vlogefp IIC_VecFP
+// vmaddfp IIC_VecFP
+// vmaxfp IIC_VecFPCompare
+// vmaxsb IIC_VecGeneral
+// vmaxsh IIC_VecGeneral
+// vmaxsw IIC_VecGeneral
+// vmaxub IIC_VecGeneral
+// vmaxuh IIC_VecGeneral
+// vmaxuw IIC_VecGeneral
+// vmhaddshs IIC_VecComplex
+// vmhraddshs IIC_VecComplex
+// vminfp IIC_VecFPCompare
+// vminsb IIC_VecGeneral
+// vminsh IIC_VecGeneral
+// vminsw IIC_VecGeneral
+// vminub IIC_VecGeneral
+// vminuh IIC_VecGeneral
+// vminuw IIC_VecGeneral
+// vmladduhm IIC_VecComplex
+// vmrghb IIC_VecPerm
+// vmrghh IIC_VecPerm
+// vmrghw IIC_VecPerm
+// vmrglb IIC_VecPerm
+// vmrglh IIC_VecPerm
+// vmrglw IIC_VecPerm
+// vmsubfp IIC_VecFP
+// vmsummbm IIC_VecComplex
+// vmsumshm IIC_VecComplex
+// vmsumshs IIC_VecComplex
+// vmsumubm IIC_VecComplex
+// vmsumuhm IIC_VecComplex
+// vmsumuhs IIC_VecComplex
+// vmulesb IIC_VecComplex
+// vmulesh IIC_VecComplex
+// vmuleub IIC_VecComplex
+// vmuleuh IIC_VecComplex
+// vmulosb IIC_VecComplex
+// vmulosh IIC_VecComplex
+// vmuloub IIC_VecComplex
+// vmulouh IIC_VecComplex
+// vnor IIC_VecGeneral
+// vor IIC_VecGeneral
+// vperm IIC_VecPerm
+// vpkpx IIC_VecPerm
+// vpkshss IIC_VecPerm
+// vpkshus IIC_VecPerm
+// vpkswss IIC_VecPerm
+// vpkswus IIC_VecPerm
+// vpkuhum IIC_VecPerm
+// vpkuhus IIC_VecPerm
+// vpkuwum IIC_VecPerm
+// vpkuwus IIC_VecPerm
+// vrefp IIC_VecFPRound
+// vrfim IIC_VecFPRound
+// vrfin IIC_VecFPRound
+// vrfip IIC_VecFPRound
+// vrfiz IIC_VecFPRound
+// vrlb IIC_VecGeneral
+// vrlh IIC_VecGeneral
+// vrlw IIC_VecGeneral
+// vrsqrtefp IIC_VecFP
+// vsel IIC_VecGeneral
+// vsl IIC_VecVSL
+// vslb IIC_VecGeneral
+// vsldoi IIC_VecPerm
+// vslh IIC_VecGeneral
+// vslo IIC_VecPerm
+// vslw IIC_VecGeneral
+// vspltb IIC_VecPerm
+// vsplth IIC_VecPerm
+// vspltisb IIC_VecPerm
+// vspltish IIC_VecPerm
+// vspltisw IIC_VecPerm
+// vspltw IIC_VecPerm
+// vsr IIC_VecVSR
+// vsrab IIC_VecGeneral
+// vsrah IIC_VecGeneral
+// vsraw IIC_VecGeneral
+// vsrb IIC_VecGeneral
+// vsrh IIC_VecGeneral
+// vsro IIC_VecPerm
+// vsrw IIC_VecGeneral
+// vsubcuw IIC_VecGeneral
+// vsubfp IIC_VecFP
+// vsubsbs IIC_VecGeneral
+// vsubshs IIC_VecGeneral
+// vsubsws IIC_VecGeneral
+// vsububm IIC_VecGeneral
+// vsububs IIC_VecGeneral
+// vsubuhm IIC_VecGeneral
+// vsubuhs IIC_VecGeneral
+// vsubuwm IIC_VecGeneral
+// vsubuws IIC_VecGeneral
+// vsum2sws IIC_VecComplex
+// vsum4sbs IIC_VecComplex
+// vsum4shs IIC_VecComplex
+// vsum4ubs IIC_VecComplex
+// vsumsws IIC_VecComplex
+// vupkhpx IIC_VecPerm
+// vupkhsb IIC_VecPerm
+// vupkhsh IIC_VecPerm
+// vupklpx IIC_VecPerm
+// vupklsb IIC_VecPerm
+// vupklsh IIC_VecPerm
+// vxor IIC_VecGeneral
+// xor IIC_IntSimple
+// xori IIC_IntSimple
+// xoris IIC_IntSimple
//
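The comment table above records the itinerary class each PowerPC opcode is expected to use after the IIC_ renaming; per-processor schedules key their InstrItinData entries on these class names. A throwaway C++ sketch of such a mnemonic-to-class lookup, using a handful of entries copied from the table (illustrative data only):

#include <cstdio>
#include <map>
#include <string>

int main() {
  // A few opcode -> itinerary-class pairs taken from the table above.
  const std::map<std::string, std::string> itinClass = {
      {"add", "IIC_IntSimple"},
      {"divd", "IIC_IntDivD"},
      {"lwzux", "IIC_LdStLoadUpdX"},
      {"fsqrt", "IIC_FPSqrtD"},
      {"vperm", "IIC_VecPerm"}};
  for (const auto &entry : itinClass)
    std::printf("%-8s -> %s\n", entry.first.c_str(), entry.second.c_str());
  return 0;
}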
diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td
index 37b6eac..218fed2 100644
--- a/lib/Target/PowerPC/PPCSchedule440.td
+++ b/lib/Target/PowerPC/PPCSchedule440.td
@@ -26,43 +26,39 @@
//===----------------------------------------------------------------------===//
// Functional units on the PowerPC 440/450 chip sets
//
-def IFTH1 : FuncUnit; // Fetch unit 1
-def IFTH2 : FuncUnit; // Fetch unit 2
-def PDCD1 : FuncUnit; // Decode unit 1
-def PDCD2 : FuncUnit; // Decode unit 2
-def DISS1 : FuncUnit; // Issue unit 1
-def DISS2 : FuncUnit; // Issue unit 2
-def LRACC : FuncUnit; // Register access and dispatch for
- // the simple integer (J-pipe) and
- // load/store (L-pipe) pipelines
-def IRACC : FuncUnit; // Register access and dispatch for
- // the complex integer (I-pipe) pipeline
-def FRACC : FuncUnit; // Register access and dispatch for
- // the floating-point execution (F-pipe) pipeline
-def IEXE1 : FuncUnit; // Execution stage 1 for the I pipeline
-def IEXE2 : FuncUnit; // Execution stage 2 for the I pipeline
-def IWB : FuncUnit; // Write-back unit for the I pipeline
-def JEXE1 : FuncUnit; // Execution stage 1 for the J pipeline
-def JEXE2 : FuncUnit; // Execution stage 2 for the J pipeline
-def JWB : FuncUnit; // Write-back unit for the J pipeline
-def AGEN : FuncUnit; // Address generation for the L pipeline
-def CRD : FuncUnit; // D-cache access for the L pipeline
-def LWB : FuncUnit; // Write-back unit for the L pipeline
-def FEXE1 : FuncUnit; // Execution stage 1 for the F pipeline
-def FEXE2 : FuncUnit; // Execution stage 2 for the F pipeline
-def FEXE3 : FuncUnit; // Execution stage 3 for the F pipeline
-def FEXE4 : FuncUnit; // Execution stage 4 for the F pipeline
-def FEXE5 : FuncUnit; // Execution stage 5 for the F pipeline
-def FEXE6 : FuncUnit; // Execution stage 6 for the F pipeline
-def FWB : FuncUnit; // Write-back unit for the F pipeline
+def P440_DISS1 : FuncUnit; // Issue unit 1
+def P440_DISS2 : FuncUnit; // Issue unit 2
+def P440_LRACC : FuncUnit; // Register access and dispatch for
+ // the simple integer (J-pipe) and
+ // load/store (L-pipe) pipelines
+def P440_IRACC : FuncUnit; // Register access and dispatch for
+ // the complex integer (I-pipe) pipeline
+def P440_FRACC : FuncUnit; // Register access and dispatch for
+ // the floating-point execution (F-pipe) pipeline
+def P440_IEXE1 : FuncUnit; // Execution stage 1 for the I pipeline
+def P440_IEXE2 : FuncUnit; // Execution stage 2 for the I pipeline
+def P440_IWB : FuncUnit; // Write-back unit for the I pipeline
+def P440_JEXE1 : FuncUnit; // Execution stage 1 for the J pipeline
+def P440_JEXE2 : FuncUnit; // Execution stage 2 for the J pipeline
+def P440_JWB : FuncUnit; // Write-back unit for the J pipeline
+def P440_AGEN : FuncUnit; // Address generation for the L pipeline
+def P440_CRD : FuncUnit; // D-cache access for the L pipeline
+def P440_LWB : FuncUnit; // Write-back unit for the L pipeline
+def P440_FEXE1 : FuncUnit; // Execution stage 1 for the F pipeline
+def P440_FEXE2 : FuncUnit; // Execution stage 2 for the F pipeline
+def P440_FEXE3 : FuncUnit; // Execution stage 3 for the F pipeline
+def P440_FEXE4 : FuncUnit; // Execution stage 4 for the F pipeline
+def P440_FEXE5 : FuncUnit; // Execution stage 5 for the F pipeline
+def P440_FEXE6 : FuncUnit; // Execution stage 6 for the F pipeline
+def P440_FWB : FuncUnit; // Write-back unit for the F pipeline
-def LWARX_Hold : FuncUnit; // This is a pseudo-unit which is used
- // to make sure that no lwarx/stwcx.
- // instructions are issued while another
- // lwarx/stwcx. is in the L pipe.
+def P440_LWARX_Hold : FuncUnit; // This is a pseudo-unit which is used
+ // to make sure that no lwarx/stwcx.
+ // instructions are issued while another
+ // lwarx/stwcx. is in the L pipe.
-def GPR_Bypass : Bypass; // The bypass for general-purpose regs.
-def FPR_Bypass : Bypass; // The bypass for floating-point regs.
+def P440_GPR_Bypass : Bypass; // The bypass for general-purpose regs.
+def P440_FPR_Bypass : Bypass; // The bypass for floating-point regs.
// Notes:
// Instructions are held in the FRACC, LRACC and IRACC pipeline
@@ -104,560 +100,500 @@ def FPR_Bypass : Bypass; // The bypass for floating-point regs.
def PPC440Itineraries : ProcessorItineraries<
- [IFTH1, IFTH2, PDCD1, PDCD2, DISS1, DISS2, FRACC,
- IRACC, IEXE1, IEXE2, IWB, LRACC, JEXE1, JEXE2, JWB, AGEN, CRD, LWB,
- FEXE1, FEXE2, FEXE3, FEXE4, FEXE5, FEXE6, FWB, LWARX_Hold],
- [GPR_Bypass, FPR_Bypass], [
- InstrItinData<IntSimple , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC, LRACC]>,
- InstrStage<1, [IEXE1, JEXE1]>,
- InstrStage<1, [IEXE2, JEXE2]>,
- InstrStage<1, [IWB, JWB]>],
- [6, 4, 4],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntGeneral , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC, LRACC]>,
- InstrStage<1, [IEXE1, JEXE1]>,
- InstrStage<1, [IEXE2, JEXE2]>,
- InstrStage<1, [IWB, JWB]>],
- [6, 4, 4],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntCompare , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC, LRACC]>,
- InstrStage<1, [IEXE1, JEXE1]>,
- InstrStage<1, [IEXE2, JEXE2]>,
- InstrStage<1, [IWB, JWB]>],
- [6, 4, 4],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntDivW , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<33, [IWB]>],
- [40, 4, 4],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMFFS , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [7, 4, 4],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMTFSB0 , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [7, 4, 4],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMulHW , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [8, 4, 4],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMulHWU , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [8, 4, 4],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMulLI , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [8, 4, 4],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntRotate , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC, LRACC]>,
- InstrStage<1, [IEXE1, JEXE1]>,
- InstrStage<1, [IEXE2, JEXE2]>,
- InstrStage<1, [IWB, JWB]>],
- [6, 4, 4],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntShift , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC, LRACC]>,
- InstrStage<1, [IEXE1, JEXE1]>,
- InstrStage<1, [IEXE2, JEXE2]>,
- InstrStage<1, [IWB, JWB]>],
- [6, 4, 4],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntTrapW , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [6, 4],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<BrB , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [8, 4],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<BrCR , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [8, 4, 4],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<BrMCR , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [8, 4, 4],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<BrMCRX , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [8, 4, 4],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStDCBA , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStDCBF , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStDCBI , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStLoad , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<2, [LWB]>],
- [9, 5],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLoadUpd , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<2, [LWB]>],
- [9, 5],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStStore , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<2, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStStoreUpd, [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<2, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStICBI , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTFD , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5, 5],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStSTFDU , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5, 5],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLFD , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<2, [LWB]>],
- [9, 5, 5],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLFDU , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [9, 5, 5],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLHA , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStLHAU , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStLMW , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStLWARX , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1]>,
- InstrStage<1, [IRACC], 0>,
- InstrStage<4, [LWARX_Hold], 0>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTD , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<2, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTDU , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<2, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTDCX , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1]>,
- InstrStage<1, [IRACC], 0>,
- InstrStage<4, [LWARX_Hold], 0>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTWCX , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1]>,
- InstrStage<1, [IRACC], 0>,
- InstrStage<4, [LWARX_Hold], 0>,
- InstrStage<1, [LRACC]>,
- InstrStage<1, [AGEN]>,
- InstrStage<1, [CRD]>,
- InstrStage<1, [LWB]>],
- [8, 5],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSync , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [LRACC]>,
- InstrStage<3, [AGEN], 1>,
- InstrStage<2, [CRD], 1>,
- InstrStage<1, [LWB]>]>,
- InstrItinData<SprISYNC , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [FRACC], 0>,
- InstrStage<1, [LRACC], 0>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [FEXE1], 0>,
- InstrStage<1, [AGEN], 0>,
- InstrStage<1, [JEXE1], 0>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [FEXE2], 0>,
- InstrStage<1, [CRD], 0>,
- InstrStage<1, [JEXE2], 0>,
- InstrStage<1, [IEXE2]>,
- InstrStage<6, [FEXE3], 0>,
- InstrStage<6, [LWB], 0>,
- InstrStage<6, [JWB], 0>,
- InstrStage<6, [IWB]>]>,
- InstrItinData<SprMFSR , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [6, 4],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<SprMTMSR , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [6, 4],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<SprMTSR , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<3, [IWB]>],
- [9, 4],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprTLBSYNC , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>]>,
- InstrItinData<SprMFCR , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [8, 4],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprMFMSR , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [7, 4],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<SprMFSPR , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<3, [IWB]>],
- [10, 4],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprMFTB , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<3, [IWB]>],
- [10, 4],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprMTSPR , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<3, [IWB]>],
- [10, 4],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprMTSRIN , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<3, [IWB]>],
- [10, 4],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprRFI , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [8, 4],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprSC , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [IRACC]>,
- InstrStage<1, [IEXE1]>,
- InstrStage<1, [IEXE2]>,
- InstrStage<1, [IWB]>],
- [8, 4],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<FPGeneral , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [FRACC]>,
- InstrStage<1, [FEXE1]>,
- InstrStage<1, [FEXE2]>,
- InstrStage<1, [FEXE3]>,
- InstrStage<1, [FEXE4]>,
- InstrStage<1, [FEXE5]>,
- InstrStage<1, [FEXE6]>,
- InstrStage<1, [FWB]>],
- [10, 4, 4],
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPAddSub , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [FRACC]>,
- InstrStage<1, [FEXE1]>,
- InstrStage<1, [FEXE2]>,
- InstrStage<1, [FEXE3]>,
- InstrStage<1, [FEXE4]>,
- InstrStage<1, [FEXE5]>,
- InstrStage<1, [FEXE6]>,
- InstrStage<1, [FWB]>],
- [10, 4, 4],
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPCompare , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [FRACC]>,
- InstrStage<1, [FEXE1]>,
- InstrStage<1, [FEXE2]>,
- InstrStage<1, [FEXE3]>,
- InstrStage<1, [FEXE4]>,
- InstrStage<1, [FEXE5]>,
- InstrStage<1, [FEXE6]>,
- InstrStage<1, [FWB]>],
- [10, 4, 4],
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPDivD , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [FRACC]>,
- InstrStage<1, [FEXE1]>,
- InstrStage<1, [FEXE2]>,
- InstrStage<1, [FEXE3]>,
- InstrStage<1, [FEXE4]>,
- InstrStage<1, [FEXE5]>,
- InstrStage<1, [FEXE6]>,
- InstrStage<25, [FWB]>],
- [35, 4, 4],
- [NoBypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPDivS , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [FRACC]>,
- InstrStage<1, [FEXE1]>,
- InstrStage<1, [FEXE2]>,
- InstrStage<1, [FEXE3]>,
- InstrStage<1, [FEXE4]>,
- InstrStage<1, [FEXE5]>,
- InstrStage<1, [FEXE6]>,
- InstrStage<13, [FWB]>],
- [23, 4, 4],
- [NoBypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPFused , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [FRACC]>,
- InstrStage<1, [FEXE1]>,
- InstrStage<1, [FEXE2]>,
- InstrStage<1, [FEXE3]>,
- InstrStage<1, [FEXE4]>,
- InstrStage<1, [FEXE5]>,
- InstrStage<1, [FEXE6]>,
- InstrStage<1, [FWB]>],
- [10, 4, 4, 4],
- [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPRes , [InstrStage<1, [IFTH1, IFTH2]>,
- InstrStage<1, [PDCD1, PDCD2]>,
- InstrStage<1, [DISS1, DISS2]>,
- InstrStage<1, [FRACC]>,
- InstrStage<1, [FEXE1]>,
- InstrStage<1, [FEXE2]>,
- InstrStage<1, [FEXE3]>,
- InstrStage<1, [FEXE4]>,
- InstrStage<1, [FEXE5]>,
- InstrStage<1, [FEXE6]>,
- InstrStage<1, [FWB]>],
- [10, 4],
- [FPR_Bypass, FPR_Bypass]>
+ [P440_DISS1, P440_DISS2, P440_FRACC, P440_IRACC, P440_IEXE1, P440_IEXE2,
+ P440_IWB, P440_LRACC, P440_JEXE1, P440_JEXE2, P440_JWB, P440_AGEN, P440_CRD,
+ P440_LWB, P440_FEXE1, P440_FEXE2, P440_FEXE3, P440_FEXE4, P440_FEXE5,
+ P440_FEXE6, P440_FWB, P440_LWARX_Hold],
+ [P440_GPR_Bypass, P440_FPR_Bypass], [
+ InstrItinData<IIC_IntSimple, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC, P440_LRACC]>,
+ InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+ InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+ InstrStage<1, [P440_IWB, P440_JWB]>],
+ [2, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntGeneral, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC, P440_LRACC]>,
+ InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+ InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+ InstrStage<1, [P440_IWB, P440_JWB]>],
+ [2, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntCompare, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC, P440_LRACC]>,
+ InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+ InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+ InstrStage<1, [P440_IWB, P440_JWB]>],
+ [2, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntDivW, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<33, [P440_IWB]>],
+ [36, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntMFFS, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [3, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [3, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHW, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHWU, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulLI, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotate, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC, P440_LRACC]>,
+ InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+ InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+ InstrStage<1, [P440_IWB, P440_JWB]>],
+ [2, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntShift, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC, P440_LRACC]>,
+ InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+ InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+ InstrStage<1, [P440_IWB, P440_JWB]>],
+ [2, 0, 0],
+ [P440_GPR_Bypass,
+ P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_IntTrapW, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [2, 0],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_BrB, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_BrCR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_BrMCR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_BrMCRX, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0, 0],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBA, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBF, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBI, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoad, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [5, 1, 1],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoadUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [5, 2, 1, 1],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [5, 2, 1, 1],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStStore, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [1, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [2, 1, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStICBI, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFD, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [1, 1, 1],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [2, 1, 1, 1],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLFD, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [5, 1, 1],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLFDU, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [5, 2, 1, 1],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [5, 2, 1, 1],
+ [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHA, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAU, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLMW, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P440_DISS1]>,
+ InstrStage<1, [P440_IRACC], 0>,
+ InstrStage<4, [P440_LWARX_Hold], 0>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTD, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTDU, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [2, 1, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTDUX, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<2, [P440_LWB]>],
+ [2, 1, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTDCX, [InstrStage<1, [P440_DISS1]>,
+ InstrStage<1, [P440_IRACC], 0>,
+ InstrStage<4, [P440_LWARX_Hold], 0>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [P440_DISS1]>,
+ InstrStage<1, [P440_IRACC], 0>,
+ InstrStage<4, [P440_LWARX_Hold], 0>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<1, [P440_AGEN]>,
+ InstrStage<1, [P440_CRD]>,
+ InstrStage<1, [P440_LWB]>],
+ [4, 1, 1],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSync, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_LRACC]>,
+ InstrStage<3, [P440_AGEN], 1>,
+ InstrStage<2, [P440_CRD], 1>,
+ InstrStage<1, [P440_LWB]>]>,
+ InstrItinData<IIC_SprISYNC, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC], 0>,
+ InstrStage<1, [P440_LRACC], 0>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_FEXE1], 0>,
+ InstrStage<1, [P440_AGEN], 0>,
+ InstrStage<1, [P440_JEXE1], 0>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_FEXE2], 0>,
+ InstrStage<1, [P440_CRD], 0>,
+ InstrStage<1, [P440_JEXE2], 0>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<6, [P440_FEXE3], 0>,
+ InstrStage<6, [P440_LWB], 0>,
+ InstrStage<6, [P440_JWB], 0>,
+ InstrStage<6, [P440_IWB]>]>,
+ InstrItinData<IIC_SprMFSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [2, 0],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTMSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [2, 0],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<3, [P440_IWB]>],
+ [5, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>]>,
+ InstrItinData<IIC_SprMFCR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFMSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [3, 0],
+ [P440_GPR_Bypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFSPR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<3, [P440_IWB]>],
+ [6, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFTB, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<3, [P440_IWB]>],
+ [6, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSPR, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<3, [P440_IWB]>],
+ [6, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<3, [P440_IWB]>],
+ [6, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprRFI, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_SprSC, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_IRACC]>,
+ InstrStage<1, [P440_IEXE1]>,
+ InstrStage<1, [P440_IEXE2]>,
+ InstrStage<1, [P440_IWB]>],
+ [4, 0],
+ [NoBypass, P440_GPR_Bypass]>,
+ InstrItinData<IIC_FPGeneral, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<1, [P440_FWB]>],
+ [6, 0, 0],
+ [P440_FPR_Bypass,
+ P440_FPR_Bypass, P440_FPR_Bypass]>,
+ InstrItinData<IIC_FPAddSub, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<1, [P440_FWB]>],
+ [6, 0, 0],
+ [P440_FPR_Bypass,
+ P440_FPR_Bypass, P440_FPR_Bypass]>,
+ InstrItinData<IIC_FPCompare, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<1, [P440_FWB]>],
+ [6, 0, 0],
+ [P440_FPR_Bypass, P440_FPR_Bypass,
+ P440_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivD, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<25, [P440_FWB]>],
+ [31, 0, 0],
+ [NoBypass, P440_FPR_Bypass, P440_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivS, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<13, [P440_FWB]>],
+ [19, 0, 0],
+ [NoBypass, P440_FPR_Bypass, P440_FPR_Bypass]>,
+ InstrItinData<IIC_FPFused, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<1, [P440_FWB]>],
+ [6, 0, 0, 0],
+ [P440_FPR_Bypass,
+ P440_FPR_Bypass, P440_FPR_Bypass,
+ P440_FPR_Bypass]>,
+ InstrItinData<IIC_FPRes, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+ InstrStage<1, [P440_FRACC]>,
+ InstrStage<1, [P440_FEXE1]>,
+ InstrStage<1, [P440_FEXE2]>,
+ InstrStage<1, [P440_FEXE3]>,
+ InstrStage<1, [P440_FEXE4]>,
+ InstrStage<1, [P440_FEXE5]>,
+ InstrStage<1, [P440_FEXE6]>,
+ InstrStage<1, [P440_FWB]>],
+ [6, 0],
+ [P440_FPR_Bypass, P440_FPR_Bypass]>
]>;
+
+// ===---------------------------------------------------------------------===//
+// PPC440 machine model for scheduling and other instruction cost heuristics.
+
+def PPC440Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 instructions are dispatched per cycle.
+ let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+ let LoadLatency = 5; // Optimistic load latency assuming bypass.
+                       // This is overridden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+ let Itineraries = PPC440Itineraries;
+}
+
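For context, PPC440Model only takes effect once a CPU definition selects it. A minimal sketch of the hookup in PPC.td, assuming the "440" entry is switched from Processor to ProcessorModel; the subtarget feature list below is an assumption for illustration and is not part of this hunk:

    // Sketch only: bind the new machine model to the "440" CPU name.
    // ProcessorModel wires a SchedMachineModel into the subtarget;
    // the feature list here is assumed for the example.
    def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL]>;
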
diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td
index 1612cd2..1447696 100644
--- a/lib/Target/PowerPC/PPCScheduleA2.td
+++ b/lib/Target/PowerPC/PPCScheduleA2.td
@@ -14,8 +14,8 @@
//===----------------------------------------------------------------------===//
// Functional units on the PowerPC A2 chip sets
//
-def XU : FuncUnit; // XU pipeline
-def FU : FuncUnit; // FI pipeline
+def A2_XU : FuncUnit; // A2_XU pipeline
+def A2_FU : FuncUnit; // FI pipeline
//
// This file defines the itinerary class data for the PPC A2 processor.
@@ -24,126 +24,140 @@ def FU : FuncUnit; // FI pipeline
def PPCA2Itineraries : ProcessorItineraries<
- [XU, FU], [], [
- InstrItinData<IntSimple , [InstrStage<1, [XU]>],
- [1, 1, 1]>,
- InstrItinData<IntGeneral , [InstrStage<1, [XU]>],
- [2, 1, 1]>,
- InstrItinData<IntCompare , [InstrStage<1, [XU]>],
- [2, 1, 1]>,
- InstrItinData<IntDivW , [InstrStage<1, [XU]>],
- [39, 1, 1]>,
- InstrItinData<IntDivD , [InstrStage<1, [XU]>],
- [71, 1, 1]>,
- InstrItinData<IntMulHW , [InstrStage<1, [XU]>],
- [5, 1, 1]>,
- InstrItinData<IntMulHWU , [InstrStage<1, [XU]>],
- [5, 1, 1]>,
- InstrItinData<IntMulLI , [InstrStage<1, [XU]>],
- [6, 1, 1]>,
- InstrItinData<IntRotate , [InstrStage<1, [XU]>],
- [2, 1, 1]>,
- InstrItinData<IntRotateD , [InstrStage<1, [XU]>],
- [2, 1, 1]>,
- InstrItinData<IntRotateDI , [InstrStage<1, [XU]>],
- [2, 1, 1]>,
- InstrItinData<IntShift , [InstrStage<1, [XU]>],
- [2, 1, 1]>,
- InstrItinData<IntTrapW , [InstrStage<1, [XU]>],
- [2, 1]>,
- InstrItinData<IntTrapD , [InstrStage<1, [XU]>],
- [2, 1]>,
- InstrItinData<BrB , [InstrStage<1, [XU]>],
- [6, 1, 1]>,
- InstrItinData<BrCR , [InstrStage<1, [XU]>],
- [1, 1, 1]>,
- InstrItinData<BrMCR , [InstrStage<1, [XU]>],
- [5, 1, 1]>,
- InstrItinData<BrMCRX , [InstrStage<1, [XU]>],
- [1, 1, 1]>,
- InstrItinData<LdStDCBA , [InstrStage<1, [XU]>],
- [1, 1, 1]>,
- InstrItinData<LdStDCBF , [InstrStage<1, [XU]>],
- [1, 1, 1]>,
- InstrItinData<LdStDCBI , [InstrStage<1, [XU]>],
- [1, 1, 1]>,
- InstrItinData<LdStLoad , [InstrStage<1, [XU]>],
- [6, 1, 1]>,
- InstrItinData<LdStLoadUpd , [InstrStage<1, [XU]>],
- [6, 8, 1, 1]>,
- InstrItinData<LdStLDU , [InstrStage<1, [XU]>],
- [6, 1, 1]>,
- InstrItinData<LdStStore , [InstrStage<1, [XU]>],
- [1, 1, 1]>,
- InstrItinData<LdStStoreUpd, [InstrStage<1, [XU]>],
- [2, 1, 1, 1]>,
- InstrItinData<LdStICBI, [InstrStage<1, [XU]>],
- [16, 1, 1]>,
- InstrItinData<LdStSTFD , [InstrStage<1, [XU]>],
- [1, 1, 1]>,
- InstrItinData<LdStSTFDU , [InstrStage<1, [XU]>],
- [2, 1, 1, 1]>,
- InstrItinData<LdStLFD , [InstrStage<1, [XU]>],
- [7, 1, 1]>,
- InstrItinData<LdStLFDU , [InstrStage<1, [XU]>],
- [7, 9, 1, 1]>,
- InstrItinData<LdStLHA , [InstrStage<1, [XU]>],
- [6, 1, 1]>,
- InstrItinData<LdStLHAU , [InstrStage<1, [XU]>],
- [6, 8, 1, 1]>,
- InstrItinData<LdStLWARX , [InstrStage<1, [XU]>],
- [82, 1, 1]>, // L2 latency
- InstrItinData<LdStSTD , [InstrStage<1, [XU]>],
- [1, 1, 1]>,
- InstrItinData<LdStSTDU , [InstrStage<1, [XU]>],
- [2, 1, 1, 1]>,
- InstrItinData<LdStSTDCX , [InstrStage<1, [XU]>],
- [82, 1, 1]>, // L2 latency
- InstrItinData<LdStSTWCX , [InstrStage<1, [XU]>],
- [82, 1, 1]>, // L2 latency
- InstrItinData<LdStSync , [InstrStage<1, [XU]>],
- [6]>,
- InstrItinData<SprISYNC , [InstrStage<1, [XU]>],
- [16]>,
- InstrItinData<SprMTMSR , [InstrStage<1, [XU]>],
- [16, 1]>,
- InstrItinData<SprMFCR , [InstrStage<1, [XU]>],
- [6, 1]>,
- InstrItinData<SprMFMSR , [InstrStage<1, [XU]>],
- [4, 1]>,
- InstrItinData<SprMFSPR , [InstrStage<1, [XU]>],
- [6, 1]>,
- InstrItinData<SprMFTB , [InstrStage<1, [XU]>],
- [4, 1]>,
- InstrItinData<SprMTSPR , [InstrStage<1, [XU]>],
- [6, 1]>,
- InstrItinData<SprRFI , [InstrStage<1, [XU]>],
- [16]>,
- InstrItinData<SprSC , [InstrStage<1, [XU]>],
- [16]>,
- InstrItinData<FPGeneral , [InstrStage<1, [FU]>],
- [6, 1, 1]>,
- InstrItinData<FPAddSub , [InstrStage<1, [FU]>],
- [6, 1, 1]>,
- InstrItinData<FPCompare , [InstrStage<1, [FU]>],
- [5, 1, 1]>,
- InstrItinData<FPDivD , [InstrStage<1, [FU]>],
- [72, 1, 1]>,
- InstrItinData<FPDivS , [InstrStage<1, [FU]>],
- [59, 1, 1]>,
- InstrItinData<FPSqrt , [InstrStage<1, [FU]>],
- [69, 1, 1]>,
- InstrItinData<FPFused , [InstrStage<1, [FU]>],
- [6, 1, 1, 1]>,
- InstrItinData<FPRes , [InstrStage<1, [FU]>],
- [6, 1]>
+ [A2_XU, A2_FU], [], [
+ InstrItinData<IIC_IntSimple, [InstrStage<1, [A2_XU]>],
+ [1, 0, 0]>,
+ InstrItinData<IIC_IntGeneral, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0]>,
+ InstrItinData<IIC_IntCompare, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0]>,
+ InstrItinData<IIC_IntDivW, [InstrStage<1, [A2_XU]>],
+ [39, 0, 0]>,
+ InstrItinData<IIC_IntDivD, [InstrStage<1, [A2_XU]>],
+ [71, 0, 0]>,
+ InstrItinData<IIC_IntMulHW, [InstrStage<1, [A2_XU]>],
+ [5, 0, 0]>,
+ InstrItinData<IIC_IntMulHWU, [InstrStage<1, [A2_XU]>],
+ [5, 0, 0]>,
+ InstrItinData<IIC_IntMulLI, [InstrStage<1, [A2_XU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_IntRotate, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0]>,
+ InstrItinData<IIC_IntRotateD, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0]>,
+ InstrItinData<IIC_IntRotateDI, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0]>,
+ InstrItinData<IIC_IntShift, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0]>,
+ InstrItinData<IIC_IntTrapW, [InstrStage<1, [A2_XU]>],
+ [2, 0]>,
+ InstrItinData<IIC_IntTrapD, [InstrStage<1, [A2_XU]>],
+ [2, 0]>,
+ InstrItinData<IIC_BrB, [InstrStage<1, [A2_XU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_BrCR, [InstrStage<1, [A2_XU]>],
+ [1, 0, 0]>,
+ InstrItinData<IIC_BrMCR, [InstrStage<1, [A2_XU]>],
+ [5, 0, 0]>,
+ InstrItinData<IIC_BrMCRX, [InstrStage<1, [A2_XU]>],
+ [1, 0, 0]>,
+ InstrItinData<IIC_LdStDCBA, [InstrStage<1, [A2_XU]>],
+ [1, 0, 0]>,
+ InstrItinData<IIC_LdStDCBF, [InstrStage<1, [A2_XU]>],
+ [1, 0, 0]>,
+ InstrItinData<IIC_LdStDCBI, [InstrStage<1, [A2_XU]>],
+ [1, 0, 0]>,
+ InstrItinData<IIC_LdStLoad, [InstrStage<1, [A2_XU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [A2_XU]>],
+ [6, 8, 0, 0]>,
+ InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [A2_XU]>],
+ [6, 8, 0, 0]>,
+ InstrItinData<IIC_LdStLDU, [InstrStage<1, [A2_XU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_LdStLDUX, [InstrStage<1, [A2_XU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_LdStStore, [InstrStage<1, [A2_XU]>],
+ [0, 0, 0]>,
+ InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [A2_XU]>],
+ [2, 0, 0, 0]>,
+ InstrItinData<IIC_LdStICBI, [InstrStage<1, [A2_XU]>],
+ [16, 0, 0]>,
+ InstrItinData<IIC_LdStSTFD, [InstrStage<1, [A2_XU]>],
+ [0, 0, 0]>,
+ InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0, 0]>,
+ InstrItinData<IIC_LdStLFD, [InstrStage<1, [A2_XU]>],
+ [7, 0, 0]>,
+ InstrItinData<IIC_LdStLFDU, [InstrStage<1, [A2_XU]>],
+ [7, 9, 0, 0]>,
+ InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [A2_XU]>],
+ [7, 9, 0, 0]>,
+ InstrItinData<IIC_LdStLHA, [InstrStage<1, [A2_XU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_LdStLHAU, [InstrStage<1, [A2_XU]>],
+ [6, 8, 0, 0]>,
+ InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [A2_XU]>],
+ [6, 8, 0, 0]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [A2_XU]>],
+ [82, 0, 0]>, // L2 latency
+ InstrItinData<IIC_LdStSTD, [InstrStage<1, [A2_XU]>],
+ [0, 0, 0]>,
+ InstrItinData<IIC_LdStSTDU, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0, 0]>,
+ InstrItinData<IIC_LdStSTDUX, [InstrStage<1, [A2_XU]>],
+ [2, 0, 0, 0]>,
+ InstrItinData<IIC_LdStSTDCX, [InstrStage<1, [A2_XU]>],
+ [82, 0, 0]>, // L2 latency
+ InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [A2_XU]>],
+ [82, 0, 0]>, // L2 latency
+ InstrItinData<IIC_LdStSync, [InstrStage<1, [A2_XU]>],
+ [6]>,
+ InstrItinData<IIC_SprISYNC, [InstrStage<1, [A2_XU]>],
+ [16]>,
+ InstrItinData<IIC_SprMTMSR, [InstrStage<1, [A2_XU]>],
+ [16, 0]>,
+ InstrItinData<IIC_SprMFCR, [InstrStage<1, [A2_XU]>],
+ [6, 0]>,
+ InstrItinData<IIC_SprMFCRF, [InstrStage<1, [A2_XU]>],
+ [1, 0]>,
+ InstrItinData<IIC_SprMFMSR, [InstrStage<1, [A2_XU]>],
+ [4, 0]>,
+ InstrItinData<IIC_SprMFSPR, [InstrStage<1, [A2_XU]>],
+ [6, 0]>,
+ InstrItinData<IIC_SprMFTB, [InstrStage<1, [A2_XU]>],
+ [4, 0]>,
+ InstrItinData<IIC_SprMTSPR, [InstrStage<1, [A2_XU]>],
+ [6, 0]>,
+ InstrItinData<IIC_SprRFI, [InstrStage<1, [A2_XU]>],
+ [16]>,
+ InstrItinData<IIC_SprSC, [InstrStage<1, [A2_XU]>],
+ [16]>,
+ InstrItinData<IIC_FPGeneral, [InstrStage<1, [A2_FU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_FPAddSub, [InstrStage<1, [A2_FU]>],
+ [6, 0, 0]>,
+ InstrItinData<IIC_FPCompare, [InstrStage<1, [A2_FU]>],
+ [5, 0, 0]>,
+ InstrItinData<IIC_FPDivD, [InstrStage<1, [A2_FU]>],
+ [72, 0, 0]>,
+ InstrItinData<IIC_FPDivS, [InstrStage<1, [A2_FU]>],
+ [59, 0, 0]>,
+ InstrItinData<IIC_FPSqrtD, [InstrStage<1, [A2_FU]>],
+ [69, 0, 0]>,
+ InstrItinData<IIC_FPSqrtS, [InstrStage<1, [A2_FU]>],
+ [65, 0, 0]>,
+ InstrItinData<IIC_FPFused, [InstrStage<1, [A2_FU]>],
+ [6, 0, 0, 0]>,
+ InstrItinData<IIC_FPRes, [InstrStage<1, [A2_FU]>],
+ [6, 0]>
]>;
// ===---------------------------------------------------------------------===//
// A2 machine model for scheduling and other instruction cost heuristics.
def PPCA2Model : SchedMachineModel {
- let IssueWidth = 1; // 2 micro-ops are dispatched per cycle.
+ let IssueWidth = 1; // 1 instruction is dispatched per cycle.
let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
let LoadLatency = 6; // Optimistic load latency assuming bypass.
                       // This is overridden by OperandCycles if the
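
Because MinLatency = -1 tells the scheduler to take latencies from the OperandCycles lists, the bracketed numbers in each InstrItinData entry can be read off directly. A rough gloss of one A2 entry from the hunk above (annotation only, not new code; the exact def-to-use adjustment is left to the scheduler):

    // Gloss of an existing entry: the first number is the cycle at which the
    // defined result becomes available, the rest are the cycles at which the
    // source operands are read.
    InstrItinData<IIC_IntDivW, [InstrStage<1, [A2_XU]>], // one pass through the XU pipe
                  [39, 0, 0]>,                           // result ready at cycle 39; both
                                                         // sources read at cycle 0, so a
                                                         // dependent instruction sees roughly
                                                         // 39 cycles of latency
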
diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td
index c189b9e..dab89e3 100644
--- a/lib/Target/PowerPC/PPCScheduleE500mc.td
+++ b/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -19,238 +19,285 @@
// * Decode & Dispatch
// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
-def DIS0 : FuncUnit; // Dispatch stage - insn 1
-def DIS1 : FuncUnit; // Dispatch stage - insn 2
+def E500_DIS0 : FuncUnit; // Dispatch stage - insn 1
+def E500_DIS1 : FuncUnit; // Dispatch stage - insn 2
// * Execute
// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
// Some instructions can only execute in SFX0 but not SFX1.
// The CFX has a bypass path, allowing non-divide instructions to execute
// while a divide instruction is executed.
-def SFX0 : FuncUnit; // Simple unit 0
-def SFX1 : FuncUnit; // Simple unit 1
-def BU : FuncUnit; // Branch unit
-def CFX_DivBypass
- : FuncUnit; // CFX divide bypass path
-def CFX_0 : FuncUnit; // CFX pipeline
-def LSU_0 : FuncUnit; // LSU pipeline
-def FPU_0 : FuncUnit; // FPU pipeline
+def E500_SFX0 : FuncUnit; // Simple unit 0
+def E500_SFX1 : FuncUnit; // Simple unit 1
+def E500_BU : FuncUnit; // Branch unit
+def E500_CFX_DivBypass
+ : FuncUnit; // CFX divide bypass path
+def E500_CFX_0 : FuncUnit; // CFX pipeline
+def E500_LSU_0 : FuncUnit; // LSU pipeline
+def E500_FPU_0 : FuncUnit; // FPU pipeline
-def CR_Bypass : Bypass;
+def E500_GPR_Bypass : Bypass;
+def E500_FPR_Bypass : Bypass;
+def E500_CR_Bypass : Bypass;
def PPCE500mcItineraries : ProcessorItineraries<
- [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, LSU_0, FPU_0],
- [CR_Bypass, GPR_Bypass, FPR_Bypass], [
- InstrItinData<IntSimple , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [4, 1, 1], // Latency = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [4, 1, 1], // Latency = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntCompare , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [5, 1, 1], // Latency = 1 or 2
- [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntDivW , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [CFX_0], 0>,
- InstrStage<14, [CFX_DivBypass]>],
- [17, 1, 1], // Latency=4..35, Repeat= 4..35
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMFFS , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<8, [FPU_0]>],
- [11], // Latency = 8
- [FPR_Bypass]>,
- InstrItinData<IntMTFSB0 , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<8, [FPU_0]>],
- [11, 1, 1], // Latency = 8
- [NoBypass, NoBypass, NoBypass]>,
- InstrItinData<IntMulHW , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [CFX_0]>],
- [7, 1, 1], // Latency = 4, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMulHWU , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [CFX_0]>],
- [7, 1, 1], // Latency = 4, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMulLI , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [CFX_0]>],
- [7, 1, 1], // Latency = 4, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntRotate , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [4, 1, 1], // Latency = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntShift , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [4, 1, 1], // Latency = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntTrapW , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<2, [SFX0]>],
- [5, 1], // Latency = 2, Repeat rate = 2
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<BrB , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [BU]>],
- [4, 1], // Latency = 1
- [NoBypass, GPR_Bypass]>,
- InstrItinData<BrCR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [BU]>],
- [4, 1, 1], // Latency = 1
- [CR_Bypass, CR_Bypass, CR_Bypass]>,
- InstrItinData<BrMCR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [BU]>],
- [4, 1], // Latency = 1
- [CR_Bypass, CR_Bypass]>,
- InstrItinData<BrMCRX , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [4, 1, 1], // Latency = 1
- [CR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStDCBA , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1], // Latency = 3, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStDCBF , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1], // Latency = 3
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStDCBI , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1], // Latency = 3
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLoad , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1], // Latency = 3
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLoadUpd , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1], // Latency = 3
- [GPR_Bypass, GPR_Bypass],
- 2>, // 2 micro-ops
- InstrItinData<LdStStore , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1], // Latency = 3
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStStoreUpd, [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1], // Latency = 3
- [NoBypass, GPR_Bypass],
- 2>, // 2 micro-ops
- InstrItinData<LdStICBI , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1], // Latency = 3
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTFD , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1, 1], // Latency = 3
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStSTFDU , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1, 1], // Latency = 3
- [GPR_Bypass, GPR_Bypass, GPR_Bypass],
- 2>, // 2 micro-ops
- InstrItinData<LdStLFD , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 1, 1], // Latency = 4
- [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLFDU , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 1, 1], // Latency = 4
- [FPR_Bypass, GPR_Bypass, GPR_Bypass],
- 2>, // 2 micro-ops
- InstrItinData<LdStLHA , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1], // Latency = 3
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLHAU , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1], // Latency = 3
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLMW , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 1], // Latency = r+3
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStLWARX , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<3, [LSU_0]>],
- [6, 1, 1], // Latency = 3, Repeat rate = 3
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStSTWCX , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [6, 1], // Latency = 3
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSync , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>]>,
- InstrItinData<SprMFSR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<4, [SFX0]>],
- [7, 1],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<SprMTMSR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<2, [SFX0, SFX1]>],
- [5, 1], // Latency = 2, Repeat rate = 4
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<SprMTSR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0]>],
- [5, 1],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprTLBSYNC , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0], 0>]>,
- InstrItinData<SprMFCR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<5, [SFX0]>],
- [8, 1],
- [GPR_Bypass, CR_Bypass]>,
- InstrItinData<SprMFMSR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<4, [SFX0]>],
- [7, 1], // Latency = 4, Repeat rate = 4
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<SprMFSPR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [4, 1], // Latency = 1, Repeat rate = 1
- [GPR_Bypass, CR_Bypass]>,
- InstrItinData<SprMFTB , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<4, [SFX0]>],
- [7, 1], // Latency = 4, Repeat rate = 4
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprMTSPR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [4, 1], // Latency = 1, Repeat rate = 1
- [CR_Bypass, GPR_Bypass]>,
- InstrItinData<SprMTSRIN , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0]>],
- [4, 1],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<FPGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<2, [FPU_0]>],
- [11, 1, 1], // Latency = 8, Repeat rate = 2
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPAddSub , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<4, [FPU_0]>],
- [13, 1, 1], // Latency = 10, Repeat rate = 4
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPCompare , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<2, [FPU_0]>],
- [11, 1, 1], // Latency = 8, Repeat rate = 2
- [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPDivD , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<68, [FPU_0]>],
- [71, 1, 1], // Latency = 68, Repeat rate = 68
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPDivS , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<38, [FPU_0]>],
- [41, 1, 1], // Latency = 38, Repeat rate = 38
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPFused , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<4, [FPU_0]>],
- [13, 1, 1, 1], // Latency = 10, Repeat rate = 4
- [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPRes , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<38, [FPU_0]>],
- [41, 1], // Latency = 38, Repeat rate = 38
- [FPR_Bypass, FPR_Bypass]>
+ [E500_DIS0, E500_DIS1, E500_SFX0, E500_SFX1, E500_BU, E500_CFX_DivBypass,
+ E500_CFX_0, E500_LSU_0, E500_FPU_0],
+ [E500_CR_Bypass, E500_GPR_Bypass, E500_FPR_Bypass], [
+ InstrItinData<IIC_IntSimple, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [5, 1, 1], // Latency = 1 or 2
+ [E500_CR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntDivW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_CFX_0], 0>,
+ InstrStage<14, [E500_CFX_DivBypass]>],
+ [17, 1, 1], // Latency=4..35, Repeat= 4..35
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMFFS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<8, [E500_FPU_0]>],
+ [11], // Latency = 8
+ [E500_FPR_Bypass]>,
+ InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<8, [E500_FPU_0]>],
+ [11, 1, 1], // Latency = 8
+ [NoBypass, NoBypass, NoBypass]>,
+ InstrItinData<IIC_IntMulHW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_CFX_0]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_CFX_0]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulLI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_CFX_0]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotate, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntShift, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_IntTrapW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<2, [E500_SFX0]>],
+ [5, 1], // Latency = 2, Repeat rate = 2
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_BrB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_BU]>],
+ [4, 1], // Latency = 1
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_BrCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_BU]>],
+ [4, 1, 1], // Latency = 1
+ [E500_CR_Bypass,
+ E500_CR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_BrMCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_BU]>],
+ [4, 1], // Latency = 1
+ [E500_CR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_BrMCRX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [E500_CR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3, Repeat rate = 1
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoad, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStStore, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStICBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1, 1], // Latency = 3
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1, 1], // Latency = 3
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLFD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [7, 1, 1], // Latency = 4
+ [E500_FPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLFDU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [7, 1, 1], // Latency = 4
+ [E500_FPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [7, 1, 1], // Latency = 4
+ [E500_FPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLHA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLMW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [7, 1], // Latency = r+3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<3, [E500_LSU_0]>],
+ [6, 1, 1], // Latency = 3, Repeat rate = 3
+ [E500_GPR_Bypass,
+ E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSync, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0]>]>,
+ InstrItinData<IIC_SprMFSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SFX0]>],
+ [7, 1],
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<2, [E500_SFX0, E500_SFX1]>],
+ [5, 1], // Latency = 2, Repeat rate = 4
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0]>],
+ [5, 1],
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_LSU_0], 0>]>,
+ InstrItinData<IIC_SprMFCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<5, [E500_SFX0]>],
+ [8, 1],
+ [E500_GPR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<5, [E500_SFX0]>],
+ [8, 1],
+ [E500_GPR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SFX0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [E500_GPR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [E500_GPR_Bypass, E500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFTB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_SFX0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0, E500_SFX1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [E500_CR_Bypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<1, [E500_SFX0]>],
+ [4, 1],
+ [NoBypass, E500_GPR_Bypass]>,
+ InstrItinData<IIC_FPGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<2, [E500_FPU_0]>],
+ [11, 1, 1], // Latency = 8, Repeat rate = 2
+ [E500_FPR_Bypass,
+ E500_FPR_Bypass, E500_FPR_Bypass]>,
+ InstrItinData<IIC_FPAddSub, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_FPU_0]>],
+ [13, 1, 1], // Latency = 10, Repeat rate = 4
+ [E500_FPR_Bypass,
+ E500_FPR_Bypass, E500_FPR_Bypass]>,
+ InstrItinData<IIC_FPCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<2, [E500_FPU_0]>],
+ [11, 1, 1], // Latency = 8, Repeat rate = 2
+ [E500_CR_Bypass,
+ E500_FPR_Bypass, E500_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<68, [E500_FPU_0]>],
+ [71, 1, 1], // Latency = 68, Repeat rate = 68
+ [E500_FPR_Bypass,
+ E500_FPR_Bypass, E500_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<38, [E500_FPU_0]>],
+ [41, 1, 1], // Latency = 38, Repeat rate = 38
+ [E500_FPR_Bypass,
+ E500_FPR_Bypass, E500_FPR_Bypass]>,
+ InstrItinData<IIC_FPFused, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<4, [E500_FPU_0]>],
+ [13, 1, 1, 1], // Latency = 10, Repeat rate = 4
+ [E500_FPR_Bypass,
+ E500_FPR_Bypass, E500_FPR_Bypass,
+ E500_FPR_Bypass]>,
+ InstrItinData<IIC_FPRes, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+ InstrStage<38, [E500_FPU_0]>],
+ [41, 1], // Latency = 38, Repeat rate = 38
+ [E500_FPR_Bypass, E500_FPR_Bypass]>
]>;
// ===---------------------------------------------------------------------===//
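The e500mc entries above all follow the same long-form InstrItinData shape: an itinerary class, a list of InstrStage records, a list of operand read/write cycles, a list of bypasses (one per operand), and an optional micro-op count (the class definitions live in LLVM's TargetItinerary.td). As a reading aid, the IIC_LdStLFDU entry is restated below with annotations; the comments are explanatory only and are not part of the patch.

InstrItinData<IIC_LdStLFDU,                      // itinerary class this entry describes
  [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,     // 1 cycle in either dispatch slot; the trailing
                                                 // 0 lets the next stage start in the same cycle
   InstrStage<1, [E500_SFX0, E500_SFX1], 0>,     // base-register update on a simple unit
   InstrStage<1, [E500_LSU_0]>],                 // the load itself on the load/store unit
  [7, 1, 1],                                     // operand cycles: FPR result defined at cycle 7,
                                                 // the two GPR sources read at cycle 1
  [E500_FPR_Bypass,
   E500_GPR_Bypass, E500_GPR_Bypass],            // one bypass network per operand, in the same
                                                 // order as the operand cycles
  2>,                                            // the instruction expands to 2 micro-ops

On its own this record is only a fragment; in the patch it sits, exactly as shown, inside the PPCE500mcItineraries list.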
diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td
index 7a24d20..de097d9 100644
--- a/lib/Target/PowerPC/PPCScheduleE5500.td
+++ b/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -20,280 +20,344 @@
// * Decode & Dispatch
// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
-// def DIS0 : FuncUnit;
-// def DIS1 : FuncUnit;
+def E5500_DIS0 : FuncUnit;
+def E5500_DIS1 : FuncUnit;
// * Execute
// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
// The CFX has a bypass path, allowing non-divide instructions to execute
// while a divide instruction is being executed.
-// def SFX0 : FuncUnit; // Simple unit 0
-// def SFX1 : FuncUnit; // Simple unit 1
-// def BU : FuncUnit; // Branch unit
-// def CFX_DivBypass
-// : FuncUnit; // CFX divide bypass path
-// def CFX_0 : FuncUnit; // CFX pipeline stage 0
+def E5500_SFX0 : FuncUnit; // Simple unit 0
+def E5500_SFX1 : FuncUnit; // Simple unit 1
+def E5500_BU : FuncUnit; // Branch unit
+def E5500_CFX_DivBypass
+ : FuncUnit; // CFX divide bypass path
+def E5500_CFX_0 : FuncUnit; // CFX pipeline stage 0
-def CFX_1 : FuncUnit; // CFX pipeline stage 1
+def E5500_CFX_1 : FuncUnit; // CFX pipeline stage 1
-// def LSU_0 : FuncUnit; // LSU pipeline
-// def FPU_0 : FuncUnit; // FPU pipeline
+def E5500_LSU_0 : FuncUnit; // LSU pipeline
+def E5500_FPU_0 : FuncUnit; // FPU pipeline
-// def CR_Bypass : Bypass;
+def E5500_GPR_Bypass : Bypass;
+def E5500_FPR_Bypass : Bypass;
+def E5500_CR_Bypass : Bypass;
def PPCE5500Itineraries : ProcessorItineraries<
- [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, CFX_1,
- LSU_0, FPU_0],
- [CR_Bypass, GPR_Bypass, FPR_Bypass], [
- InstrItinData<IntSimple , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [5, 2, 2], // Latency = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [5, 2, 2], // Latency = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntCompare , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [6, 2, 2], // Latency = 1 or 2
- [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntDivD , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [CFX_0], 0>,
- InstrStage<26, [CFX_DivBypass]>],
- [30, 2, 2], // Latency= 4..26, Repeat rate= 4..26
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntDivW , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [CFX_0], 0>,
- InstrStage<16, [CFX_DivBypass]>],
- [20, 2, 2], // Latency= 4..16, Repeat rate= 4..16
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMFFS , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [FPU_0]>],
- [11], // Latency = 7, Repeat rate = 1
- [FPR_Bypass]>,
- InstrItinData<IntMTFSB0 , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<7, [FPU_0]>],
- [11, 2, 2], // Latency = 7, Repeat rate = 7
- [NoBypass, NoBypass, NoBypass]>,
- InstrItinData<IntMulHD , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [CFX_0], 0>,
- InstrStage<2, [CFX_1]>],
- [9, 2, 2], // Latency = 4..7, Repeat rate = 2..4
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMulHW , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [CFX_0], 0>,
- InstrStage<1, [CFX_1]>],
- [8, 2, 2], // Latency = 4, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMulHWU , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [CFX_0], 0>,
- InstrStage<1, [CFX_1]>],
- [8, 2, 2], // Latency = 4, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMulLI , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [CFX_0], 0>,
- InstrStage<2, [CFX_1]>],
- [8, 2, 2], // Latency = 4 or 5, Repeat = 2
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntRotate , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [5, 2, 2], // Latency = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntRotateD , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<2, [SFX0, SFX1]>],
- [6, 2, 2], // Latency = 2, Repeat rate = 2
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntRotateDI , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [5, 2, 2], // Latency = 1, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntShift , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<2, [SFX0, SFX1]>],
- [6, 2, 2], // Latency = 2, Repeat rate = 2
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntTrapW , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<2, [SFX0]>],
- [6, 2], // Latency = 2, Repeat rate = 2
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<BrB , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [BU]>],
- [5, 2], // Latency = 1
- [NoBypass, GPR_Bypass]>,
- InstrItinData<BrCR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [BU]>],
- [5, 2, 2], // Latency = 1
- [CR_Bypass, CR_Bypass, CR_Bypass]>,
- InstrItinData<BrMCR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [BU]>],
- [5, 2], // Latency = 1
- [CR_Bypass, CR_Bypass]>,
- InstrItinData<BrMCRX , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [CFX_0]>],
- [5, 2, 2], // Latency = 1
- [CR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStDCBA , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStDCBF , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStDCBI , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLoad , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLoadUpd , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass],
- 2>, // 2 micro-ops
- InstrItinData<LdStLD , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLDARX , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<3, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 3
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLDU , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass],
- 2>, // 2 micro-ops
- InstrItinData<LdStStore , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStStoreUpd, [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [NoBypass, GPR_Bypass],
- 2>, // 2 micro-ops
- InstrItinData<LdStICBI , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTFD , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2, 2], // Latency = 3, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStSTFDU , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2, 2], // Latency = 3, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass, GPR_Bypass],
- 2>, // 2 micro-ops
- InstrItinData<LdStLFD , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [8, 2, 2], // Latency = 4, Repeat rate = 1
- [FPR_Bypass, GPR_Bypass, GPR_Bypass],
- 2>, // 2 micro-ops
- InstrItinData<LdStLFDU , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1], 0>,
- InstrStage<1, [LSU_0]>],
- [8, 2, 2], // Latency = 4, Repeat rate = 1
- [FPR_Bypass, GPR_Bypass, GPR_Bypass],
- 2>, // 2 micro-ops
- InstrItinData<LdStLHA , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLHAU , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [GPR_Bypass, GPR_Bypass],
- 2>, // 2 micro-ops
- InstrItinData<LdStLMW , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<4, [LSU_0]>],
- [8, 2], // Latency = r+3, Repeat rate = r+3
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStLWARX , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<3, [LSU_0]>],
- [7, 2, 2], // Latency = 3, Repeat rate = 3
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStSTD , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTDCX , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTDU , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [NoBypass, GPR_Bypass],
- 2>, // 2 micro-ops
- InstrItinData<LdStSTWCX , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>],
- [7, 2], // Latency = 3, Repeat rate = 1
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSync , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0]>]>,
- InstrItinData<SprMTMSR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<2, [CFX_0]>],
- [6, 2], // Latency = 2, Repeat rate = 4
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<SprTLBSYNC , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [LSU_0], 0>]>,
- InstrItinData<SprMFCR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<5, [CFX_0]>],
- [9, 2], // Latency = 5, Repeat rate = 5
- [GPR_Bypass, CR_Bypass]>,
- InstrItinData<SprMFMSR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<4, [SFX0]>],
- [8, 2], // Latency = 4, Repeat rate = 4
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<SprMFSPR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [CFX_0]>],
- [5], // Latency = 1, Repeat rate = 1
- [GPR_Bypass]>,
- InstrItinData<SprMFTB , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<4, [CFX_0]>],
- [8, 2], // Latency = 4, Repeat rate = 4
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprMTSPR , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [SFX0, SFX1]>],
- [5], // Latency = 1, Repeat rate = 1
- [GPR_Bypass]>,
- InstrItinData<FPGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [FPU_0]>],
- [11, 2, 2], // Latency = 7, Repeat rate = 1
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPAddSub , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [FPU_0]>],
- [11, 2, 2], // Latency = 7, Repeat rate = 1
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPCompare , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [FPU_0]>],
- [11, 2, 2], // Latency = 7, Repeat rate = 1
- [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPDivD , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<31, [FPU_0]>],
- [39, 2, 2], // Latency = 35, Repeat rate = 31
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPDivS , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<16, [FPU_0]>],
- [24, 2, 2], // Latency = 20, Repeat rate = 16
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPFused , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<1, [FPU_0]>],
- [11, 2, 2, 2], // Latency = 7, Repeat rate = 1
- [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPRes , [InstrStage<1, [DIS0, DIS1], 0>,
- InstrStage<2, [FPU_0]>],
- [12, 2], // Latency = 8, Repeat rate = 2
- [FPR_Bypass, FPR_Bypass]>
+ [E5500_DIS0, E5500_DIS1, E5500_SFX0, E5500_SFX1, E5500_BU,
+ E5500_CFX_DivBypass, E5500_CFX_0, E5500_CFX_1,
+ E5500_LSU_0, E5500_FPU_0],
+ [E5500_CR_Bypass, E5500_GPR_Bypass, E5500_FPR_Bypass], [
+ InstrItinData<IIC_IntSimple, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [5, 2, 2], // Latency = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntGeneral, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [5, 2, 2], // Latency = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntCompare, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [6, 2, 2], // Latency = 1 or 2
+ [E5500_CR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntDivD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0], 0>,
+ InstrStage<26, [E5500_CFX_DivBypass]>],
+ [30, 2, 2], // Latency= 4..26, Repeat rate= 4..26
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntDivW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0], 0>,
+ InstrStage<16, [E5500_CFX_DivBypass]>],
+ [20, 2, 2], // Latency= 4..16, Repeat rate= 4..16
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMFFS, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_FPU_0]>],
+ [11], // Latency = 7, Repeat rate = 1
+ [E5500_FPR_Bypass]>,
+ InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<7, [E5500_FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 7
+ [NoBypass, NoBypass, NoBypass]>,
+ InstrItinData<IIC_IntMulHD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0], 0>,
+ InstrStage<2, [E5500_CFX_1]>],
+ [9, 2, 2], // Latency = 4..7, Repeat rate = 2..4
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0], 0>,
+ InstrStage<1, [E5500_CFX_1]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0], 0>,
+ InstrStage<1, [E5500_CFX_1]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntMulLI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0], 0>,
+ InstrStage<2, [E5500_CFX_1]>],
+ [8, 2, 2], // Latency = 4 or 5, Repeat = 2
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotate, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [5, 2, 2], // Latency = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotateD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<2, [E5500_SFX0, E5500_SFX1]>],
+ [6, 2, 2], // Latency = 2, Repeat rate = 2
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntRotateDI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [5, 2, 2], // Latency = 1, Repeat rate = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntShift, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<2, [E5500_SFX0, E5500_SFX1]>],
+ [6, 2, 2], // Latency = 2, Repeat rate = 2
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_IntTrapW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<2, [E5500_SFX0]>],
+ [6, 2], // Latency = 2, Repeat rate = 2
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_BrB, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_BU]>],
+ [5, 2], // Latency = 1
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_BrCR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_BU]>],
+ [5, 2, 2], // Latency = 1
+ [E5500_CR_Bypass,
+ E5500_CR_Bypass, E5500_CR_Bypass]>,
+ InstrItinData<IIC_BrMCR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_BU]>],
+ [5, 2], // Latency = 1
+ [E5500_CR_Bypass, E5500_CR_Bypass]>,
+ InstrItinData<IIC_BrMCRX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0]>],
+ [5, 2, 2], // Latency = 1
+ [E5500_CR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoad, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLDARX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<3, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 3
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLDUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStStore, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStICBI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLFD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [E5500_FPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLFDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [E5500_FPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [E5500_FPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLHA, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [E5500_GPR_Bypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStLMW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<4, [E5500_LSU_0]>],
+ [8, 2], // Latency = r+3, Repeat rate = r+3
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<3, [E5500_LSU_0]>],
+ [7, 2, 2], // Latency = 3, Repeat rate = 3
+ [E5500_GPR_Bypass,
+ E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTDCX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSTDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStSTDUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_LdStSync, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0]>]>,
+ InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<2, [E5500_CFX_0]>],
+ [6, 2], // Latency = 2, Repeat rate = 4
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_LSU_0], 0>]>,
+ InstrItinData<IIC_SprMFCR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<5, [E5500_CFX_0]>],
+ [9, 2], // Latency = 5, Repeat rate = 5
+ [E5500_GPR_Bypass, E5500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<5, [E5500_CFX_0]>],
+ [9, 2], // Latency = 5, Repeat rate = 5
+ [E5500_GPR_Bypass, E5500_CR_Bypass]>,
+ InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<4, [E5500_SFX0]>],
+ [8, 2], // Latency = 4, Repeat rate = 4
+ [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_CFX_0]>],
+ [5], // Latency = 1, Repeat rate = 1
+ [E5500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMFTB, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<4, [E5500_CFX_0]>],
+ [8, 2], // Latency = 4, Repeat rate = 4
+ [NoBypass, E5500_GPR_Bypass]>,
+ InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+ [5], // Latency = 1, Repeat rate = 1
+ [E5500_GPR_Bypass]>,
+ InstrItinData<IIC_FPGeneral, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [E5500_FPR_Bypass,
+ E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+ InstrItinData<IIC_FPAddSub, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [E5500_FPR_Bypass,
+ E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+ InstrItinData<IIC_FPCompare, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [E5500_CR_Bypass,
+ E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<31, [E5500_FPU_0]>],
+ [39, 2, 2], // Latency = 35, Repeat rate = 31
+ [E5500_FPR_Bypass,
+ E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+ InstrItinData<IIC_FPDivS, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<16, [E5500_FPU_0]>],
+ [24, 2, 2], // Latency = 20, Repeat rate = 16
+ [E5500_FPR_Bypass,
+ E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+ InstrItinData<IIC_FPFused, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<1, [E5500_FPU_0]>],
+ [11, 2, 2, 2], // Latency = 7, Repeat rate = 1
+ [E5500_FPR_Bypass,
+ E5500_FPR_Bypass, E5500_FPR_Bypass,
+ E5500_FPR_Bypass]>,
+ InstrItinData<IIC_FPRes, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+ InstrStage<2, [E5500_FPU_0]>],
+ [12, 2], // Latency = 8, Repeat rate = 2
+ [E5500_FPR_Bypass, E5500_FPR_Bypass]>
]>;
// ===---------------------------------------------------------------------===//
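As with the e500mc hunk, the e5500 change is largely a mechanical rename: the per-core functional units and bypasses gain an E5500_ prefix and the itinerary classes gain an IIC_ prefix, presumably so that every core's definitions can coexist in the single TableGen namespace shared by the PPCSchedule*.td includes. A core then binds to one itinerary set where it is declared in PPC.td. The sketch below shows the usual shape of such a declaration; it is illustrative only — PPC.td is not part of this hunk, and the processor name and feature list are placeholder assumptions.

// Illustrative sketch (not from this patch): how a core declared in PPC.td
// would typically bind to one of the itinerary sets defined above.
def : Processor<"e5500", PPCE5500Itineraries,
                [DirectiveE5500, FeatureMFOCRF, Feature64Bit]>;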
diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td
index 72a0a39..21efd8f 100644
--- a/lib/Target/PowerPC/PPCScheduleG3.td
+++ b/lib/Target/PowerPC/PPCScheduleG3.td
@@ -11,61 +11,70 @@
//
//===----------------------------------------------------------------------===//
+def G3_BPU : FuncUnit; // Branch unit
+def G3_SLU : FuncUnit; // Store/load unit
+def G3_SRU : FuncUnit; // special register unit
+def G3_IU1 : FuncUnit; // integer unit 1 (simple)
+def G3_IU2 : FuncUnit; // integer unit 2 (complex)
+def G3_FPU1 : FuncUnit; // floating point unit 1
def G3Itineraries : ProcessorItineraries<
- [IU1, IU2, FPU1, BPU, SRU, SLU], [], [
- InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2]>]>,
- InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2]>]>,
- InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2]>]>,
- InstrItinData<IntDivW , [InstrStage<19, [IU1]>]>,
- InstrItinData<IntMFFS , [InstrStage<1, [FPU1]>]>,
- InstrItinData<IntMTFSB0 , [InstrStage<3, [FPU1]>]>,
- InstrItinData<IntMulHW , [InstrStage<5, [IU1]>]>,
- InstrItinData<IntMulHWU , [InstrStage<6, [IU1]>]>,
- InstrItinData<IntMulLI , [InstrStage<3, [IU1]>]>,
- InstrItinData<IntRotate , [InstrStage<1, [IU1, IU2]>]>,
- InstrItinData<IntShift , [InstrStage<1, [IU1, IU2]>]>,
- InstrItinData<IntTrapW , [InstrStage<2, [IU1, IU2]>]>,
- InstrItinData<BrB , [InstrStage<1, [BPU]>]>,
- InstrItinData<BrCR , [InstrStage<1, [SRU]>]>,
- InstrItinData<BrMCR , [InstrStage<1, [SRU]>]>,
- InstrItinData<BrMCRX , [InstrStage<1, [SRU]>]>,
- InstrItinData<LdStDCBA , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLoad , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStStore , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStICBI , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStSTFD , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStSTFDU , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLFDU , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLHA , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLHAU , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLMW , [InstrStage<34, [SLU]>]>,
- InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStSTWCX , [InstrStage<8, [SLU]>]>,
- InstrItinData<LdStSync , [InstrStage<3, [SLU]>]>,
- InstrItinData<SprISYNC , [InstrStage<2, [SRU]>]>,
- InstrItinData<SprMFSR , [InstrStage<3, [SRU]>]>,
- InstrItinData<SprMTMSR , [InstrStage<1, [SRU]>]>,
- InstrItinData<SprMTSR , [InstrStage<2, [SRU]>]>,
- InstrItinData<SprTLBSYNC , [InstrStage<3, [SRU]>]>,
- InstrItinData<SprMFCR , [InstrStage<1, [SRU]>]>,
- InstrItinData<SprMFMSR , [InstrStage<1, [SRU]>]>,
- InstrItinData<SprMFSPR , [InstrStage<3, [SRU]>]>,
- InstrItinData<SprMFTB , [InstrStage<3, [SRU]>]>,
- InstrItinData<SprMTSPR , [InstrStage<2, [SRU]>]>,
- InstrItinData<SprMTSRIN , [InstrStage<2, [SRU]>]>,
- InstrItinData<SprRFI , [InstrStage<2, [SRU]>]>,
- InstrItinData<SprSC , [InstrStage<2, [SRU]>]>,
- InstrItinData<FPGeneral , [InstrStage<1, [FPU1]>]>,
- InstrItinData<FPAddSub , [InstrStage<1, [FPU1]>]>,
- InstrItinData<FPCompare , [InstrStage<1, [FPU1]>]>,
- InstrItinData<FPDivD , [InstrStage<31, [FPU1]>]>,
- InstrItinData<FPDivS , [InstrStage<17, [FPU1]>]>,
- InstrItinData<FPFused , [InstrStage<2, [FPU1]>]>,
- InstrItinData<FPRes , [InstrStage<10, [FPU1]>]>
+ [G3_IU1, G3_IU2, G3_FPU1, G3_BPU, G3_SRU, G3_SLU], [], [
+ InstrItinData<IIC_IntSimple , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+ InstrItinData<IIC_IntGeneral , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+ InstrItinData<IIC_IntCompare , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+ InstrItinData<IIC_IntDivW , [InstrStage<19, [G3_IU1]>]>,
+ InstrItinData<IIC_IntMFFS , [InstrStage<1, [G3_FPU1]>]>,
+ InstrItinData<IIC_IntMTFSB0 , [InstrStage<3, [G3_FPU1]>]>,
+ InstrItinData<IIC_IntMulHW , [InstrStage<5, [G3_IU1]>]>,
+ InstrItinData<IIC_IntMulHWU , [InstrStage<6, [G3_IU1]>]>,
+ InstrItinData<IIC_IntMulLI , [InstrStage<3, [G3_IU1]>]>,
+ InstrItinData<IIC_IntRotate , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+ InstrItinData<IIC_IntShift , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+ InstrItinData<IIC_IntTrapW , [InstrStage<2, [G3_IU1, G3_IU2]>]>,
+ InstrItinData<IIC_BrB , [InstrStage<1, [G3_BPU]>]>,
+ InstrItinData<IIC_BrCR , [InstrStage<1, [G3_SRU]>]>,
+ InstrItinData<IIC_BrMCR , [InstrStage<1, [G3_SRU]>]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<1, [G3_SRU]>]>,
+ InstrItinData<IIC_LdStDCBA , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStDCBF , [InstrStage<3, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStDCBI , [InstrStage<3, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLoad , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStStore , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStICBI , [InstrStage<3, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStSTFD , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStSTFDU , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLFD , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLFDU , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLFDUX , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLHA , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLHAU , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<2, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLMW , [InstrStage<34, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStLWARX , [InstrStage<3, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStSTWCX , [InstrStage<8, [G3_SLU]>]>,
+ InstrItinData<IIC_LdStSync , [InstrStage<3, [G3_SLU]>]>,
+ InstrItinData<IIC_SprISYNC , [InstrStage<2, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMFSR , [InstrStage<3, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMTMSR , [InstrStage<1, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMTSR , [InstrStage<2, [G3_SRU]>]>,
+ InstrItinData<IIC_SprTLBSYNC , [InstrStage<3, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMFCR , [InstrStage<1, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMFMSR , [InstrStage<1, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMFSPR , [InstrStage<3, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMFTB , [InstrStage<3, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMTSPR , [InstrStage<2, [G3_SRU]>]>,
+ InstrItinData<IIC_SprMTSRIN , [InstrStage<2, [G3_SRU]>]>,
+ InstrItinData<IIC_SprRFI , [InstrStage<2, [G3_SRU]>]>,
+ InstrItinData<IIC_SprSC , [InstrStage<2, [G3_SRU]>]>,
+ InstrItinData<IIC_FPGeneral , [InstrStage<1, [G3_FPU1]>]>,
+ InstrItinData<IIC_FPAddSub , [InstrStage<1, [G3_FPU1]>]>,
+ InstrItinData<IIC_FPCompare , [InstrStage<1, [G3_FPU1]>]>,
+ InstrItinData<IIC_FPDivD , [InstrStage<31, [G3_FPU1]>]>,
+ InstrItinData<IIC_FPDivS , [InstrStage<17, [G3_FPU1]>]>,
+ InstrItinData<IIC_FPFused , [InstrStage<2, [G3_FPU1]>]>,
+ InstrItinData<IIC_FPRes , [InstrStage<10, [G3_FPU1]>]>
]>;
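The G3 table above (and the G4-family tables that follow) use the short form of InstrItinData: only the stage list is supplied, so the operand-cycle, bypass, and micro-op fields keep their defaults and, as I read those defaults, the modelled latency falls back to the stage occupancy. For example, the divide entry above reads:

// Short-form entry as used in the G3/G4 tables: stages only; operand cycles,
// bypasses, and the micro-op count are left at their defaults.
InstrItinData<IIC_IntDivW , [InstrStage<19, [G3_IU1]>]>,   // divw ties up IU1 for 19 cycles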
diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td
index fc9120d..340773e 100644
--- a/lib/Target/PowerPC/PPCScheduleG4.td
+++ b/lib/Target/PowerPC/PPCScheduleG4.td
@@ -11,71 +11,86 @@
//
//===----------------------------------------------------------------------===//
+def G4_BPU : FuncUnit; // Branch unit
+def G4_SLU : FuncUnit; // Store/load unit
+def G4_SRU : FuncUnit; // special register unit
+def G4_IU1 : FuncUnit; // integer unit 1 (simple)
+def G4_IU2 : FuncUnit; // integer unit 2 (complex)
+def G4_FPU1 : FuncUnit; // floating point unit 1
+def G4_VPU : FuncUnit; // vector permutation unit
+def G4_VIU1 : FuncUnit; // vector integer unit 1 (simple)
+def G4_VIU2 : FuncUnit; // vector integer unit 2 (complex)
+def G4_VFPU : FuncUnit; // vector floating point unit
+
def G4Itineraries : ProcessorItineraries<
- [IU1, IU2, SLU, SRU, BPU, FPU1, VIU1, VIU2, VPU, VFPU], [], [
- InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2]>]>,
- InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2]>]>,
- InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2]>]>,
- InstrItinData<IntDivW , [InstrStage<19, [IU1]>]>,
- InstrItinData<IntMFFS , [InstrStage<3, [FPU1]>]>,
- InstrItinData<IntMFVSCR , [InstrStage<1, [VIU1]>]>,
- InstrItinData<IntMTFSB0 , [InstrStage<3, [FPU1]>]>,
- InstrItinData<IntMulHW , [InstrStage<5, [IU1]>]>,
- InstrItinData<IntMulHWU , [InstrStage<6, [IU1]>]>,
- InstrItinData<IntMulLI , [InstrStage<3, [IU1]>]>,
- InstrItinData<IntRotate , [InstrStage<1, [IU1, IU2]>]>,
- InstrItinData<IntShift , [InstrStage<1, [IU1, IU2]>]>,
- InstrItinData<IntTrapW , [InstrStage<2, [IU1, IU2]>]>,
- InstrItinData<BrB , [InstrStage<1, [BPU]>]>,
- InstrItinData<BrCR , [InstrStage<1, [SRU]>]>,
- InstrItinData<BrMCR , [InstrStage<1, [SRU]>]>,
- InstrItinData<BrMCRX , [InstrStage<1, [SRU]>]>,
- InstrItinData<LdStDCBF , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStDCBI , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLoad , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStStore , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStDSS , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStICBI , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStSTFD , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStSTFDU , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLFDU , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLHA , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLHAU , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLMW , [InstrStage<34, [SLU]>]>,
- InstrItinData<LdStLVecX , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStSTVEBX , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStSTWCX , [InstrStage<5, [SLU]>]>,
- InstrItinData<LdStSync , [InstrStage<8, [SLU]>]>,
- InstrItinData<SprISYNC , [InstrStage<2, [SRU]>]>,
- InstrItinData<SprMFSR , [InstrStage<3, [SRU]>]>,
- InstrItinData<SprMTMSR , [InstrStage<1, [SRU]>]>,
- InstrItinData<SprMTSR , [InstrStage<2, [SRU]>]>,
- InstrItinData<SprTLBSYNC , [InstrStage<8, [SRU]>]>,
- InstrItinData<SprMFCR , [InstrStage<1, [SRU]>]>,
- InstrItinData<SprMFMSR , [InstrStage<1, [SRU]>]>,
- InstrItinData<SprMFSPR , [InstrStage<3, [SRU]>]>,
- InstrItinData<SprMFTB , [InstrStage<1, [SRU]>]>,
- InstrItinData<SprMTSPR , [InstrStage<2, [SRU]>]>,
- InstrItinData<SprMTSRIN , [InstrStage<2, [SRU]>]>,
- InstrItinData<SprRFI , [InstrStage<2, [SRU]>]>,
- InstrItinData<SprSC , [InstrStage<2, [SRU]>]>,
- InstrItinData<FPGeneral , [InstrStage<1, [FPU1]>]>,
- InstrItinData<FPAddSub , [InstrStage<1, [FPU1]>]>,
- InstrItinData<FPCompare , [InstrStage<1, [FPU1]>]>,
- InstrItinData<FPDivD , [InstrStage<31, [FPU1]>]>,
- InstrItinData<FPDivS , [InstrStage<17, [FPU1]>]>,
- InstrItinData<FPFused , [InstrStage<1, [FPU1]>]>,
- InstrItinData<FPRes , [InstrStage<10, [FPU1]>]>,
- InstrItinData<VecGeneral , [InstrStage<1, [VIU1]>]>,
- InstrItinData<VecFP , [InstrStage<4, [VFPU]>]>,
- InstrItinData<VecFPCompare, [InstrStage<1, [VIU1]>]>,
- InstrItinData<VecComplex , [InstrStage<3, [VIU2]>]>,
- InstrItinData<VecPerm , [InstrStage<1, [VPU]>]>,
- InstrItinData<VecFPRound , [InstrStage<4, [VFPU]>]>,
- InstrItinData<VecVSL , [InstrStage<1, [VIU1]>]>,
- InstrItinData<VecVSR , [InstrStage<1, [VIU1]>]>
+ [G4_IU1, G4_IU2, G4_SLU, G4_SRU, G4_BPU, G4_FPU1,
+ G4_VIU1, G4_VIU2, G4_VPU, G4_VFPU], [], [
+ InstrItinData<IIC_IntSimple , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+ InstrItinData<IIC_IntGeneral , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+ InstrItinData<IIC_IntCompare , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+ InstrItinData<IIC_IntDivW , [InstrStage<19, [G4_IU1]>]>,
+ InstrItinData<IIC_IntMFFS , [InstrStage<3, [G4_FPU1]>]>,
+ InstrItinData<IIC_IntMFVSCR , [InstrStage<1, [G4_VIU1]>]>,
+ InstrItinData<IIC_IntMTFSB0 , [InstrStage<3, [G4_FPU1]>]>,
+ InstrItinData<IIC_IntMulHW , [InstrStage<5, [G4_IU1]>]>,
+ InstrItinData<IIC_IntMulHWU , [InstrStage<6, [G4_IU1]>]>,
+ InstrItinData<IIC_IntMulLI , [InstrStage<3, [G4_IU1]>]>,
+ InstrItinData<IIC_IntRotate , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+ InstrItinData<IIC_IntShift , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+ InstrItinData<IIC_IntTrapW , [InstrStage<2, [G4_IU1, G4_IU2]>]>,
+ InstrItinData<IIC_BrB , [InstrStage<1, [G4_BPU]>]>,
+ InstrItinData<IIC_BrCR , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_BrMCR , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_LdStDCBF , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStDCBI , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLoad , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStStore , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStDSS , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStICBI , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStSTFD , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStSTFDU , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLFD , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLFDU , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLFDUX , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLHA , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLHAU , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLMW , [InstrStage<34, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLVecX , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStLWARX , [InstrStage<3, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStSTVEBX , [InstrStage<2, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStSTWCX , [InstrStage<5, [G4_SLU]>]>,
+ InstrItinData<IIC_LdStSync , [InstrStage<8, [G4_SLU]>]>,
+ InstrItinData<IIC_SprISYNC , [InstrStage<2, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMFSR , [InstrStage<3, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMTMSR , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMTSR , [InstrStage<2, [G4_SRU]>]>,
+ InstrItinData<IIC_SprTLBSYNC , [InstrStage<8, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMFCR , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMFMSR , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMFSPR , [InstrStage<3, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMFTB , [InstrStage<1, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMTSPR , [InstrStage<2, [G4_SRU]>]>,
+ InstrItinData<IIC_SprMTSRIN , [InstrStage<2, [G4_SRU]>]>,
+ InstrItinData<IIC_SprRFI , [InstrStage<2, [G4_SRU]>]>,
+ InstrItinData<IIC_SprSC , [InstrStage<2, [G4_SRU]>]>,
+ InstrItinData<IIC_FPGeneral , [InstrStage<1, [G4_FPU1]>]>,
+ InstrItinData<IIC_FPAddSub , [InstrStage<1, [G4_FPU1]>]>,
+ InstrItinData<IIC_FPCompare , [InstrStage<1, [G4_FPU1]>]>,
+ InstrItinData<IIC_FPDivD , [InstrStage<31, [G4_FPU1]>]>,
+ InstrItinData<IIC_FPDivS , [InstrStage<17, [G4_FPU1]>]>,
+ InstrItinData<IIC_FPFused , [InstrStage<1, [G4_FPU1]>]>,
+ InstrItinData<IIC_FPRes , [InstrStage<10, [G4_FPU1]>]>,
+ InstrItinData<IIC_VecGeneral , [InstrStage<1, [G4_VIU1]>]>,
+ InstrItinData<IIC_VecFP , [InstrStage<4, [G4_VFPU]>]>,
+ InstrItinData<IIC_VecFPCompare, [InstrStage<1, [G4_VIU1]>]>,
+ InstrItinData<IIC_VecComplex , [InstrStage<3, [G4_VIU2]>]>,
+ InstrItinData<IIC_VecPerm , [InstrStage<1, [G4_VPU]>]>,
+ InstrItinData<IIC_VecFPRound , [InstrStage<4, [G4_VFPU]>]>,
+ InstrItinData<IIC_VecVSL , [InstrStage<1, [G4_VIU1]>]>,
+ InstrItinData<IIC_VecVSR , [InstrStage<1, [G4_VIU1]>]>
]>;
diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td
index a4e82ce..1d9f13f 100644
--- a/lib/Target/PowerPC/PPCScheduleG4Plus.td
+++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -11,78 +11,102 @@
//
//===----------------------------------------------------------------------===//
-def IU3 : FuncUnit; // integer unit 3 (7450 simple)
-def IU4 : FuncUnit; // integer unit 4 (7450 simple)
+def G4P_BPU : FuncUnit; // Branch unit
+def G4P_SLU : FuncUnit; // Store/load unit
+def G4P_SRU : FuncUnit; // special register unit
+def G4P_IU1 : FuncUnit; // integer unit 1 (simple)
+def G4P_IU2 : FuncUnit; // integer unit 2 (complex)
+def G4P_IU3 : FuncUnit; // integer unit 3 (simple)
+def G4P_IU4 : FuncUnit; // integer unit 4 (simple)
+def G4P_FPU1 : FuncUnit; // floating point unit 1
+def G4P_VPU : FuncUnit; // vector permutation unit
+def G4P_VIU1 : FuncUnit; // vector integer unit 1 (simple)
+def G4P_VIU2 : FuncUnit; // vector integer unit 2 (complex)
+def G4P_VFPU : FuncUnit; // vector floating point unit
def G4PlusItineraries : ProcessorItineraries<
- [IU1, IU2, IU3, IU4, BPU, SLU, FPU1, VFPU, VIU1, VIU2, VPU], [], [
- InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
- InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
- InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
- InstrItinData<IntDivW , [InstrStage<23, [IU2]>]>,
- InstrItinData<IntMFFS , [InstrStage<5, [FPU1]>]>,
- InstrItinData<IntMFVSCR , [InstrStage<2, [VFPU]>]>,
- InstrItinData<IntMTFSB0 , [InstrStage<5, [FPU1]>]>,
- InstrItinData<IntMulHW , [InstrStage<4, [IU2]>]>,
- InstrItinData<IntMulHWU , [InstrStage<4, [IU2]>]>,
- InstrItinData<IntMulLI , [InstrStage<3, [IU2]>]>,
- InstrItinData<IntRotate , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
- InstrItinData<IntShift , [InstrStage<2, [IU1, IU2, IU3, IU4]>]>,
- InstrItinData<IntTrapW , [InstrStage<2, [IU1, IU2, IU3, IU4]>]>,
- InstrItinData<BrB , [InstrStage<1, [BPU]>]>,
- InstrItinData<BrCR , [InstrStage<2, [IU2]>]>,
- InstrItinData<BrMCR , [InstrStage<2, [IU2]>]>,
- InstrItinData<BrMCRX , [InstrStage<2, [IU2]>]>,
- InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLoad , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStStore , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStDSS , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStICBI , [InstrStage<3, [IU2]>]>,
- InstrItinData<LdStSTFD , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStSTFDU , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLFD , [InstrStage<4, [SLU]>]>,
- InstrItinData<LdStLFDU , [InstrStage<4, [SLU]>]>,
- InstrItinData<LdStLHA , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLHAU , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLMW , [InstrStage<37, [SLU]>]>,
- InstrItinData<LdStLVecX , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLWA , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStSTD , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStSTDCX , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStSTDU , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStSTVEBX , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStSTWCX , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStSync , [InstrStage<35, [SLU]>]>,
- InstrItinData<SprISYNC , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
- InstrItinData<SprMFSR , [InstrStage<4, [IU2]>]>,
- InstrItinData<SprMTMSR , [InstrStage<2, [IU2]>]>,
- InstrItinData<SprMTSR , [InstrStage<2, [IU2]>]>,
- InstrItinData<SprTLBSYNC , [InstrStage<3, [SLU]>]>,
- InstrItinData<SprMFCR , [InstrStage<2, [IU2]>]>,
- InstrItinData<SprMFMSR , [InstrStage<3, [IU2]>]>,
- InstrItinData<SprMFSPR , [InstrStage<4, [IU2]>]>,
- InstrItinData<SprMFTB , [InstrStage<5, [IU2]>]>,
- InstrItinData<SprMTSPR , [InstrStage<2, [IU2]>]>,
- InstrItinData<SprMTSRIN , [InstrStage<2, [IU2]>]>,
- InstrItinData<SprRFI , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
- InstrItinData<SprSC , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
- InstrItinData<FPGeneral , [InstrStage<5, [FPU1]>]>,
- InstrItinData<FPAddSub , [InstrStage<5, [FPU1]>]>,
- InstrItinData<FPCompare , [InstrStage<5, [FPU1]>]>,
- InstrItinData<FPDivD , [InstrStage<35, [FPU1]>]>,
- InstrItinData<FPDivS , [InstrStage<21, [FPU1]>]>,
- InstrItinData<FPFused , [InstrStage<5, [FPU1]>]>,
- InstrItinData<FPRes , [InstrStage<14, [FPU1]>]>,
- InstrItinData<VecGeneral , [InstrStage<1, [VIU1]>]>,
- InstrItinData<VecFP , [InstrStage<4, [VFPU]>]>,
- InstrItinData<VecFPCompare, [InstrStage<2, [VFPU]>]>,
- InstrItinData<VecComplex , [InstrStage<4, [VIU2]>]>,
- InstrItinData<VecPerm , [InstrStage<2, [VPU]>]>,
- InstrItinData<VecFPRound , [InstrStage<4, [VIU1]>]>,
- InstrItinData<VecVSL , [InstrStage<2, [VPU]>]>,
- InstrItinData<VecVSR , [InstrStage<2, [VPU]>]>
+ [G4P_IU1, G4P_IU2, G4P_IU3, G4P_IU4, G4P_BPU, G4P_SLU, G4P_FPU1,
+ G4P_VFPU, G4P_VIU1, G4P_VIU2, G4P_VPU], [], [
+ InstrItinData<IIC_IntSimple , [InstrStage<1, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_IntGeneral , [InstrStage<1, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_IntCompare , [InstrStage<1, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_IntDivW , [InstrStage<23, [G4P_IU2]>]>,
+ InstrItinData<IIC_IntMFFS , [InstrStage<5, [G4P_FPU1]>]>,
+ InstrItinData<IIC_IntMFVSCR , [InstrStage<2, [G4P_VFPU]>]>,
+ InstrItinData<IIC_IntMTFSB0 , [InstrStage<5, [G4P_FPU1]>]>,
+ InstrItinData<IIC_IntMulHW , [InstrStage<4, [G4P_IU2]>]>,
+ InstrItinData<IIC_IntMulHWU , [InstrStage<4, [G4P_IU2]>]>,
+ InstrItinData<IIC_IntMulLI , [InstrStage<3, [G4P_IU2]>]>,
+ InstrItinData<IIC_IntRotate , [InstrStage<1, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_IntShift , [InstrStage<2, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_IntTrapW , [InstrStage<2, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_BrB , [InstrStage<1, [G4P_BPU]>]>,
+ InstrItinData<IIC_BrCR , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_BrMCR , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_LdStDCBF , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStDCBI , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLoad , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStStore , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStDSS , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStICBI , [InstrStage<3, [G4P_IU2]>]>,
+ InstrItinData<IIC_LdStSTFD , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTFDU , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLFD , [InstrStage<4, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLFDU , [InstrStage<4, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLFDUX , [InstrStage<4, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLHA , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLHAU , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLMW , [InstrStage<37, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLVecX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLWA , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStLWARX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTD , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTDCX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTDU , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTDUX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTVEBX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSTWCX , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_LdStSync , [InstrStage<35, [G4P_SLU]>]>,
+ InstrItinData<IIC_SprISYNC , [InstrStage<0, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_SprMFSR , [InstrStage<4, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMTMSR , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMTSR , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprTLBSYNC , [InstrStage<3, [G4P_SLU]>]>,
+ InstrItinData<IIC_SprMFCR , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMFMSR , [InstrStage<3, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMFSPR , [InstrStage<4, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMFTB , [InstrStage<5, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMTSPR , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprMTSRIN , [InstrStage<2, [G4P_IU2]>]>,
+ InstrItinData<IIC_SprRFI , [InstrStage<1, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_SprSC , [InstrStage<0, [G4P_IU1, G4P_IU2,
+ G4P_IU3, G4P_IU4]>]>,
+ InstrItinData<IIC_FPGeneral , [InstrStage<5, [G4P_FPU1]>]>,
+ InstrItinData<IIC_FPAddSub , [InstrStage<5, [G4P_FPU1]>]>,
+ InstrItinData<IIC_FPCompare , [InstrStage<5, [G4P_FPU1]>]>,
+ InstrItinData<IIC_FPDivD , [InstrStage<35, [G4P_FPU1]>]>,
+ InstrItinData<IIC_FPDivS , [InstrStage<21, [G4P_FPU1]>]>,
+ InstrItinData<IIC_FPFused , [InstrStage<5, [G4P_FPU1]>]>,
+ InstrItinData<IIC_FPRes , [InstrStage<14, [G4P_FPU1]>]>,
+ InstrItinData<IIC_VecGeneral , [InstrStage<1, [G4P_VIU1]>]>,
+ InstrItinData<IIC_VecFP , [InstrStage<4, [G4P_VFPU]>]>,
+ InstrItinData<IIC_VecFPCompare, [InstrStage<2, [G4P_VFPU]>]>,
+ InstrItinData<IIC_VecComplex , [InstrStage<4, [G4P_VIU2]>]>,
+ InstrItinData<IIC_VecPerm , [InstrStage<2, [G4P_VPU]>]>,
+ InstrItinData<IIC_VecFPRound , [InstrStage<4, [G4P_VIU1]>]>,
+ InstrItinData<IIC_VecVSL , [InstrStage<2, [G4P_VPU]>]>,
+ InstrItinData<IIC_VecVSR , [InstrStage<2, [G4P_VPU]>]>
]>;
diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td
index c64998d..a3b73ab 100644
--- a/lib/Target/PowerPC/PPCScheduleG5.td
+++ b/lib/Target/PowerPC/PPCScheduleG5.td
@@ -11,90 +11,110 @@
//
//===----------------------------------------------------------------------===//
+def G5_BPU : FuncUnit; // Branch unit
+def G5_SLU : FuncUnit; // Store/load unit
+def G5_SRU : FuncUnit; // special register unit
+def G5_IU1 : FuncUnit; // integer unit 1 (simple)
+def G5_IU2 : FuncUnit; // integer unit 2 (complex)
+def G5_FPU1 : FuncUnit; // floating point unit 1
+def G5_FPU2 : FuncUnit; // floating point unit 2
+def G5_VPU : FuncUnit; // vector permutation unit
+def G5_VIU1 : FuncUnit; // vector integer unit 1 (simple)
+def G5_VIU2 : FuncUnit; // vector integer unit 2 (complex)
+def G5_VFPU : FuncUnit; // vector floating point unit
+
def G5Itineraries : ProcessorItineraries<
- [IU1, IU2, SLU, BPU, FPU1, FPU2, VFPU, VIU1, VIU2, VPU], [], [
- InstrItinData<IntSimple , [InstrStage<2, [IU1, IU2]>]>,
- InstrItinData<IntGeneral , [InstrStage<2, [IU1, IU2]>]>,
- InstrItinData<IntCompare , [InstrStage<3, [IU1, IU2]>]>,
- InstrItinData<IntDivD , [InstrStage<68, [IU1]>]>,
- InstrItinData<IntDivW , [InstrStage<36, [IU1]>]>,
- InstrItinData<IntMFFS , [InstrStage<6, [IU2]>]>,
- InstrItinData<IntMFVSCR , [InstrStage<1, [VFPU]>]>,
- InstrItinData<IntMTFSB0 , [InstrStage<6, [FPU1, FPU2]>]>,
- InstrItinData<IntMulHD , [InstrStage<7, [IU1, IU2]>]>,
- InstrItinData<IntMulHW , [InstrStage<5, [IU1, IU2]>]>,
- InstrItinData<IntMulHWU , [InstrStage<5, [IU1, IU2]>]>,
- InstrItinData<IntMulLI , [InstrStage<4, [IU1, IU2]>]>,
- InstrItinData<IntRFID , [InstrStage<1, [IU2]>]>,
- InstrItinData<IntRotateD , [InstrStage<2, [IU1, IU2]>]>,
- InstrItinData<IntRotateDI , [InstrStage<2, [IU1, IU2]>]>,
- InstrItinData<IntRotate , [InstrStage<4, [IU1, IU2]>]>,
- InstrItinData<IntShift , [InstrStage<2, [IU1, IU2]>]>,
- InstrItinData<IntTrapD , [InstrStage<1, [IU1, IU2]>]>,
- InstrItinData<IntTrapW , [InstrStage<1, [IU1, IU2]>]>,
- InstrItinData<BrB , [InstrStage<1, [BPU]>]>,
- InstrItinData<BrCR , [InstrStage<4, [BPU]>]>,
- InstrItinData<BrMCR , [InstrStage<2, [BPU]>]>,
- InstrItinData<BrMCRX , [InstrStage<3, [BPU]>]>,
- InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLoad , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStStore , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStDSS , [InstrStage<10, [SLU]>]>,
- InstrItinData<LdStICBI , [InstrStage<40, [SLU]>]>,
- InstrItinData<LdStSTFD , [InstrStage<4, [SLU]>]>,
- InstrItinData<LdStSTFDU , [InstrStage<4, [SLU]>]>,
- InstrItinData<LdStLD , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLDU , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLDARX , [InstrStage<11, [SLU]>]>,
- InstrItinData<LdStLFD , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLFDU , [InstrStage<5, [SLU]>]>,
- InstrItinData<LdStLHA , [InstrStage<5, [SLU]>]>,
- InstrItinData<LdStLHAU , [InstrStage<5, [SLU]>]>,
- InstrItinData<LdStLMW , [InstrStage<64, [SLU]>]>,
- InstrItinData<LdStLVecX , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStLWA , [InstrStage<5, [SLU]>]>,
- InstrItinData<LdStLWARX , [InstrStage<11, [SLU]>]>,
- InstrItinData<LdStSLBIA , [InstrStage<40, [SLU]>]>, // needs work
- InstrItinData<LdStSLBIE , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStSTD , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStSTDU , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStSTDCX , [InstrStage<11, [SLU]>]>,
- InstrItinData<LdStSTVEBX , [InstrStage<5, [SLU]>]>,
- InstrItinData<LdStSTWCX , [InstrStage<11, [SLU]>]>,
- InstrItinData<LdStSync , [InstrStage<35, [SLU]>]>,
- InstrItinData<SprISYNC , [InstrStage<40, [SLU]>]>, // needs work
- InstrItinData<SprMFSR , [InstrStage<3, [SLU]>]>,
- InstrItinData<SprMTMSR , [InstrStage<3, [SLU]>]>,
- InstrItinData<SprMTSR , [InstrStage<3, [SLU]>]>,
- InstrItinData<SprTLBSYNC , [InstrStage<3, [SLU]>]>,
- InstrItinData<SprMFCR , [InstrStage<2, [IU2]>]>,
- InstrItinData<SprMFMSR , [InstrStage<3, [IU2]>]>,
- InstrItinData<SprMFSPR , [InstrStage<3, [IU2]>]>,
- InstrItinData<SprMFTB , [InstrStage<10, [IU2]>]>,
- InstrItinData<SprMTSPR , [InstrStage<8, [IU2]>]>,
- InstrItinData<SprSC , [InstrStage<1, [IU2]>]>,
- InstrItinData<FPGeneral , [InstrStage<6, [FPU1, FPU2]>]>,
- InstrItinData<FPAddSub , [InstrStage<6, [FPU1, FPU2]>]>,
- InstrItinData<FPCompare , [InstrStage<8, [FPU1, FPU2]>]>,
- InstrItinData<FPDivD , [InstrStage<33, [FPU1, FPU2]>]>,
- InstrItinData<FPDivS , [InstrStage<33, [FPU1, FPU2]>]>,
- InstrItinData<FPFused , [InstrStage<6, [FPU1, FPU2]>]>,
- InstrItinData<FPRes , [InstrStage<6, [FPU1, FPU2]>]>,
- InstrItinData<FPSqrt , [InstrStage<40, [FPU1, FPU2]>]>,
- InstrItinData<VecGeneral , [InstrStage<2, [VIU1]>]>,
- InstrItinData<VecFP , [InstrStage<8, [VFPU]>]>,
- InstrItinData<VecFPCompare, [InstrStage<2, [VFPU]>]>,
- InstrItinData<VecComplex , [InstrStage<5, [VIU2]>]>,
- InstrItinData<VecPerm , [InstrStage<3, [VPU]>]>,
- InstrItinData<VecFPRound , [InstrStage<8, [VFPU]>]>,
- InstrItinData<VecVSL , [InstrStage<2, [VIU1]>]>,
- InstrItinData<VecVSR , [InstrStage<3, [VPU]>]>
+ [G5_IU1, G5_IU2, G5_SLU, G5_BPU, G5_FPU1, G5_FPU2,
+ G5_VFPU, G5_VIU1, G5_VIU2, G5_VPU], [], [
+ InstrItinData<IIC_IntSimple , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntGeneral , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntCompare , [InstrStage<3, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntDivD , [InstrStage<68, [G5_IU1]>]>,
+ InstrItinData<IIC_IntDivW , [InstrStage<36, [G5_IU1]>]>,
+ InstrItinData<IIC_IntMFFS , [InstrStage<6, [G5_IU2]>]>,
+ InstrItinData<IIC_IntMFVSCR , [InstrStage<1, [G5_VFPU]>]>,
+ InstrItinData<IIC_IntMTFSB0 , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_IntMulHD , [InstrStage<7, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntMulHW , [InstrStage<5, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntMulHWU , [InstrStage<5, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntMulLI , [InstrStage<4, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntRFID , [InstrStage<1, [G5_IU2]>]>,
+ InstrItinData<IIC_IntRotateD , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntRotateDI , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntRotate , [InstrStage<4, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntShift , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntTrapD , [InstrStage<1, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_IntTrapW , [InstrStage<1, [G5_IU1, G5_IU2]>]>,
+ InstrItinData<IIC_BrB , [InstrStage<1, [G5_BPU]>]>,
+ InstrItinData<IIC_BrCR , [InstrStage<4, [G5_BPU]>]>,
+ InstrItinData<IIC_BrMCR , [InstrStage<2, [G5_BPU]>]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<3, [G5_BPU]>]>,
+ InstrItinData<IIC_LdStDCBF , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLoad , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStStore , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStDSS , [InstrStage<10, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStICBI , [InstrStage<40, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTFD , [InstrStage<4, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTFDU , [InstrStage<4, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLD , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLDU , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLDUX , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLDARX , [InstrStage<11, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLFD , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLFDU , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLFDUX , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLHA , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLHAU , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLMW , [InstrStage<64, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLVecX , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLWA , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStLWARX , [InstrStage<11, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSLBIA , [InstrStage<40, [G5_SLU]>]>, // needs work
+ InstrItinData<IIC_LdStSLBIE , [InstrStage<2, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTD , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTDU , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTDUX , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTDCX , [InstrStage<11, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTVEBX , [InstrStage<5, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSTWCX , [InstrStage<11, [G5_SLU]>]>,
+ InstrItinData<IIC_LdStSync , [InstrStage<35, [G5_SLU]>]>,
+ InstrItinData<IIC_SprISYNC , [InstrStage<40, [G5_SLU]>]>, // needs work
+ InstrItinData<IIC_SprMFSR , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_SprMTMSR , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_SprMTSR , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_SprTLBSYNC , [InstrStage<3, [G5_SLU]>]>,
+ InstrItinData<IIC_SprMFCR , [InstrStage<2, [G5_IU2]>]>,
+ InstrItinData<IIC_SprMFCRF , [InstrStage<2, [G5_IU2]>]>,
+ InstrItinData<IIC_SprMFMSR , [InstrStage<3, [G5_IU2]>]>,
+ InstrItinData<IIC_SprMFSPR , [InstrStage<3, [G5_IU2]>]>,
+ InstrItinData<IIC_SprMFTB , [InstrStage<10, [G5_IU2]>]>,
+ InstrItinData<IIC_SprMTSPR , [InstrStage<8, [G5_IU2]>]>,
+ InstrItinData<IIC_SprSC , [InstrStage<1, [G5_IU2]>]>,
+ InstrItinData<IIC_FPGeneral , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPAddSub , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPCompare , [InstrStage<8, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPDivD , [InstrStage<33, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPDivS , [InstrStage<33, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPFused , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPRes , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPSqrtD , [InstrStage<40, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_FPSqrtS , [InstrStage<40, [G5_FPU1, G5_FPU2]>]>,
+ InstrItinData<IIC_VecGeneral , [InstrStage<2, [G5_VIU1]>]>,
+ InstrItinData<IIC_VecFP , [InstrStage<8, [G5_VFPU]>]>,
+ InstrItinData<IIC_VecFPCompare, [InstrStage<2, [G5_VFPU]>]>,
+ InstrItinData<IIC_VecComplex , [InstrStage<5, [G5_VIU2]>]>,
+ InstrItinData<IIC_VecPerm , [InstrStage<3, [G5_VPU]>]>,
+ InstrItinData<IIC_VecFPRound , [InstrStage<8, [G5_VFPU]>]>,
+ InstrItinData<IIC_VecVSL , [InstrStage<2, [G5_VIU1]>]>,
+ InstrItinData<IIC_VecVSR , [InstrStage<3, [G5_VPU]>]>
]>;
// ===---------------------------------------------------------------------===//
-// e5500 machine model for scheduling and other instruction cost heuristics.
+// G5 machine model for scheduling and other instruction cost heuristics.
def G5Model : SchedMachineModel {
let IssueWidth = 4; // 4 (non-branch) instructions are dispatched per cycle.
diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td
new file mode 100644
index 0000000..d3e4269
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleP7.td
@@ -0,0 +1,385 @@
+//===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the POWER7 processor.
+//
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// IBM POWER7 multicore server processor
+// B. Sinharoy, et al.
+// IBM J. Res. & Dev. (55) 3. May/June 2011.
+
+// Scheduling for the P7 involves tracking two types of resources:
+// 1. The dispatch bundle slots
+// 2. The functional unit resources
+
+// Dispatch units:
+def P7_DU1 : FuncUnit;
+def P7_DU2 : FuncUnit;
+def P7_DU3 : FuncUnit;
+def P7_DU4 : FuncUnit;
+def P7_DU5 : FuncUnit;
+def P7_DU6 : FuncUnit;
+
+def P7_LS1 : FuncUnit; // Load/Store pipeline 1
+def P7_LS2 : FuncUnit; // Load/Store pipeline 2
+
+def P7_FX1 : FuncUnit; // FX pipeline 1
+def P7_FX2 : FuncUnit; // FX pipeline 2
+
+// VS pipeline 1 (vector integer ops. always here)
+def P7_VS1 : FuncUnit; // VS pipeline 1
+// VS pipeline 2 (128-bit stores and perms. here)
+def P7_VS2 : FuncUnit; // VS pipeline 2
+
+def P7_CRU : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
+def P7_BRU : FuncUnit; // BR unit
+
+// Notes:
+// Each LSU pipeline can also execute FX add and logical instructions.
+// Each LSU pipeline can complete a load or store in one cycle.
+//
+// Each store is broken into two parts: AGEN goes to the LSU while a
+// "data steering" op. goes to the FXU or VSU.
+//
+// FX loads have a two-cycle load-to-use latency (so one "bubble" cycle).
+// VSU loads have a three-cycle load-to-use latency (so two "bubble" cycles).
+//
+// Frequent FX ops. take only one cycle and results can be used again in the
+// next cycle (there is a self-bypass). Getting results from the other FX
+// pipeline takes an additional cycle.
+//
+// The VSU XS is similar to the POWER6, but with a pipeline length of 2 cycles
+// (instead of 3 cycles on the POWER6). VSU XS handles vector FX-style ops.
+// Dispatch of an instruction to VS1 that uses four single prec. inputs
+// (either to a float or XC op) prevents dispatch in that cycle to VS2 of any
+// floating point instruction.
+//
+// The VSU PM is similar to the POWER6, but with a pipeline length of 3 cycles
+// (instead of 4 cycles on the POWER6). vsel is handled by the PM pipeline
+// (unlike on the POWER6).
+//
+// FMA from the VSUs can forward results in 6 cycles. VS1 XS and vector FP
+// share the same write-back, and have a 5-cycle latency difference, so the
+// IFU/IDU will not dispatch an XS instruction 5 cycles after a vector FP
+// op. has been dispatched to VS1.
+//
+// Three cycles after an L1 cache hit, a dependent VSU instruction can issue.
+//
+// Instruction dispatch groups have (at most) four non-branch instructions, and
+// two branches. Unlike on the POWER4/5, a branch does not automatically
+// end the dispatch group, but a second branch must be the last in the group.
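+//
+// As a rough reading aid (not an extra itinerary definition), the two-cycle
+// FX load-to-use latency described above shows up as the first operand cycle
+// of the IIC_LdStLoad entry below:
+//   InstrItinData<IIC_LdStLoad , [InstrStage<1, [P7_DU1, P7_DU2,
+//                                                P7_DU3, P7_DU4], 0>,
+//                                 InstrStage<1, [P7_LS1, P7_LS2]>],
+//                                [2, 1, 1]>
+// where 2 is the cycle at which the loaded value becomes available and the
+// 1s are the cycles at which the source operands are read.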
+
+def P7Itineraries : ProcessorItineraries<
+ [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6,
+ P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [
+ InstrItinData<IIC_IntSimple , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2,
+ P7_LS1, P7_LS2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntGeneral , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntCompare , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ // FIXME: Add record-form itinerary data.
+ InstrItinData<IIC_IntDivW , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<36, [P7_FX1, P7_FX2]>],
+ [36, 1, 1]>,
+ InstrItinData<IIC_IntDivD , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<68, [P7_FX1, P7_FX2]>],
+ [68, 1, 1]>,
+ InstrItinData<IIC_IntMulHW , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [4, 1, 1]>,
+ InstrItinData<IIC_IntMulHWU , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [4, 1, 1]>,
+ InstrItinData<IIC_IntMulLI , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [4, 1, 1]>,
+ InstrItinData<IIC_IntRotate , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntRotateD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntShift , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_IntTrapW , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1]>,
+ InstrItinData<IIC_IntTrapD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1]>,
+ InstrItinData<IIC_BrB , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
+ InstrStage<1, [P7_BRU]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_BrCR , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_CRU]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_BrMCR , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
+ InstrStage<1, [P7_BRU]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
+ InstrStage<1, [P7_BRU]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLoad , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [2, 2, 1, 1]>,
+ InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 3, 1, 1]>,
+ InstrItinData<IIC_LdStLD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_LdStLDU , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [2, 2, 1, 1]>,
+ InstrItinData<IIC_LdStLDUX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 3, 1, 1]>,
+ InstrItinData<IIC_LdStLFD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLVecX , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLFDU , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 3, 1, 1]>,
+ InstrItinData<IIC_LdStLFDUX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 3, 1, 1]>,
+ InstrItinData<IIC_LdStLHA , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLHAU , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [4, 4, 1, 1]>,
+ InstrItinData<IIC_LdStLHAUX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [4, 4, 1, 1]>,
+ InstrItinData<IIC_LdStLWA , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLDARX, [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_LdStLMW , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_LdStStore , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTDU , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<IIC_LdStSTDUX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<IIC_LdStSTFD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTFDU , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_FX1, P7_FX2], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<IIC_LdStSTVEBX , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2], 0>,
+ InstrStage<1, [P7_VS2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTDCX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_LdStSTWCX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_LS1, P7_LS2]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_DU2], 0>,
+ InstrStage<1, [P7_DU3], 0>,
+ InstrStage<1, [P7_DU4], 0>,
+ InstrStage<1, [P7_CRU]>,
+ InstrStage<1, [P7_FX1, P7_FX2]>],
+ [3, 1]>, // mtcr
+ InstrItinData<IIC_SprMFCR , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_CRU]>],
+ [6, 1]>,
+ InstrItinData<IIC_SprMFCRF , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_CRU]>],
+ [3, 1]>,
+ InstrItinData<IIC_SprMTSPR , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_FX1]>],
+ [4, 1]>, // mtctr
+ InstrItinData<IIC_FPGeneral , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [5, 1, 1]>,
+ InstrItinData<IIC_FPCompare , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [8, 1, 1]>,
+ InstrItinData<IIC_FPDivD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [33, 1, 1]>,
+ InstrItinData<IIC_FPDivS , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [27, 1, 1]>,
+ InstrItinData<IIC_FPSqrtD , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [44, 1, 1]>,
+ InstrItinData<IIC_FPSqrtS , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [32, 1, 1]>,
+ InstrItinData<IIC_FPFused , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [5, 1, 1, 1]>,
+ InstrItinData<IIC_FPRes , [InstrStage<1, [P7_DU1, P7_DU2,
+ P7_DU3, P7_DU4], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [5, 1, 1]>,
+ InstrItinData<IIC_VecGeneral , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VecVSL , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VecVSR , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VecFP , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [6, 1, 1]>,
+ InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [6, 1, 1]>,
+ InstrItinData<IIC_VecFPRound , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1, P7_VS2]>],
+ [6, 1, 1]>,
+ InstrItinData<IIC_VecComplex , [InstrStage<1, [P7_DU1], 0>,
+ InstrStage<1, [P7_VS1]>],
+ [7, 1, 1]>,
+ InstrItinData<IIC_VecPerm , [InstrStage<1, [P7_DU1, P7_DU2], 0>,
+ InstrStage<1, [P7_VS2]>],
+ [3, 1, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// P7 machine model for scheduling and other instruction cost heuristics.
+
+def P7Model : SchedMachineModel {
+ let IssueWidth = 6; // 4 (non-branch) instructions are dispatched per cycle.
+ // Note that the dispatch bundle size is 6 (including
+ // branches), but the total internal issue bandwidth per
+ // cycle (from all queues) is 8.
+
+ let MinLatency = 0; // Out-of-order dispatch.
+ let LoadLatency = 3; // Optimistic load latency assuming bypass.
+                       // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+ let MispredictPenalty = 16;
+
+ let Itineraries = P7Itineraries;
+}
+
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index 7231ab1..b07abe4 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -17,8 +17,8 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
@@ -31,12 +31,24 @@
using namespace llvm;
PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool is64Bit)
+ const std::string &FS, bool is64Bit,
+ CodeGenOpt::Level OptLevel)
: PPCGenSubtargetInfo(TT, CPU, FS)
, IsPPC64(is64Bit)
, TargetTriple(TT) {
initializeEnvironment();
- resetSubtargetFeatures(CPU, FS);
+
+ std::string FullFS = FS;
+
+ // At -O2 and above, track CR bits as individual registers.
+ if (OptLevel >= CodeGenOpt::Default) {
+ if (!FullFS.empty())
+ FullFS = "+crbits," + FullFS;
+ else
+ FullFS = "+crbits";
+ }
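+  // For example (illustrative only): an incoming feature string of
+  // "+altivec" becomes "+crbits,+altivec" at -O2 and above.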
+
+ resetSubtargetFeatures(CPU, FullFS);
}
/// SetJITMode - This is called to inform the subtarget info that we are
@@ -73,8 +85,10 @@ void PPCSubtarget::initializeEnvironment() {
HasMFOCRF = false;
Has64BitSupport = false;
Use64BitRegs = false;
+ UseCRBits = false;
HasAltivec = false;
HasQPX = false;
+ HasVSX = false;
HasFCPSGN = false;
HasFSQRT = false;
HasFRE = false;
@@ -179,7 +193,7 @@ bool PPCSubtarget::enablePostRAScheduler(
return OptLevel >= CodeGenOpt::Default;
}
-// Embedded cores need aggressive scheduling.
+// Embedded cores need aggressive scheduling (and some others also benefit).
static bool needsAggressiveScheduling(unsigned Directive) {
switch (Directive) {
default: return false;
@@ -187,6 +201,7 @@ static bool needsAggressiveScheduling(unsigned Directive) {
case PPC::DIR_A2:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
+ case PPC::DIR_PWR7:
return true;
}
}
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index c863a6e..87e012e 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -73,6 +73,7 @@ protected:
bool HasMFOCRF;
bool Has64BitSupport;
bool Use64BitRegs;
+ bool UseCRBits;
bool IsPPC64;
bool HasAltivec;
bool HasQPX;
@@ -103,7 +104,8 @@ public:
/// of the specified triple.
///
PPCSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool is64Bit);
+ const std::string &FS, bool is64Bit,
+ CodeGenOpt::Level OptLevel);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
@@ -126,22 +128,6 @@ public:
/// selection.
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
- /// getDataLayoutString - Return the pointer size and type alignment
- /// properties of this subtarget.
- const char *getDataLayoutString() const {
- // Note, the alignment values for f64 and i64 on ppc64 in Darwin
- // documentation are wrong; these are correct (i.e. "what gcc does").
- if (isPPC64() && isSVR4ABI()) {
- if (TargetTriple.getOS() == llvm::Triple::FreeBSD)
- return "E-p:64:64-f64:64:64-i64:64:64-v128:128:128-n32:64";
- else
- return "E-p:64:64-f64:64:64-i64:64:64-f128:128:128-v128:128:128-n32:64";
- }
-
- return isPPC64() ? "E-p:64:64-f64:64:64-i64:64:64-f128:64:128-n32:64"
- : "E-p:32:32-f64:64:64-i64:64:64-f128:64:128-n32";
- }
-
/// \brief Reset the features for the PowerPC target.
virtual void resetSubtargetFeatures(const MachineFunction *MF);
private:
@@ -162,6 +148,10 @@ public:
/// has64BitSupport() returns true.
bool use64BitRegs() const { return Use64BitRegs; }
+ /// useCRBits - Return true if we should store and manipulate i1 values in
+ /// the individual condition register bits.
+ bool useCRBits() const { return UseCRBits; }
+
/// hasLazyResolverStub - Return true if accesses to the specified global have
/// to go through a dyld lazy resolution stub. This means that an extra load
/// is required to get the address of the global.
@@ -188,6 +178,7 @@ public:
bool hasFPCVT() const { return HasFPCVT; }
bool hasAltivec() const { return HasAltivec; }
bool hasQPX() const { return HasQPX; }
+ bool hasVSX() const { return HasVSX; }
bool hasMFOCRF() const { return HasMFOCRF; }
bool hasISEL() const { return HasISEL; }
bool hasPOPCNTD() const { return HasPOPCNTD; }
@@ -200,8 +191,6 @@ public:
/// isDarwin - True if this is any darwin platform.
bool isDarwin() const { return TargetTriple.isMacOSX(); }
- /// isBGP - True if this is a BG/P platform.
- bool isBGP() const { return TargetTriple.getVendor() == Triple::BGP; }
/// isBGQ - True if this is a BG/Q platform.
bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; }
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 9acefe5..e7438f3 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -26,6 +26,10 @@ static cl::
opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
cl::desc("Disable CTR loops for PPC"));
+static cl::opt<bool>
+VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early",
+ cl::Hidden, cl::desc("Schedule VSX FMA instruction mutation early"));
+
extern "C" void LLVMInitializePowerPCTarget() {
// Register the targets
RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
@@ -33,6 +37,41 @@ extern "C" void LLVMInitializePowerPCTarget() {
RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget);
}
+/// Return the datalayout string of a subtarget.
+static std::string getDataLayoutString(const PPCSubtarget &ST) {
+ const Triple &T = ST.getTargetTriple();
+
+ std::string Ret;
+
+  // Most PPC* platforms are big endian; PPC64LE is little endian.
+ if (ST.isLittleEndian())
+ Ret = "e";
+ else
+ Ret = "E";
+
+ Ret += DataLayout::getManglingComponent(T);
+
+ // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit
+ // pointers.
+ if (!ST.isPPC64() || T.getOS() == Triple::Lv2)
+ Ret += "-p:32:32";
+
+ // Note, the alignment values for f64 and i64 on ppc64 in Darwin
+ // documentation are wrong; these are correct (i.e. "what gcc does").
+ if (ST.isPPC64() || ST.isSVR4ABI())
+ Ret += "-i64:64";
+ else
+ Ret += "-f64:32:64";
+
+  // PPC64 has 32- and 64-bit registers; PPC32 has only 32-bit ones.
+ if (ST.isPPC64())
+ Ret += "-n32:64";
+ else
+ Ret += "-n32";
+
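+  // For illustration (a sketch, assuming DataLayout::getManglingComponent
+  // returns "-m:e" for ELF triples): powerpc64-unknown-linux-gnu yields
+  // "E-m:e-i64:64-n32:64", while powerpc-unknown-linux-gnu yields
+  // "E-m:e-p:32:32-i64:64-n32".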
+ return Ret;
+}
+
PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
@@ -40,15 +79,11 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL,
bool is64Bit)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, is64Bit),
- DL(Subtarget.getDataLayoutString()), InstrInfo(*this),
+ Subtarget(TT, CPU, FS, is64Bit, OL),
+ DL(getDataLayoutString(Subtarget)), InstrInfo(*this),
FrameLowering(Subtarget), JITInfo(*this, is64Bit),
TLInfo(*this), TSInfo(*this),
InstrItins(Subtarget.getInstrItineraryData()) {
-
- // The binutils for the BG/P are too old for CFI.
- if (Subtarget.isBGP())
- setMCUseCFI(false);
initAsmInfo();
}
@@ -95,6 +130,7 @@ public:
virtual bool addPreISel();
virtual bool addILPOpts();
virtual bool addInstSelector();
+ virtual bool addPreRegAlloc();
virtual bool addPreSched2();
virtual bool addPreEmitPass();
};
@@ -129,10 +165,26 @@ bool PPCPassConfig::addInstSelector() {
addPass(createPPCCTRLoopsVerify());
#endif
+ if (getPPCSubtarget().hasVSX())
+ addPass(createPPCVSXCopyPass());
+
+ return false;
+}
+
+bool PPCPassConfig::addPreRegAlloc() {
+ if (getPPCSubtarget().hasVSX()) {
+ initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+ insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
+ &PPCVSXFMAMutateID);
+ }
+
return false;
}
bool PPCPassConfig::addPreSched2() {
+ if (getPPCSubtarget().hasVSX())
+ addPass(createPPCVSXCopyCleanupPass());
+
if (getOptLevel() != CodeGenOpt::None)
addPass(&IfConverterID);
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
index ec1e606..2903cc1 100644
--- a/lib/Target/PowerPC/PPCTargetObjectFile.cpp
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -8,10 +8,10 @@
//===----------------------------------------------------------------------===//
#include "PPCTargetObjectFile.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/Target/Mangler.h"
using namespace llvm;
@@ -22,16 +22,9 @@ Initialize(MCContext &Ctx, const TargetMachine &TM) {
InitializeELF(TM.Options.UseInitArray);
}
-const MCSection * PPC64LinuxTargetObjectFile::
-SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
- Mangler *Mang, const TargetMachine &TM) const {
-
- const MCSection *DefaultSection =
- TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM);
-
- if (DefaultSection != ReadOnlySection)
- return DefaultSection;
-
+const MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal(
+ const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
// Here override ReadOnlySection to DataRelROSection for PPC64 SVR4 ABI
// when we have a constant that contains global relocations. This is
// necessary because of this ABI's handling of pointers to functions in
@@ -46,14 +39,17 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
// linker, so we must use DataRelROSection instead of ReadOnlySection.
// For more information, see the description of ELIMINATE_COPY_RELOCS in
// GNU ld.
- const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ if (Kind.isReadOnly()) {
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
- if (GVar && GVar->isConstant() &&
- (GVar->getInitializer()->getRelocationInfo() ==
- Constant::GlobalRelocations))
- return DataRelROSection;
+ if (GVar && GVar->isConstant() &&
+ (GVar->getInitializer()->getRelocationInfo() ==
+ Constant::GlobalRelocations))
+ Kind = SectionKind::getReadOnlyWithRel();
+ }
- return DefaultSection;
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind,
+ Mang, TM);
}
const MCExpr *PPC64LinuxTargetObjectFile::
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.h b/lib/Target/PowerPC/PPCTargetObjectFile.h
index 262c522..3e71bbc 100644
--- a/lib/Target/PowerPC/PPCTargetObjectFile.h
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.h
@@ -20,14 +20,14 @@ namespace llvm {
/// 64-bit PowerPC Linux.
class PPC64LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
- virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
- virtual const MCSection *
- SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
- Mangler *Mang, const TargetMachine &TM) const;
+ const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
+ SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const override;
/// \brief Describe a TLS variable address within debug info.
- virtual const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const;
+ const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
};
} // end namespace llvm
diff --git a/lib/Target/PowerPC/PPCTargetStreamer.h b/lib/Target/PowerPC/PPCTargetStreamer.h
index e876be1..74b5f45 100644
--- a/lib/Target/PowerPC/PPCTargetStreamer.h
+++ b/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -15,8 +15,10 @@
namespace llvm {
class PPCTargetStreamer : public MCTargetStreamer {
public:
+ PPCTargetStreamer(MCStreamer &S);
virtual ~PPCTargetStreamer();
virtual void emitTCEntry(const MCSymbol &S) = 0;
+ virtual void emitMachine(StringRef CPU) = 0;
};
}
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 8879630..2f4d5c1 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -19,8 +19,8 @@
#include "PPCTargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
using namespace llvm;
// Declare the pass initialization routine locally as target-specific passes
@@ -32,7 +32,7 @@ void initializePPCTTIPass(PassRegistry &);
namespace {
-class PPCTTI : public ImmutablePass, public TargetTransformInfo {
+class PPCTTI final : public ImmutablePass, public TargetTransformInfo {
const PPCTargetMachine *TM;
const PPCSubtarget *ST;
const PPCTargetLowering *TLI;
@@ -52,15 +52,11 @@ public:
initializePPCTTIPass(*PassRegistry::getPassRegistry());
}
- virtual void initializePass() {
+ virtual void initializePass() override {
pushTTIStack(this);
}
- virtual void finalizePass() {
- popTTIStack();
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
TargetTransformInfo::getAnalysisUsage(AU);
}
@@ -68,7 +64,7 @@ public:
static char ID;
/// Provide necessary pointer adjustments for the two base classes.
- virtual void *getAdjustedAnalysisPointer(const void *ID) {
+ virtual void *getAdjustedAnalysisPointer(const void *ID) override {
if (ID == &TargetTransformInfo::ID)
return (TargetTransformInfo*)this;
return this;
@@ -76,31 +72,33 @@ public:
/// \name Scalar TTI Implementations
/// @{
- virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
- virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
+ virtual PopcntSupportKind
+ getPopcntSupport(unsigned TyWidth) const override;
+ virtual void getUnrollingPreferences(
+ Loop *L, UnrollingPreferences &UP) const override;
/// @}
/// \name Vector TTI Implementations
/// @{
- virtual unsigned getNumberOfRegisters(bool Vector) const;
- virtual unsigned getRegisterBitWidth(bool Vector) const;
- virtual unsigned getMaximumUnrollFactor() const;
+ virtual unsigned getNumberOfRegisters(bool Vector) const override;
+ virtual unsigned getRegisterBitWidth(bool Vector) const override;
+ virtual unsigned getMaximumUnrollFactor() const override;
virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind,
- OperandValueKind) const;
+ OperandValueKind) const override;
virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
- int Index, Type *SubTp) const;
+ int Index, Type *SubTp) const override;
virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) const;
+ Type *Src) const override;
virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) const;
+ Type *CondTy) const override;
virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) const;
+ unsigned Index) const override;
virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
- unsigned AddressSpace) const;
+ unsigned AddressSpace) const override;
/// @}
};
@@ -141,7 +139,7 @@ void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {
if (Vector && !ST->hasAltivec())
return 0;
- return 32;
+ return ST->hasVSX() ? 64 : 32;
}
unsigned PPCTTI::getRegisterBitWidth(bool Vector) const {
@@ -210,6 +208,14 @@ unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
+ // Double-precision scalars are already located in index #0.
+ if (Index == 0)
+ return 0;
+
+ return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+ }
+
// Estimated cost of a load-hit-store delay. This was obtained
// experimentally as a minimum needed to prevent unprofitable
// vectorization for the paq8p benchmark. It may need to be
@@ -235,14 +241,16 @@ unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
"Invalid Opcode");
- // Each load/store unit costs 1.
- unsigned Cost = LT.first * 1;
+ unsigned Cost =
+ TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+
+ // FIXME: Update this for VSX loads/stores that support unaligned access.
// PPC in general does not support unaligned loads and stores. They'll need
// to be decomposed based on the alignment factor.
unsigned SrcBytes = LT.second.getStoreSize();
if (SrcBytes && Alignment && Alignment < SrcBytes)
- Cost *= (SrcBytes/Alignment);
+ Cost += LT.first*(SrcBytes/Alignment-1);
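+    // For example, a 16-byte vector access with 4-byte alignment adds
+    // LT.first * 3 on top of the base cost (an illustration of the formula
+    // above, not a tuned number).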
return Cost;
}
diff --git a/lib/Target/PowerPC/TargetInfo/CMakeLists.txt b/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
index fdb8a62..c9548c7 100644
--- a/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
+++ b/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMPowerPCInfo
PowerPCTargetInfo.cpp
)
-
-add_dependencies(LLVMPowerPCInfo PowerPCCommonTableGen)
diff --git a/lib/Target/PowerPC/TargetInfo/LLVMBuild.txt b/lib/Target/PowerPC/TargetInfo/LLVMBuild.txt
index f77d85b..4102346 100644
--- a/lib/Target/PowerPC/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/PowerPC/TargetInfo/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = PowerPCInfo
parent = PowerPC
-required_libraries = MC Support Target
+required_libraries = Support
add_to_library_groups = PowerPC
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 025b28e..3e1848b 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -28,11 +28,11 @@ class TargetMachine;
FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
FunctionPass *createR600TextureIntrinsicsReplacer();
FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
-FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm);
+FunctionPass *createR600EmitClauseMarkers();
FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
FunctionPass *createR600Packetizer(TargetMachine &tm);
FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
-FunctionPass *createAMDGPUCFGStructurizerPass(TargetMachine &tm);
+FunctionPass *createAMDGPUCFGStructurizerPass();
// SI Passes
FunctionPass *createSITypeRewriter();
@@ -68,7 +68,7 @@ namespace ShaderType {
/// various memory regions on the hardware. On the CPU
/// all of the address spaces point to the same memory,
/// however on the GPU, each address space points to
-/// a seperate piece of memory that is unique from other
+/// a separate piece of memory that is unique from other
/// memory locations.
namespace AMDGPUAS {
enum AddressSpaces {
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 182235b..d1e2cf5 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -63,6 +63,11 @@ def FeatureCaymanISA : SubtargetFeature<"caymanISA",
"true",
"Use Cayman ISA">;
+def FeatureCFALUBug : SubtargetFeature<"cfalubug",
+ "CFALUBug",
+ "true",
+ "GPU has CF_ALU bug">;
+
class SubtargetFeatureFetchLimit <string Value> :
SubtargetFeature <"fetch"#Value,
"TexVTXClauseSize",
@@ -72,6 +77,16 @@ class SubtargetFeatureFetchLimit <string Value> :
def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
+class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
+ "wavefrontsize"#Value,
+ "WavefrontSize",
+ !cast<string>(Value),
+ "The number of threads per wavefront">;
+
+def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
+def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
+def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+
class SubtargetFeatureGeneration <string Value,
list<SubtargetFeature> Implies> :
SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
@@ -87,7 +102,7 @@ def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
[FeatureFetchLimit16]>;
def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
- [FeatureFetchLimit16]>;
+ [FeatureFetchLimit16, FeatureWavefrontSize64]>;
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
[Feature64BitPtr, FeatureFP64]>;
@@ -100,19 +115,9 @@ def AMDGPUInstrInfo : InstrInfo {
let guessInstructionProperties = 1;
}
-//===----------------------------------------------------------------------===//
-// Declare the target which we are implementing
-//===----------------------------------------------------------------------===//
-def AMDGPUAsmWriter : AsmWriter {
- string AsmWriterClassName = "InstPrinter";
- int Variant = 0;
- bit isMCAsmWriter = 1;
-}
-
def AMDGPU : Target {
// Pull in Instruction Info:
let InstructionSet = AMDGPUInstrInfo;
- let AssemblyWriters = [AMDGPUAsmWriter];
}
// Include AMDGPU TD files
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 67bdba2..b166c45 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -46,28 +46,26 @@ extern "C" void LLVMInitializeR600AsmPrinter() {
}
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
- : AsmPrinter(TM, Streamer)
-{
- DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode() &&
- ! Streamer.hasRawTextSupport();
+ : AsmPrinter(TM, Streamer) {
+ DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode();
}
-/// We need to override this function so we can avoid
-/// the call to EmitFunctionHeader(), which the MCPureStreamer can't handle.
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SetupMachineFunction(MF);
- if (OutStreamer.hasRawTextSupport()) {
- OutStreamer.EmitRawText("@" + MF.getName() + ":");
- }
+
+ OutStreamer.emitRawComment(Twine('@') + MF.getName() + Twine(':'));
MCContext &Context = getObjFileLowering().getContext();
const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
ELF::SHT_PROGBITS, 0,
SectionKind::getReadOnly());
OutStreamer.SwitchSection(ConfigSection);
+
const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+ SIProgramInfo KernelInfo;
if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
- EmitProgramInfoSI(MF);
+ findNumUsedRegistersSI(MF, KernelInfo.NumSGPR, KernelInfo.NumVGPR);
+ EmitProgramInfoSI(MF, KernelInfo);
} else {
EmitProgramInfoR600(MF);
}
@@ -79,6 +77,26 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
EmitFunctionBody();
+ if (isVerbose()) {
+ const MCSectionELF *CommentSection
+ = Context.getELFSection(".AMDGPU.csdata",
+ ELF::SHT_PROGBITS, 0,
+ SectionKind::getReadOnly());
+ OutStreamer.SwitchSection(CommentSection);
+
+ if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ OutStreamer.emitRawComment(" Kernel info:", false);
+ OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
+ false);
+ OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
+ false);
+ } else {
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+ OutStreamer.emitRawComment(
+ Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
+ }
+ }
+
if (STM.dumpCode()) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
MF.dump();
@@ -166,8 +184,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
}
}
-void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
- const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF,
+ unsigned &NumSGPR,
+ unsigned &NumVGPR) const {
unsigned MaxSGPR = 0;
unsigned MaxVGPR = 0;
bool VCCUsed = false;
@@ -184,16 +203,15 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
unsigned numOperands = MI.getNumOperands();
for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
MachineOperand &MO = MI.getOperand(op_idx);
- unsigned maxUsed;
unsigned width = 0;
bool isSGPR = false;
- unsigned reg;
- unsigned hwReg;
+
if (!MO.isReg()) {
continue;
}
- reg = MO.getReg();
- if (reg == AMDGPU::VCC) {
+ unsigned reg = MO.getReg();
+ if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO ||
+ reg == AMDGPU::VCC_HI) {
VCCUsed = true;
continue;
}
@@ -240,10 +258,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
isSGPR = false;
width = 16;
} else {
- assert(!"Unknown register class");
+ llvm_unreachable("Unknown register class");
}
- hwReg = RI->getEncodingValue(reg) & 0xff;
- maxUsed = hwReg + width - 1;
+ unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
+ unsigned maxUsed = hwReg + width - 1;
if (isSGPR) {
MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
} else {
@@ -252,10 +270,24 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
}
}
}
- if (VCCUsed) {
+
+ if (VCCUsed)
MaxSGPR += 2;
- }
- SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ NumSGPR = MaxSGPR;
+ NumVGPR = MaxVGPR;
+}
+
+void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &Out,
+ MachineFunction &MF) const {
+ findNumUsedRegistersSI(MF, Out.NumSGPR, Out.NumVGPR);
+}
+
+void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
+ const SIProgramInfo &KernelInfo) {
+ const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned RsrcReg;
switch (MFI->ShaderType) {
default: // Fall through
@@ -266,7 +298,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
}
OutStreamer.EmitIntValue(RsrcReg, 4);
- OutStreamer.EmitIntValue(S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8), 4);
+ OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
+ S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4);
unsigned LDSAlignShift;
if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index 05dc9bb..a2b8337 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -22,6 +22,22 @@
namespace llvm {
class AMDGPUAsmPrinter : public AsmPrinter {
+private:
+ struct SIProgramInfo {
+ SIProgramInfo() : NumSGPR(0), NumVGPR(0) {}
+ unsigned NumSGPR;
+ unsigned NumVGPR;
+ };
+
+ void getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const;
+ void findNumUsedRegistersSI(MachineFunction &MF,
+ unsigned &NumSGPR,
+ unsigned &NumVGPR) const;
+
+ /// \brief Emit register usage information so that the GPU driver
+  /// can correctly set up the GPU state.
+ void EmitProgramInfoR600(MachineFunction &MF);
+ void EmitProgramInfoSI(MachineFunction &MF, const SIProgramInfo &KernelInfo);
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
@@ -32,11 +48,6 @@ public:
return "AMDGPU Assembly Printer";
}
- /// \brief Emit register usage information so that the GPU driver
- /// can correctly setup the GPU state.
- void EmitProgramInfoR600(MachineFunction &MF);
- void EmitProgramInfoSI(MachineFunction &MF);
-
/// Implemented in AMDGPUMCInstLower.cpp
virtual void EmitInstruction(const MachineInstr *MI);
diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
index 40f14d2..0325a00 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.cpp
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -74,14 +74,24 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
int FI) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
- unsigned Offset = 0;
+  // Start the offset two stack slots in (2 * stack width * 4 bytes) so we
+  // don't overwrite work group information.
+ // XXX: We should only do this when the shader actually uses this
+ // information.
+ unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
- unsigned Size = MFI->getObjectSize(i);
- Offset += (Size / (getStackWidth(MF) * 4));
+ OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i));
+ OffsetBytes += MFI->getObjectSize(i);
+    // Each register holds 4 bytes, so we must always align the offset to at
+ // least 4 bytes, so that 2 frame objects won't share the same register.
+ OffsetBytes = RoundUpToAlignment(OffsetBytes, 4);
}
- return Offset;
+
+ if (FI != -1)
+ OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI));
+
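+  // Convert the byte offset into 4-byte register slots. For illustration
+  // (assuming a stack width of 1): the 8 reserved bytes plus a single 4-byte
+  // object at index 0 place frame index 1 at OffsetBytes == 12, i.e. register
+  // slot 3.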
+ return OffsetBytes / (getStackWidth(MF) * 4);
}
const TargetFrameLowering::SpillSlot *
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index a989135..e8c5f5b 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -16,12 +16,12 @@
#include "AMDGPURegisterInfo.h"
#include "R600InstrInfo.h"
#include "SIISelLowering.h"
-#include "llvm/ADT/ValueMap.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/ValueMap.h"
#include "llvm/Support/Compiler.h"
#include <list>
#include <queue>
@@ -200,6 +200,54 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
switch (Opc) {
default: break;
+  // We are selecting i64 ADD here instead of custom lowering it during
+ // DAG legalization, so we can fold some i64 ADDs used for address
+ // calculation into the LOAD and STORE instructions.
+ case ISD::ADD: {
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+ if (N->getValueType(0) != MVT::i64 ||
+ ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ break;
+
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
+ SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
+
+ SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, LHS, Sub0);
+ SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, LHS, Sub1);
+
+ SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, RHS, Sub0);
+ SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, RHS, Sub1);
+
+ SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
+
+ SmallVector<SDValue, 8> AddLoArgs;
+ AddLoArgs.push_back(SDValue(Lo0, 0));
+ AddLoArgs.push_back(SDValue(Lo1, 0));
+
+ SDNode *AddLo = CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL,
+ VTList, AddLoArgs);
+ SDValue Carry = SDValue(AddLo, 1);
+ SDNode *AddHi = CurDAG->getMachineNode(AMDGPU::S_ADDC_U32, DL,
+ MVT::i32, SDValue(Hi0, 0),
+ SDValue(Hi1, 0), Carry);
+
+ SDValue Args[5] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+ SDValue(AddLo,0),
+ Sub0,
+ SDValue(AddHi,0),
+ Sub1,
+ };
+ return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5);
+ }
case ISD::BUILD_VECTOR: {
unsigned RegClassID;
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index c4d75ff..183725c 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -21,6 +21,7 @@
#include "AMDILIntrinsicInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -32,8 +33,9 @@ using namespace llvm;
static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
- unsigned Offset = State.AllocateStack(ValVT.getSizeInBits() / 8, ArgFlags.getOrigAlign());
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
+ ArgFlags.getOrigAlign());
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
return true;
}
@@ -43,6 +45,8 @@ static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+ Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
+
// Initialize target lowering borrowed from AMDIL
InitAMDILLowering();
@@ -59,6 +63,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FRINT, MVT::f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
// The hardware supports ROTR, but not ROTL
setOperationAction(ISD::ROTL, MVT::i32, Expand);
@@ -93,10 +98,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+
  // XXX: This can be changed to Custom, once ExpandVectorStores can
// handle 64-bit stores.
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
+
+
setOperationAction(ISD::LOAD, MVT::f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
@@ -117,8 +128,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
@@ -133,6 +150,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+
setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
@@ -168,6 +187,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::XOR, VT, Expand);
}
@@ -182,12 +202,31 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FADD, VT, Expand);
setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
+ setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FMUL, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FSQRT, VT, Expand);
setOperationAction(ISD::FSUB, VT, Expand);
+ setOperationAction(ISD::SELECT, VT, Expand);
}
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
}
//===----------------------------------------------------------------------===//
@@ -225,6 +264,43 @@ bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
return VT == MVT::f32;
}
+bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
+ // Truncate is just accessing a subregister.
+ return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
+}
+
+bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
+ // Truncate is just accessing a subregister.
+ return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
+ (Dest->getPrimitiveSizeInBits() % 32 == 0);
+}
+
+bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
+ const DataLayout *DL = getDataLayout();
+ unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
+ unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());
+
+ return SrcSize == 32 && DestSize == 64;
+}
+
+bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
+ // Any register load of a 64-bit value really requires 2 32-bit moves. For all
+ // practical purposes, the extra mov 0 to load a 64-bit value is free. As used,
+ // this will enable reducing 64-bit operations to 32-bit, which is always
+ // good.
+ return Src == MVT::i32 && Dest == MVT::i64;
+}
+
+bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+ // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
+ // limited number of native 64-bit operations. Shrinking an operation to fit
+ // in a single 32-bit register should always be helpful. As currently used,
+ // this is much less general than the name suggests, and is only used in
+ // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
+ // not profitable, and may actually be harmful.
+ return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+}
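A minimal sketch of the register model these hooks assume, treating a 64-bit value as a pair of 32-bit registers (the struct and function names are illustrative, not from the source):

#include <cstdint>

// A 64-bit value occupies a pair of 32-bit registers.
struct Reg64 { uint32_t Lo; uint32_t Hi; };

// zext i32 -> i64 is "free": a single mov of 0 into the high register.
inline Reg64 ZExt32To64(uint32_t V) { return {V, 0u}; }

// trunc i64 -> i32 is "free": read only the low subregister.
inline uint32_t Trunc64To32(Reg64 V) { return V.Lo; }

Narrowing a wider-than-32-bit operation back down to 32 bits keeps it in a single register, which is why the narrowing hook only reports 32-bit destinations as profitable.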
+
//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//
@@ -254,8 +330,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
switch (Op.getOpcode()) {
default:
Op.getNode()->dump();
- assert(0 && "Custom lowering code for this"
- "instruction is not implemented yet!");
+ llvm_unreachable("Custom lowering code for this"
+ "instruction is not implemented yet!");
break;
// AMDIL DAG lowering
case ISD::SDIV: return LowerSDIV(Op, DAG);
@@ -273,32 +349,124 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
return Op;
}
+void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ switch (N->getOpcode()) {
+ case ISD::SIGN_EXTEND_INREG:
+ // Different parts of legalization seem to interpret which type of
+ // sign_extend_inreg is the one to check for custom lowering. The extended
+ // from type is what really matters, but some places check for custom
+ // lowering of the result type. This results in trying to use
+ // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
+ // nothing here and let the illegal result integer be handled normally.
+ return;
+
+ default:
+ return;
+ }
+}
+
+SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
+ const GlobalValue *GV,
+ const SDValue &InitPtr,
+ SDValue Chain,
+ SelectionDAG &DAG) const {
+ const DataLayout *TD = getTargetMachine().getDataLayout();
+ SDLoc DL(InitPtr);
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
+ EVT VT = EVT::getEVT(CI->getType());
+ PointerType *PtrTy = PointerType::get(CI->getType(), 0);
+ return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
+ MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+ TD->getPrefTypeAlignment(CI->getType()));
+ } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
+ EVT VT = EVT::getEVT(CFP->getType());
+ PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
+ return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
+ MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+ TD->getPrefTypeAlignment(CFP->getType()));
+ } else if (Init->getType()->isAggregateType()) {
+ EVT PtrVT = InitPtr.getValueType();
+ unsigned NumElements = Init->getType()->getArrayNumElements();
+ SmallVector<SDValue, 8> Chains;
+ for (unsigned i = 0; i < NumElements; ++i) {
+ SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize(
+ Init->getType()->getArrayElementType()), PtrVT);
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
+ Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i),
+ GV, Ptr, Chain, DAG));
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
+ Chains.size());
+ } else {
+ Init->dump();
+ llvm_unreachable("Unhandled constant initializer");
+ }
+}
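As a concrete illustration of the recursion above: an initializer of type [4 x i32] placed at InitPtr is decomposed into four scalar i32 stores at byte offsets 0, 4, 8 and 12 (i * getTypeAllocSize(i32)), and the resulting store chains are then joined by a single TokenFactor node.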
+
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue Op,
SelectionDAG &DAG) const {
const DataLayout *TD = getTargetMachine().getDataLayout();
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = G->getGlobal();
- assert(G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS);
- // XXX: What does the value of G->getOffset() mean?
- assert(G->getOffset() == 0 &&
+ switch (G->getAddressSpace()) {
+ default: llvm_unreachable("Global Address lowering not implemented for this "
+ "address space");
+ case AMDGPUAS::LOCAL_ADDRESS: {
+ // XXX: What does the value of G->getOffset() mean?
+ assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");
- const GlobalValue *GV = G->getGlobal();
+ unsigned Offset;
+ if (MFI->LocalMemoryObjects.count(GV) == 0) {
+ uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+ Offset = MFI->LDSSize;
+ MFI->LocalMemoryObjects[GV] = Offset;
+ // XXX: Account for alignment?
+ MFI->LDSSize += Size;
+ } else {
+ Offset = MFI->LocalMemoryObjects[GV];
+ }
- unsigned Offset;
- if (MFI->LocalMemoryObjects.count(GV) == 0) {
- uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
- Offset = MFI->LDSSize;
- MFI->LocalMemoryObjects[GV] = Offset;
- // XXX: Account for alignment?
- MFI->LDSSize += Size;
- } else {
- Offset = MFI->LocalMemoryObjects[GV];
+ return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
+ }
+ case AMDGPUAS::CONSTANT_ADDRESS: {
+ MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+ Type *EltType = GV->getType()->getElementType();
+ unsigned Size = TD->getTypeAllocSize(EltType);
+ unsigned Alignment = TD->getPrefTypeAlignment(EltType);
+
+ const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV);
+ const Constant *Init = Var->getInitializer();
+ int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
+ SDValue InitPtr = DAG.getFrameIndex(FI,
+ getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+ SmallVector<SDNode*, 8> WorkList;
+
+ for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
+ E = DAG.getEntryNode()->use_end(); I != E; ++I) {
+ if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD)
+ continue;
+ WorkList.push_back(*I);
+ }
+ SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
+ for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
+ E = WorkList.end(); I != E; ++I) {
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
+ Ops.push_back((*I)->getOperand(i));
+ }
+ DAG.UpdateNodeOperands(*I, &Ops[0], Ops.size());
+ }
+ return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op),
+ getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+ }
}
-
- return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
}
void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
@@ -391,6 +559,30 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case AMDGPUIntrinsic::AMDGPU_umin:
return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
Op.getOperand(2));
+
+ case AMDGPUIntrinsic::AMDGPU_bfe_i32:
+ return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
+ Op.getOperand(1),
+ Op.getOperand(2),
+ Op.getOperand(3));
+
+ case AMDGPUIntrinsic::AMDGPU_bfe_u32:
+ return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
+ Op.getOperand(1),
+ Op.getOperand(2),
+ Op.getOperand(3));
+
+ case AMDGPUIntrinsic::AMDGPU_bfi:
+ return DAG.getNode(AMDGPUISD::BFI, DL, VT,
+ Op.getOperand(1),
+ Op.getOperand(2),
+ Op.getOperand(3));
+
+ case AMDGPUIntrinsic::AMDGPU_bfm:
+ return DAG.getNode(AMDGPUISD::BFM, DL, VT,
+ Op.getOperand(1),
+ Op.getOperand(2));
+
case AMDGPUIntrinsic::AMDIL_round_nearest:
return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
}
@@ -455,7 +647,7 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
case ISD::SETTRUE2:
case ISD::SETUO:
case ISD::SETO:
- assert(0 && "Operation should already be optimised !");
+ llvm_unreachable("Operation should already be optimised!");
case ISD::SETULE:
case ISD::SETULT:
case ISD::SETOLE:
@@ -479,7 +671,7 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
}
case ISD::SETCC_INVALID:
- assert(0 && "Invalid setcc condcode !");
+ llvm_unreachable("Invalid setcc condcode!");
}
return Op;
}
@@ -503,8 +695,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
Load->getAlignment()));
}
- return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), &Loads[0],
- Loads.size());
+ return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(),
+ Loads.data(), Loads.size());
}
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
@@ -513,9 +705,9 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
EVT MemVT = Store->getMemoryVT();
unsigned MemBits = MemVT.getSizeInBits();
- // Byte stores are really expensive, so if possible, try to pack
- // 32-bit vector truncatating store into an i32 store.
- // XXX: We could also handle optimize other vector bitwidths
+ // Byte stores are really expensive, so if possible, try to pack a 32-bit vector
+ // truncating store into an i32 store.
+ // XXX: We could also optimize other vector bitwidths.
if (!MemVT.isVector() || MemBits > 32) {
return SDValue();
}
@@ -528,17 +720,8 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
unsigned MemEltBits = MemEltVT.getSizeInBits();
unsigned MemNumElements = MemVT.getVectorNumElements();
EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
- SDValue Mask;
- switch(MemEltBits) {
- case 8:
- Mask = DAG.getConstant(0xFF, PackedVT);
- break;
- case 16:
- Mask = DAG.getConstant(0xFFFF, PackedVT);
- break;
- default:
- llvm_unreachable("Cannot lower this vector store");
- }
+ SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, PackedVT);
+
SDValue PackedValue;
for (unsigned i = 0; i < MemNumElements; ++i) {
EVT ElemVT = VT.getVectorElementType();
@@ -586,18 +769,120 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
}
+SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ ISD::LoadExtType ExtType = Load->getExtensionType();
+ EVT VT = Op.getValueType();
+ EVT MemVT = Load->getMemoryVT();
+
+ if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
+ // We can do the extload to 32-bits, and then need to separately extend to
+ // 64-bits.
+
+ SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
+ Load->getChain(),
+ Load->getBasePtr(),
+ MemVT,
+ Load->getMemOperand());
+ return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
+ }
+
+ // Lower constant address space global variable loads.
+ if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ isa<GlobalVariable>(GetUnderlyingObject(Load->getPointerInfo().V))) {
+
+ SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
+ getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+ Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
+ DAG.getConstant(2, MVT::i32));
+ return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+ Load->getChain(), Ptr,
+ DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
+ }
+
+ if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
+ ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
+ return SDValue();
+
+
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
+ DAG.getConstant(2, MVT::i32));
+ SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+ Load->getChain(), Ptr,
+ DAG.getTargetConstant(0, MVT::i32),
+ Op.getOperand(2));
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
+ Load->getBasePtr(),
+ DAG.getConstant(0x3, MVT::i32));
+ SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+ DAG.getConstant(3, MVT::i32));
+
+ Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+
+ EVT MemEltVT = MemVT.getScalarType();
+ if (ExtType == ISD::SEXTLOAD) {
+ SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
+ }
+
+ return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
+}
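A rough scalar model of the sub-dword private load emulated above, assuming byte 0 is the least significant byte of its 32-bit register (RegFile and the function name are illustrative):

#include <cstdint>

uint32_t EmulateByteLoad(const uint32_t *RegFile, uint32_t ByteAddr) {
  uint32_t Word  = RegFile[ByteAddr >> 2]; // REGISTER_LOAD of the containing dword
  uint32_t Shift = (ByteAddr & 0x3) * 8;   // SHL of the byte index by 3
  return (Word >> Shift) & 0xff;           // shift down, then zero- or sign-extend in reg
}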
+
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
if (Result.getNode()) {
return Result;
}
StoreSDNode *Store = cast<StoreSDNode>(Op);
+ SDValue Chain = Store->getChain();
if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
Store->getValue().getValueType().isVector()) {
return SplitVectorStore(Op, DAG);
}
+
+ EVT MemVT = Store->getMemoryVT();
+ if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+ MemVT.bitsLT(MVT::i32)) {
+ unsigned Mask = 0;
+ if (Store->getMemoryVT() == MVT::i8) {
+ Mask = 0xff;
+ } else if (Store->getMemoryVT() == MVT::i16) {
+ Mask = 0xffff;
+ }
+ SDValue BasePtr = Store->getBasePtr();
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
+ DAG.getConstant(2, MVT::i32));
+ SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));
+
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+ DAG.getConstant(0x3, MVT::i32));
+
+ SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+ DAG.getConstant(3, MVT::i32));
+
+ SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+ Store->getValue());
+
+ SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+
+ SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ MaskedValue, ShiftAmt);
+
+ SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32),
+ ShiftAmt);
+ DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+ DAG.getConstant(0xffffffff, MVT::i32));
+ Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+ SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+ return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+ Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
+ }
return SDValue();
}
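The matching read-modify-write sequence for the sub-dword private store above, under the same byte-ordering assumption (names are illustrative):

#include <cstdint>

void EmulateByteStore(uint32_t *RegFile, uint32_t ByteAddr, uint8_t Val) {
  uint32_t &Word = RegFile[ByteAddr >> 2];          // REGISTER_LOAD of the dword
  uint32_t Shift = (ByteAddr & 0x3) * 8;
  uint32_t Mask  = 0xffu << Shift;                  // bits being replaced
  Word = (Word & ~Mask) | (uint32_t(Val) << Shift); // merged value, REGISTER_STORE
}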
@@ -726,6 +1011,101 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
}
+SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
+ unsigned BitsDiff,
+ SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc DL(Op);
+ SDValue Shift = DAG.getConstant(BitsDiff, VT);
+ // Shift left by 'Shift' bits.
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
+ // Signed shift Right by 'Shift' bits.
+ return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
+}
+
+SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ MVT VT = Op.getSimpleValueType();
+ MVT ScalarVT = VT.getScalarType();
+
+ unsigned SrcBits = ExtraVT.getScalarType().getSizeInBits();
+ unsigned DestBits = ScalarVT.getSizeInBits();
+ unsigned BitsDiff = DestBits - SrcBits;
+
+ if (!Subtarget->hasBFE())
+ return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
+
+ SDValue Src = Op.getOperand(0);
+ if (VT.isVector()) {
+ SDLoc DL(Op);
+ // Need to scalarize this, and revisit each of the scalars later.
+ // TODO: Don't scalarize on Evergreen?
+ unsigned NElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 8> Args;
+ ExtractVectorElements(Src, DAG, Args, 0, NElts);
+
+ SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
+ for (unsigned I = 0; I < NElts; ++I)
+ Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
+
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args.data(), Args.size());
+ }
+
+ if (SrcBits == 32) {
+ SDLoc DL(Op);
+
+ // If the source is 32-bits, this is really half of a 2-register pair, and
+ // we need to discard the unused half of the pair.
+ SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, TruncSrc);
+ }
+
+ unsigned NElts = VT.isVector() ? VT.getVectorNumElements() : 1;
+
+ // TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it
+ // might not be worth the effort, and will need to expand to shifts when
+ // fixing SGPR copies.
+ if (SrcBits < 32 && DestBits <= 32) {
+ SDLoc DL(Op);
+ MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts);
+
+ if (DestBits != 32)
+ Src = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Src);
+
+ // FIXME: This should use TargetConstant, but that hits assertions for
+ // Evergreen.
+ SDValue Ext = DAG.getNode(AMDGPUISD::BFE_I32, DL, ExtVT,
+ Op.getOperand(0), // Operand
+ DAG.getConstant(0, ExtVT), // Offset
+ DAG.getConstant(SrcBits, ExtVT)); // Width
+
+ // Truncate to the original type if necessary.
+ if (ScalarVT == MVT::i32)
+ return Ext;
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Ext);
+ }
+
+ // For small types, extend to 32-bits first.
+ if (SrcBits < 32) {
+ SDLoc DL(Op);
+ MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts);
+
+ SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, ExtVT, Src);
+ SDValue Ext32 = DAG.getNode(AMDGPUISD::BFE_I32,
+ DL,
+ ExtVT,
+ TruncSrc, // Operand
+ DAG.getConstant(0, ExtVT), // Offset
+ DAG.getConstant(SrcBits, ExtVT)); // Width
+
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Ext32);
+ }
+
+ // For everything else, use the standard bitshift expansion.
+ return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
+}
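A scalar sketch of the shift-based expansion above; on subtargets with hasBFE() the same extraction is instead a single BFE_I32 with offset 0 and width SrcBits (the function name is illustrative):

#include <cstdint>

int32_t SExtInReg32(uint32_t X, unsigned SrcBits) {
  unsigned Diff = 32 - SrcBits;      // BitsDiff
  return int32_t(X << Diff) >> Diff; // SHL then arithmetic SRA
}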
+
//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//
@@ -818,7 +1198,12 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMIN)
NODE_NAME_CASE(SMIN)
NODE_NAME_CASE(UMIN)
+ NODE_NAME_CASE(BFE_U32)
+ NODE_NAME_CASE(BFE_I32)
+ NODE_NAME_CASE(BFI)
+ NODE_NAME_CASE(BFM)
NODE_NAME_CASE(URECIP)
+ NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
@@ -833,3 +1218,56 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
}
}
+
+static void computeMaskedBitsForMinMax(const SDValue Op0,
+ const SDValue Op1,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) {
+ APInt Op0Zero, Op0One;
+ APInt Op1Zero, Op1One;
+ DAG.ComputeMaskedBits(Op0, Op0Zero, Op0One, Depth);
+ DAG.ComputeMaskedBits(Op1, Op1Zero, Op1One, Depth);
+
+ KnownZero = Op0Zero & Op1Zero;
+ KnownOne = Op0One & Op1One;
+}
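For intuition, a small worked example of the intersection above: min/max always returns one of its two operands, so a bit is known in the result only if it is known, with the same value, in both inputs. If Op0 is known to match 0b10?0 and Op1 is known to match 0b1??0 (? = unknown), the result is known to match 0b1??0; only bit 3 remains in KnownOne and only bit 0 remains in KnownZero.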
+
+void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
+ const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+
+ KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case ISD::INTRINSIC_WO_CHAIN: {
+ // FIXME: The intrinsic should just use the node.
+ switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
+ case AMDGPUIntrinsic::AMDGPU_imax:
+ case AMDGPUIntrinsic::AMDGPU_umax:
+ case AMDGPUIntrinsic::AMDGPU_imin:
+ case AMDGPUIntrinsic::AMDGPU_umin:
+ computeMaskedBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
+ KnownZero, KnownOne, DAG, Depth);
+ break;
+ default:
+ break;
+ }
+
+ break;
+ }
+ case AMDGPUISD::SMAX:
+ case AMDGPUISD::UMAX:
+ case AMDGPUISD::SMIN:
+ case AMDGPUISD::UMIN:
+ computeMaskedBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
+ KnownZero, KnownOne, DAG, Depth);
+ break;
+ default:
+ break;
+ }
+}
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 2dfd3cf..a019616 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -21,13 +21,21 @@
namespace llvm {
class AMDGPUMachineFunction;
+class AMDGPUSubtarget;
class MachineRegisterInfo;
class AMDGPUTargetLowering : public TargetLowering {
+protected:
+ const AMDGPUSubtarget *Subtarget;
+
private:
void ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Args,
unsigned Start, unsigned Count) const;
+ SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV,
+ const SDValue &InitPtr,
+ SDValue Chain,
+ SelectionDAG &DAG) const;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
@@ -54,6 +62,7 @@ protected:
/// \brief Split a vector load into multiple scalar loads.
SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;
@@ -74,10 +83,18 @@ protected:
public:
AMDGPUTargetLowering(TargetMachine &TM);
- virtual bool isFAbsFree(EVT VT) const;
- virtual bool isFNegFree(EVT VT) const;
- virtual MVT getVectorIdxTy() const;
- virtual bool isLoadBitCastBeneficial(EVT, EVT) const LLVM_OVERRIDE;
+ virtual bool isFAbsFree(EVT VT) const override;
+ virtual bool isFNegFree(EVT VT) const override;
+ virtual bool isTruncateFree(EVT Src, EVT Dest) const override;
+ virtual bool isTruncateFree(Type *Src, Type *Dest) const override;
+
+ virtual bool isZExtFree(Type *Src, Type *Dest) const override;
+ virtual bool isZExtFree(EVT Src, EVT Dest) const override;
+
+ virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
+ virtual MVT getVectorIdxTy() const override;
+ virtual bool isLoadBitCastBeneficial(EVT, EVT) const override;
virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -90,6 +107,10 @@ public:
}
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+ virtual void ReplaceNodeResults(SDNode * N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
@@ -99,9 +120,6 @@ public:
return N;
}
-// Functions defined in AMDILISelLowering.cpp
-public:
-
/// \brief Determine which of the bits specified in \p Mask are known to be
/// either zero or one and return them in the \p KnownZero and \p KnownOne
/// bitsets.
@@ -109,8 +127,10 @@ public:
APInt &KnownZero,
APInt &KnownOne,
const SelectionDAG &DAG,
- unsigned Depth = 0) const;
+ unsigned Depth = 0) const override;
+// Functions defined in AMDILISelLowering.cpp
+public:
virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I, unsigned Intrinsic) const;
@@ -131,6 +151,10 @@ private:
SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue ExpandSIGN_EXTEND_INREG(SDValue Op,
+ unsigned BitsDiff,
+ SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -160,6 +184,10 @@ enum {
UMIN,
URECIP,
DOT4,
+ BFE_U32, // Extract range of bits with zero extension to 32-bits.
+ BFE_I32, // Extract range of bits with sign extension to 32-bits.
+ BFI, // (src0 & src1) | (~src0 & src2)
+ BFM, // Insert a range of bits into a 32-bit word.
TEXTURE_FETCH,
EXPORT,
CONST_ADDRESS,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index 4f7084b..e32dd9f 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -110,7 +110,7 @@ AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- assert(!"Not Implemented");
+ llvm_unreachable("Not Implemented");
}
void
@@ -119,22 +119,21 @@ AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- assert(!"Not Implemented");
+ llvm_unreachable("Not Implemented");
}
bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const {
MachineBasicBlock *MBB = MI->getParent();
- int OffsetOpIdx =
- AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::addr);
+ int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::addr);
// addr is a custom operand with multiple MI operands, and only the
// first MI operand is given a name.
int RegOpIdx = OffsetOpIdx + 1;
- int ChanOpIdx =
- AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::chan);
-
+ int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::chan);
if (isRegisterLoad(*MI)) {
- int DstOpIdx =
- AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+ int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::dst);
unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
@@ -147,8 +146,8 @@ bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const
Address, OffsetReg);
}
} else if (isRegisterStore(*MI)) {
- int ValOpIdx =
- AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::val);
+ int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::val);
AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index ce5b58c..426910c 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -78,18 +78,18 @@ public:
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const = 0;
- void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
- void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
protected:
MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index fccede0..69d8059 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -86,3 +86,9 @@ def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
def AMDGPUround : SDNode<"ISD::FROUND",
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
+
+def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
+
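A hypothetical worked example of the new bitfield nodes, assuming BFM takes (width, offset) and using the BFI formula given in the enum comment above:

BFM(8, 4) = ((1 << 8) - 1) << 4 = 0x00000FF0
BFI(0x00000FF0, 0xAAAAAAAA, 0x11111111)
  = (0x00000FF0 & 0xAAAAAAAA) | (~0x00000FF0 & 0x11111111)
  = 0x00000AA0 | 0x11111001 = 0x11111AA1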
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 3c5375d..505fc81 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -322,7 +322,7 @@ class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul>
/* --------------------- */
/* Extract element pattern */
-class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
+class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
SubRegIndex sub_reg>
: Pat<
(sub_type (vector_extract vec_type:$src, sub_idx)),
@@ -337,12 +337,6 @@ class Insert_Element <ValueType elem_type, ValueType vec_type,
(INSERT_SUBREG $vec, $elem, sub_reg)
>;
-class Vector4_Build <ValueType vecType, ValueType elemType> : Pat <
- (vecType (build_vector elemType:$x, elemType:$y, elemType:$z, elemType:$w)),
- (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
- (vecType (IMPLICIT_DEF)), $x, sub0), $y, sub1), $z, sub2), $w, sub3)
->;
-
// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
// can handle COPY instructions.
// bitconvert pattern
@@ -388,6 +382,11 @@ class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat <
// Bitfield extract patterns
+/*
+
+XXX: The BFE pattern is not working correctly because the XForm is not being
+applied.
+
def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>;
def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}],
SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>;
@@ -397,6 +396,8 @@ class BFEPattern <Instruction BFE> : Pat <
(BFE $x, $y, $z)
>;
+*/
+
// rotr pattern
class ROTRPattern <Instruction BIT_ALIGN> : Pat <
(rotr i32:$src0, i32:$src1),
@@ -414,6 +415,9 @@ class UMUL24Pattern <Instruction UMUL24> : Pat <
*/
include "R600Instructions.td"
+include "R700Instructions.td"
+include "EvergreenInstructions.td"
+include "CaymanInstructions.td"
include "SIInstrInfo.td"
diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td
index 9f975bf..c6521d0 100644
--- a/lib/Target/R600/AMDGPUIntrinsics.td
+++ b/lib/Target/R600/AMDGPUIntrinsics.td
@@ -50,7 +50,10 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-
+ def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_barrier_local : Intrinsic<[], [], []>;
}
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index 0ed598e..2c9909f 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -69,6 +69,13 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
AMDGPUMCInstLower MCInstLowering(OutContext);
+#ifdef _DEBUG
+ StringRef Err;
+ if (!TM.getInstrInfo()->verifyInstruction(MI, Err)) {
+ errs() << "Warning: Illegal instruction detected: " << Err << "\n";
+ MI->dump();
+ }
+#endif
if (MI->isBundle()) {
const MachineBasicBlock *MBB = MI->getParent();
MachineBasicBlock::const_instr_iterator I = MI;
@@ -80,7 +87,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
} else {
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
if (DisasmEnabled) {
// Disassemble instruction/operands to text.
@@ -99,7 +106,8 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCObjectStreamer &ObjStreamer = (MCObjectStreamer &)OutStreamer;
MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
- InstEmitter.EncodeInstruction(TmpInst, CodeStream, Fixups);
+ InstEmitter.EncodeInstruction(TmpInst, CodeStream, Fixups,
+ TM.getSubtarget<MCSubtargetInfo>());
CodeStream.flush();
HexLines.resize(HexLines.size() + 1);
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index 47617a7..8fbec4e 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -38,7 +38,7 @@ void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const {
- assert(!"Subroutines not supported yet");
+ llvm_unreachable("Subroutines not supported yet");
}
unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index 061793a..e77ab5e 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -38,6 +38,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
CaymanISA = false;
EnableIRStructurizer = true;
EnableIfCvt = true;
+ WavefrontSize = 0;
+ CFALUBug = false;
ParseSubtargetFeatures(GPU, FS);
DevName = GPU;
}
@@ -74,13 +76,39 @@ bool
AMDGPUSubtarget::isIfCvtEnabled() const {
return EnableIfCvt;
}
+unsigned
+AMDGPUSubtarget::getWavefrontSize() const {
+ return WavefrontSize;
+}
+unsigned
+AMDGPUSubtarget::getStackEntrySize() const {
+ assert(getGeneration() <= NORTHERN_ISLANDS);
+ switch(getWavefrontSize()) {
+ case 16:
+ return 8;
+ case 32:
+ if (hasCaymanISA())
+ return 4;
+ else
+ return 8;
+ case 64:
+ return 4;
+ default:
+ llvm_unreachable("Illegal wavefront size.");
+ }
+}
+bool
+AMDGPUSubtarget::hasCFAluBug() const {
+ assert(getGeneration() <= NORTHERN_ISLANDS);
+ return CFALUBug;
+}
bool
AMDGPUSubtarget::isTargetELF() const {
return false;
}
size_t
AMDGPUSubtarget::getDefaultSize(uint32_t dim) const {
- if (dim > 3) {
+ if (dim > 2) {
return 1;
} else {
return DefaultSize[dim];
@@ -88,33 +116,6 @@ AMDGPUSubtarget::getDefaultSize(uint32_t dim) const {
}
std::string
-AMDGPUSubtarget::getDataLayout() const {
- std::string DataLayout = std::string(
- "e"
- "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
- "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128"
- "-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048"
- "-n32:64"
- );
-
- if (hasHWFP64()) {
- DataLayout.append("-f64:64:64");
- }
-
- if (is64bit()) {
- DataLayout.append("-p:64:64:64");
- } else {
- DataLayout.append("-p:32:32:32");
- }
-
- if (Gen >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- DataLayout.append("-p3:32:32:32");
- }
-
- return DataLayout;
-}
-
-std::string
AMDGPUSubtarget::getDeviceName() const {
return DevName;
}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 4288d27..8874d14 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -51,6 +51,8 @@ private:
bool CaymanISA;
bool EnableIRStructurizer;
bool EnableIfCvt;
+ unsigned WavefrontSize;
+ bool CFALUBug;
InstrItineraryData InstrItins;
@@ -66,8 +68,20 @@ public:
enum Generation getGeneration() const;
bool hasHWFP64() const;
bool hasCaymanISA() const;
+
+ bool hasBFE() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
+ bool hasBFM() const {
+ return hasBFE();
+ }
+
bool IsIRStructurizerEnabled() const;
bool isIfCvtEnabled() const;
+ unsigned getWavefrontSize() const;
+ unsigned getStackEntrySize() const;
+ bool hasCFAluBug() const;
virtual bool enableMachineScheduler() const {
return getGeneration() <= NORTHERN_ISLANDS;
@@ -75,7 +89,6 @@ public:
// Helper functions to simplify if statements
bool isTargetELF() const;
- std::string getDataLayout() const;
std::string getDeviceName() const;
virtual size_t getDefaultSize(uint32_t dim) const;
bool dumpCode() const { return DumpCode; }
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index bc4f5d7..b11fce3 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -21,10 +21,10 @@
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/Verifier.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/PassManager.h"
#include "llvm/Support/TargetRegistry.h"
@@ -42,13 +42,27 @@ extern "C" void LLVMInitializeR600Target() {
}
static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMI(C, new R600SchedStrategy());
+ return new ScheduleDAGMILive(C, new R600SchedStrategy());
}
static MachineSchedRegistry
SchedCustomRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
+static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
+ std::string Ret = "e-p:32:32";
+
+ if (ST.is64bit()) {
+ // 32-bit private, local, and region pointers. 64-bit global and constant.
+ Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64";
+ }
+
+ Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
+ "-v512:512-v1024:1024-v2048:2048-n32:64";
+
+ return Ret;
+}
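Concatenating the literals above, a 64-bit subtarget ends up with a data layout string of the following form (wrapped here for readability):

e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64
-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256
-v512:512-v1024:1024-v2048:2048-n32:64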
+
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
TargetOptions Options,
@@ -58,7 +72,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
:
LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
Subtarget(TT, CPU, FS),
- Layout(Subtarget.getDataLayout()),
+ Layout(computeDataLayout(Subtarget)),
FrameLowering(TargetFrameLowering::StackGrowsUp,
64 * 16 // Maximum stack alignment (long16)
, 0),
@@ -72,6 +86,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
InstrInfo.reset(new SIInstrInfo(*this));
TLInfo.reset(new SITargetLowering(*this));
}
+ setRequiresStructuredCFG(true);
initAsmInfo();
}
@@ -127,7 +142,7 @@ AMDGPUPassConfig::addPreISel() {
addPass(createFlattenCFGPass());
if (ST.IsIRStructurizerEnabled())
addPass(createStructurizeCFGPass());
- if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
addPass(createSinkingPass());
addPass(createSITypeRewriter());
addPass(createSIAnnotateControlFlowPass());
@@ -150,6 +165,9 @@ bool AMDGPUPassConfig::addPreRegAlloc() {
addPass(createR600VectorRegMerger(*TM));
} else {
addPass(createSIFixSGPRCopiesPass(*TM));
+ // SIFixSGPRCopies can generate a lot of duplicate instructions,
+ // so we need to run MachineCSE afterwards.
+ addPass(&MachineCSEID);
}
return false;
}
@@ -167,7 +185,7 @@ bool AMDGPUPassConfig::addPreSched2() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
- addPass(createR600EmitClauseMarkers(*TM));
+ addPass(createR600EmitClauseMarkers());
if (ST.isIfCvtEnabled())
addPass(&IfConverterID);
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
@@ -178,7 +196,7 @@ bool AMDGPUPassConfig::addPreSched2() {
bool AMDGPUPassConfig::addPreEmitPass() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- addPass(createAMDGPUCFGStructurizerPass(*TM));
+ addPass(createAMDGPUCFGStructurizerPass());
addPass(createR600ExpandSpecialInstrsPass(*TM));
addPass(&FinalizeMachineBundlesID);
addPass(createR600Packetizer(*TM));
diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
index 8db319c..51225eb 100644
--- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
@@ -18,10 +18,12 @@
#define DEBUG_TYPE "AMDGPUtti"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
using namespace llvm;
// Declare the pass initialization routine locally as target-specific passes
@@ -33,7 +35,7 @@ void initializeAMDGPUTTIPass(PassRegistry &);
namespace {
-class AMDGPUTTI : public ImmutablePass, public TargetTransformInfo {
+class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo {
const AMDGPUTargetMachine *TM;
const AMDGPUSubtarget *ST;
const AMDGPUTargetLowering *TLI;
@@ -53,11 +55,9 @@ public:
initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry());
}
- virtual void initializePass() { pushTTIStack(this); }
+ virtual void initializePass() override { pushTTIStack(this); }
- virtual void finalizePass() { popTTIStack(); }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
TargetTransformInfo::getAnalysisUsage(AU);
}
@@ -65,13 +65,15 @@ public:
static char ID;
/// Provide necessary pointer adjustments for the two base classes.
- virtual void *getAdjustedAnalysisPointer(const void *ID) {
+ virtual void *getAdjustedAnalysisPointer(const void *ID) override {
if (ID == &TargetTransformInfo::ID)
return (TargetTransformInfo *)this;
return this;
}
- virtual bool hasBranchDivergence() const;
+ virtual bool hasBranchDivergence() const override;
+
+ virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
/// @}
};
@@ -88,3 +90,32 @@ llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) {
}
bool AMDGPUTTI::hasBranchDivergence() const { return true; }
+
+void AMDGPUTTI::getUnrollingPreferences(Loop *L,
+ UnrollingPreferences &UP) const {
+ for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end();
+ BI != BE; ++BI) {
+ BasicBlock *BB = *BI;
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+ I != E; ++I) {
+ const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I);
+ if (!GEP)
+ continue;
+ const Value *Ptr = GEP->getPointerOperand();
+ const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr));
+ if (Alloca) {
+ // We want to do whatever we can to limit the number of alloca
+ // instructions that make it through to the code generator. allocas
+ // require us to use indirect addressing, which is slow and prone to
+ // compiler bugs. If this loop does an address calculation on an
+ // alloca ptr, then we want to use a higher than normal loop unroll
+ // threshold. This will give SROA a better chance to eliminate these
+ // allocas.
+ //
+ // Don't use the maximum allowed value here as it will make some
+ // programs way too big.
+ UP.Threshold = 500;
+ }
+ }
+ }
+}
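A sketch of the kind of loop this heuristic targets, in source form (a hypothetical function; 500 is the threshold value set above):

// Each iteration performs a GEP into the private array Tmp, so the underlying
// object of the pointer operand is an alloca and the loop gets the raised
// unroll threshold, giving SROA a chance to remove the alloca entirely.
float SumScaled(const float *In) {
  float Tmp[8];                  // lowered to an alloca in private memory
  for (int i = 0; i < 8; ++i)
    Tmp[i] = In[i] * 2.0f;       // address calculation on the alloca
  float Sum = 0.0f;
  for (int i = 0; i < 8; ++i)
    Sum += Tmp[i];
  return Sum;
}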
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp
index 507570f..21ca560 100644
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -13,14 +13,10 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "R600InstrInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Analysis/DominatorInternals.h"
-#include "llvm/Analysis/Dominators.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
@@ -30,6 +26,9 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
@@ -54,6 +53,10 @@ STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue "
STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks");
STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions");
+namespace llvm {
+ void initializeAMDGPUCFGStructurizerPass(PassRegistry&);
+}
+
//===----------------------------------------------------------------------===//
//
// Miscellaneous utility for CFGStructurizer.
@@ -131,13 +134,13 @@ public:
static char ID;
- AMDGPUCFGStructurizer(TargetMachine &tm) :
- MachineFunctionPass(ID), TM(tm),
- TII(static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
- TRI(&TII->getRegisterInfo()) { }
+ AMDGPUCFGStructurizer() :
+ MachineFunctionPass(ID), TII(NULL), TRI(NULL) {
+ initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
+ }
const char *getPassName() const {
- return "AMD IL Control Flow Graph structurizer Pass";
+ return "AMDGPU Control Flow Graph structurizer Pass";
}
void getAnalysisUsage(AnalysisUsage &AU) const {
@@ -157,6 +160,8 @@ public:
bool prepare();
bool runOnMachineFunction(MachineFunction &MF) {
+ TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+ TRI = &TII->getRegisterInfo();
DEBUG(MF.dump(););
OrderedBlks.clear();
FuncRep = &MF;
@@ -173,7 +178,6 @@ public:
}
protected:
- TargetMachine &TM;
MachineDominatorTree *MDT;
MachinePostDominatorTree *PDT;
MachineLoopInfo *MLI;
@@ -220,7 +224,7 @@ protected:
/// Compute the reversed DFS post order of Blocks
void orderBlocks(MachineFunction *MF);
- // Function originaly from CFGStructTraits
+ // Function originally from CFGStructTraits
void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
DebugLoc DL = DebugLoc());
MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
@@ -786,7 +790,7 @@ bool AMDGPUCFGStructurizer::prepare() {
bool AMDGPUCFGStructurizer::run() {
//Assume reducible CFG...
- DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n";FuncRep->viewCFG(););
+ DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
#ifdef STRESSTEST
//Use the worse block ordering to test the algorithm.
@@ -858,8 +862,7 @@ bool AMDGPUCFGStructurizer::run() {
ContNextScc = false;
DEBUG(
dbgs() << "repeat processing SCC" << getSCCNum(MBB)
- << "sccNumIter = " << SccNumIter << "\n";
- FuncRep->viewCFG();
+ << "sccNumIter = " << SccNumIter << '\n';
);
} else {
// Finish the current scc.
@@ -915,12 +918,10 @@ bool AMDGPUCFGStructurizer::run() {
BlockInfoMap.clear();
LLInfoMap.clear();
- DEBUG(
- FuncRep->viewCFG();
- );
-
- if (!Finish)
- llvm_unreachable("IRREDUCIBL_CF");
+ if (!Finish) {
+ DEBUG(FuncRep->viewCFG());
+ llvm_unreachable("IRREDUCIBLE_CFG");
+ }
return true;
}
@@ -930,8 +931,8 @@ bool AMDGPUCFGStructurizer::run() {
void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
int SccNum = 0;
MachineBasicBlock *MBB;
- for (scc_iterator<MachineFunction *> It = scc_begin(MF), E = scc_end(MF);
- It != E; ++It, ++SccNum) {
+ for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
+ ++It, ++SccNum) {
std::vector<MachineBasicBlock *> &SccNext = *It;
for (std::vector<MachineBasicBlock *>::const_iterator
blockIter = SccNext.begin(), blockEnd = SccNext.end();
@@ -1234,7 +1235,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
numClonedBlock += Num;
Num += serialPatternMatch(*HeadMBB->succ_begin());
- Num += serialPatternMatch(*llvm::next(HeadMBB->succ_begin()));
+ Num += serialPatternMatch(*std::next(HeadMBB->succ_begin()));
Num += ifPatternMatch(HeadMBB);
assert(Num > 0);
@@ -1763,7 +1764,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
if (MBB->succ_size() != 2)
return;
MachineBasicBlock *MBB1 = *MBB->succ_begin();
- MachineBasicBlock *MBB2 = *llvm::next(MBB->succ_begin());
+ MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin());
if (MBB1 != MBB2)
return;
@@ -1899,6 +1900,14 @@ char AMDGPUCFGStructurizer::ID = 0;
} // end anonymous namespace
-FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm) {
- return new AMDGPUCFGStructurizer(tm);
+INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
+ "AMDGPU CFG Structurizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
+ "AMDGPU CFG Structurizer", false, false)
+
+FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
+ return new AMDGPUCFGStructurizer();
}
diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp
index 970787e..0761ff4 100644
--- a/lib/Target/R600/AMDILISelLowering.cpp
+++ b/lib/Target/R600/AMDILISelLowering.cpp
@@ -94,9 +94,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
for (unsigned int x = 0; x < NumTypes; ++x) {
MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
- //FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types
- // We cannot sextinreg, expand to shifts
- setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
setOperationAction(ISD::SUBE, VT, Expand);
setOperationAction(ISD::SUBC, VT, Expand);
setOperationAction(ISD::ADDE, VT, Expand);
@@ -191,14 +188,12 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
setOperationAction(ISD::SUBC, MVT::Other, Expand);
setOperationAction(ISD::ADDE, MVT::Other, Expand);
setOperationAction(ISD::ADDC, MVT::Other, Expand);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BRIND, MVT::Other, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
// Use the default implementation.
@@ -248,41 +243,6 @@ AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
// be zero. Op is expected to be a target specific node. Used by DAG
// combiner.
-void
-AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
- const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth) const {
- APInt KnownZero2;
- APInt KnownOne2;
- KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything
- switch (Op.getOpcode()) {
- default: break;
- case ISD::SELECT_CC:
- DAG.ComputeMaskedBits(
- Op.getOperand(1),
- KnownZero,
- KnownOne,
- Depth + 1
- );
- DAG.ComputeMaskedBits(
- Op.getOperand(0),
- KnownZero2,
- KnownOne2
- );
- assert((KnownZero & KnownOne) == 0
- && "Bits known to be one AND zero?");
- assert((KnownZero2 & KnownOne2) == 0
- && "Bits known to be one AND zero?");
- // Only known if known in both the LHS and RHS
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
- break;
- };
-}
-
//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
@@ -322,36 +282,6 @@ AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
return DST;
}
-SDValue
-AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const {
- SDValue Data = Op.getOperand(0);
- VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
- SDLoc DL(Op);
- EVT DVT = Data.getValueType();
- EVT BVT = BaseType->getVT();
- unsigned baseBits = BVT.getScalarType().getSizeInBits();
- unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1;
- unsigned shiftBits = srcBits - baseBits;
- if (srcBits < 32) {
- // If the op is less than 32 bits, then it needs to extend to 32bits
- // so it can properly keep the upper bits valid.
- EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1);
- Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data);
- shiftBits = 32 - baseBits;
- DVT = IVT;
- }
- SDValue Shift = DAG.getConstant(shiftBits, DVT);
- // Shift left by 'Shift' bits.
- Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift);
- // Signed shift Right by 'Shift' bits.
- Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift);
- if (srcBits < 32) {
- // Once the sign extension is done, the op needs to be converted to
- // its original type.
- Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType());
- }
- return Data;
-}
EVT
AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const {
int iSize = (size * numEle);
diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td
index 6ec3559..658deb5 100644
--- a/lib/Target/R600/AMDILIntrinsics.td
+++ b/lib/Target/R600/AMDILIntrinsics.td
@@ -68,10 +68,6 @@ let TargetPrefix = "AMDIL", isTarget = 1 in {
let TargetPrefix = "AMDIL", isTarget = 1 in {
def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt;
- def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">,
- TernaryIntInt;
- def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">,
- TernaryIntInt;
def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">,
UnaryIntInt;
def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">,
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 9f8f6a8..93a5117 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -50,8 +50,6 @@ add_llvm_target(R600CodeGen
SITypeRewriter.cpp
)
-add_dependencies(LLVMR600CodeGen AMDGPUCommonTableGen intrinsics_gen)
-
add_subdirectory(InstPrinter)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td
new file mode 100644
index 0000000..acd7bde
--- /dev/null
+++ b/lib/Target/R600/CaymanInstructions.td
@@ -0,0 +1,221 @@
+//===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are available only on Cayman
+// family GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+def isCayman : Predicate<"Subtarget.hasCaymanISA()">;
+
+//===----------------------------------------------------------------------===//
+// Cayman Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isCayman] in {
+
+def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24",
+ [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))], VecALU
+>;
+def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24",
+ [(set i32:$dst, (mul I24:$src0, I24:$src1))], VecALU
+>;
+
+let isVector = 1 in {
+
+def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
+
+def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
+def MULHI_INT_cm : MULHI_INT_Common<0x90>;
+def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
+def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
+def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
+def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
+def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
+def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
+def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
+def SIN_cm : SIN_Common<0x8D>;
+def COS_cm : COS_Common<0x8E>;
+} // End isVector = 1
+
+def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
+
+defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
+
+// RECIP_UINT emulation for Cayman
+// The multiplication scales from [0,1] to the unsigned integer range
+def : Pat <
+ (AMDGPUurecip i32:$src0),
+ (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)),
+ (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1)))
+>;
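Editor's illustration, not part of the patch (the helper name and the saturation guard are assumptions): the AMDGPUurecip pattern above chains UINT_TO_FLT_eg, RECIP_IEEE_cm, a MUL_IEEE by CONST.FP_UINT_MAX_PLUS_1 (2^32), and FLT_TO_UINT_eg. A host-side sketch of the same computation:

#include <cstdint>

// Compute an approximate 1/x in floating point, then rescale from [0,1]
// back to the 32-bit unsigned range by multiplying with 2^32. The clamp
// models the saturating float-to-uint conversion of FLT_TO_UINT_eg.
static uint32_t emulatedURecip(uint32_t Src) {
  float Recip = 1.0f / static_cast<float>(Src);   // UINT_TO_FLT_eg + RECIP_IEEE_cm
  float Scaled = Recip * 4294967296.0f;           // MUL_IEEE with FP_UINT_MAX_PLUS_1
  if (Scaled >= 4294967296.0f)                    // saturate, e.g. for Src == 0 or 1
    return UINT32_MAX;
  return static_cast<uint32_t>(Scaled);           // FLT_TO_UINT_eg
}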
+
+ def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
+ let ADDR = 0;
+ let POP_COUNT = 0;
+ let COUNT = 0;
+ }
+
+
+def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
+
+class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
+ CF_MEM_RAT_CACHELESS <0x14, 0, mask,
+ (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
+ "STORE_DWORD $rw_gpr, $index_gpr",
+ [(global_store vt:$rw_gpr, i32:$index_gpr)]> {
+ let eop = 0; // This bit is not used on Cayman.
+}
+
+def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>;
+def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>;
+def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>;
+
+class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
+ : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> {
+
+ // Static fields
+ let VC_INST = 0;
+ let FETCH_TYPE = 2;
+ let FETCH_WHOLE_QUAD = 0;
+ let BUFFER_ID = buffer_id;
+ let SRC_REL = 0;
+ // XXX: We can infer this field based on the SRC_GPR. This would allow us
+ // to store vertex addresses in any channel, not just X.
+ let SRC_SEL_X = 0;
+ let SRC_SEL_Y = 0;
+ let STRUCTURED_READ = 0;
+ let LDS_REQ = 0;
+ let COALESCED_READ = 0;
+
+ let Inst{31-0} = Word0;
+}
+
+class VTX_READ_8_cm <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id,
+ (outs R600_TReg32_X:$dst_gpr), pattern> {
+
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 7; // Masked
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 1; // FMT_8
+}
+
+class VTX_READ_16_cm <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id,
+ (outs R600_TReg32_X:$dst_gpr), pattern> {
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 7; // Masked
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 5; // FMT_16
+
+}
+
+class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id,
+ (outs R600_TReg32_X:$dst_gpr), pattern> {
+
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 7; // Masked
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 0xD; // COLOR_32
+
+ // This is not really necessary, but there were some GPU hangs that appeared
+ // to be caused by ALU instructions in the next instruction group that wrote
+ // to the $src_gpr registers of the VTX_READ.
+ // e.g.
+ // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
+ // %T2_X<def> = MOV %ZERO
+ // Adding this constraint prevents this from happening.
+ let Constraints = "$src_gpr.ptr = $dst_gpr";
+}
+
+class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id,
+ (outs R600_Reg64:$dst_gpr), pattern> {
+
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 7;
+ let DST_SEL_W = 7;
+ let DATA_FORMAT = 0x1D; // COLOR_32_32
+}
+
+class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
+ (outs R600_Reg128:$dst_gpr), pattern> {
+
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 2;
+ let DST_SEL_W = 3;
+ let DATA_FORMAT = 0x22; // COLOR_32_32_32_32
+
+ // XXX: Need to force VTX_READ_128 instructions to write to the same register
+ // that holds its buffer address to avoid potential hangs. We can't use
+ // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst
+ // registers are different sizes.
+}
+
+//===----------------------------------------------------------------------===//
+// VTX Read from parameter memory space
+//===----------------------------------------------------------------------===//
+def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0,
+ [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0,
+ [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0,
+ [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0,
+ [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0,
+ [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from global memory space
+//===----------------------------------------------------------------------===//
+
+// 8-bit reads
+def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1,
+ [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1,
+ [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))]
+>;
+
+// 32-bit reads
+def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1,
+ [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 64-bit reads
+def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1,
+ [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 128-bit reads
+def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1,
+ [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+} // End isCayman
+
diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td
new file mode 100644
index 0000000..6430ca6
--- /dev/null
+++ b/lib/Target/R600/EvergreenInstructions.td
@@ -0,0 +1,587 @@
+//===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are:
+// - Available to Evergreen and newer VLIW4/VLIW5 GPUs
+// - Available only on Evergreen family GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+def isEG : Predicate<
+ "Subtarget.getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
+ "Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+ "!Subtarget.hasCaymanISA()"
+>;
+
+def isEGorCayman : Predicate<
+ "Subtarget.getGeneration() == AMDGPUSubtarget::EVERGREEN ||"
+ "Subtarget.getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS"
+>;
+
+//===----------------------------------------------------------------------===//
+// Evergreen / Cayman store instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEGorCayman] in {
+
+class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins,
+ string name, list<dag> pattern>
+ : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins,
+ "MEM_RAT_CACHELESS "#name, pattern>;
+
+class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name,
+ list<dag> pattern>
+ : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins,
+ "MEM_RAT "#name, pattern>;
+
+def RAT_MSKOR : CF_MEM_RAT <0x11, 0,
+ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+ "MSKOR $rw_gpr.XW, $index_gpr",
+ [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)]
+> {
+ let eop = 0;
+}
+
+} // End let Predicates = [isEGorCayman]
+
+//===----------------------------------------------------------------------===//
+// Evergreen Only instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEG] in {
+
+def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>;
+defm DIV_eg : DIV_Common<RECIP_IEEE_eg>;
+
+def MULLO_INT_eg : MULLO_INT_Common<0x8F>;
+def MULHI_INT_eg : MULHI_INT_Common<0x90>;
+def MULLO_UINT_eg : MULLO_UINT_Common<0x91>;
+def MULHI_UINT_eg : MULHI_UINT_Common<0x92>;
+def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
+def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;
+def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
+def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
+def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
+def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
+def SIN_eg : SIN_Common<0x8D>;
+def COS_eg : COS_Common<0x8E>;
+
+def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
+def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
+
+//===----------------------------------------------------------------------===//
+// Memory read/write instructions
+//===----------------------------------------------------------------------===//
+
+let usesCustomInserter = 1 in {
+
+// 32-bit store
+def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1,
+ (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+ "STORE_RAW $rw_gpr, $index_gpr, $eop",
+ [(global_store i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+// 64-bit store
+def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3,
+ (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+ "STORE_RAW $rw_gpr.XY, $index_gpr, $eop",
+ [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+//128-bit store
+def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf,
+ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+ "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop",
+ [(global_store v4i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+} // End usesCustomInserter = 1
+
+class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
+ : VTX_WORD0_eg, VTX_READ<name, buffer_id, outs, pattern> {
+
+ // Static fields
+ let VC_INST = 0;
+ let FETCH_TYPE = 2;
+ let FETCH_WHOLE_QUAD = 0;
+ let BUFFER_ID = buffer_id;
+ let SRC_REL = 0;
+ // XXX: We can infer this field based on the SRC_GPR. This would allow us
+ // to store vertex addresses in any channel, not just X.
+ let SRC_SEL_X = 0;
+
+ let Inst{31-0} = Word0;
+}
+
+class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id,
+ (outs R600_TReg32_X:$dst_gpr), pattern> {
+
+ let MEGA_FETCH_COUNT = 1;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 7; // Masked
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 1; // FMT_8
+}
+
+class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id,
+ (outs R600_TReg32_X:$dst_gpr), pattern> {
+ let MEGA_FETCH_COUNT = 2;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 7; // Masked
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 5; // FMT_16
+
+}
+
+class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id,
+ (outs R600_TReg32_X:$dst_gpr), pattern> {
+
+ let MEGA_FETCH_COUNT = 4;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 7; // Masked
+ let DST_SEL_Z = 7; // Masked
+ let DST_SEL_W = 7; // Masked
+ let DATA_FORMAT = 0xD; // COLOR_32
+
+ // This is not really necessary, but there were some GPU hangs that appeared
+ // to be caused by ALU instructions in the next instruction group that wrote
+ // to the $src_gpr registers of the VTX_READ.
+ // e.g.
+ // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
+ // %T2_X<def> = MOV %ZERO
+ // Adding this constraint prevents this from happening.
+ let Constraints = "$src_gpr.ptr = $dst_gpr";
+}
+
+class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id,
+ (outs R600_Reg64:$dst_gpr), pattern> {
+
+ let MEGA_FETCH_COUNT = 8;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 7;
+ let DST_SEL_W = 7;
+ let DATA_FORMAT = 0x1D; // COLOR_32_32
+}
+
+class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
+ (outs R600_Reg128:$dst_gpr), pattern> {
+
+ let MEGA_FETCH_COUNT = 16;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 2;
+ let DST_SEL_W = 3;
+ let DATA_FORMAT = 0x22; // COLOR_32_32_32_32
+
+ // XXX: Need to force VTX_READ_128 instructions to write to the same register
+ // that holds its buffer address to avoid potential hangs. We can't use
+ // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst
+ // registers are different sizes.
+}
+
+//===----------------------------------------------------------------------===//
+// VTX Read from parameter memory space
+//===----------------------------------------------------------------------===//
+
+def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
+ [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
+ [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
+ [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
+ [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
+ [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from global memory space
+//===----------------------------------------------------------------------===//
+
+// 8-bit reads
+def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
+ [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1,
+ [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))]
+>;
+
+// 32-bit reads
+def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
+ [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 64-bit reads
+def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
+ [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 128-bit reads
+def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
+ [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+} // End Predicates = [isEG]
+
+//===----------------------------------------------------------------------===//
+// Evergreen / Cayman Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEGorCayman] in {
+
+// BFE_UINT - bit_extract, an optimization for mask and shift
+// Src0 = Input
+// Src1 = Offset
+// Src2 = Width
+//
+// bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width)
+//
+// Example Usage:
+// (Offset, Width)
+//
+// (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0
+// (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8
+// (16, 8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16
+// (24, 8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24
+def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
+ [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))],
+ VecALU
+>;
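Editor's illustration, not part of the patch (helper names are made up): the shift form used by BFE_UINT above matches the conventional mask-and-shift extract whenever Width > 0 and Offset + Width <= 32, as the (8, 8) case in the comment suggests.

#include <cassert>
#include <cstdint>

// Shift form, exactly as written in the BFE_UINT comment above.
static uint32_t bfeShiftForm(uint32_t Input, unsigned Offset, unsigned Width) {
  return (Input << (32 - Offset - Width)) >> (32 - Width);
}

// Conventional mask-and-shift form it is meant to match.
static uint32_t bfeMaskForm(uint32_t Input, unsigned Offset, unsigned Width) {
  uint32_t Mask = (Width == 32) ? ~0u : ((1u << Width) - 1);
  return (Input >> Offset) & Mask;
}

// The (8, 8) example from the comment: bits [15:8] of 0x12345678 are 0x56.
static void bfeSelfCheck() {
  assert(bfeShiftForm(0x12345678u, 8, 8) == 0x56);
  assert(bfeShiftForm(0x12345678u, 8, 8) == bfeMaskForm(0x12345678u, 8, 8));
}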
+
+def BFE_INT_eg : R600_3OP <0x4, "BFE_INT",
+ [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))],
+ VecALU
+>;
+
+// XXX: This pattern is broken, disabling for now. See comment in
+// AMDGPUInstructions.td for more info.
+// def : BFEPattern <BFE_UINT_eg>;
+def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
+ [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))],
+ VecALU
+>;
+
+defm : BFIPatterns <BFI_INT_eg>;
+
+def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
+ [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))],
+ VecALU
+>;
+
+def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
+ [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))], VecALU
+>;
+def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
+def : ROTRPattern <BIT_ALIGN_INT_eg>;
+def MULADD_eg : MULADD_Common<0x14>;
+def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
+def ASHR_eg : ASHR_Common<0x15>;
+def LSHR_eg : LSHR_Common<0x16>;
+def LSHL_eg : LSHL_Common<0x17>;
+def CNDE_eg : CNDE_Common<0x19>;
+def CNDGT_eg : CNDGT_Common<0x1A>;
+def CNDGE_eg : CNDGE_Common<0x1B>;
+def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
+def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
+def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
+ [(set i32:$dst, (mul U24:$src0, U24:$src1))], VecALU
+>;
+def DOT4_eg : DOT4_Common<0xBE>;
+defm CUBE_eg : CUBE_Common<0xC0>;
+
+let hasSideEffects = 1 in {
+ def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
+}
+
+def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>;
+
+def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
+ let Pattern = [];
+ let Itinerary = AnyALU;
+}
+
+def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
+
+def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
+ let Pattern = [];
+}
+
+def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
+
+def GROUP_BARRIER : InstR600 <
+ (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local)], AnyALU>,
+ R600ALU_Word0,
+ R600ALU_Word1_OP2 <0x54> {
+
+ let dst = 0;
+ let dst_rel = 0;
+ let src0 = 0;
+ let src0_rel = 0;
+ let src0_neg = 0;
+ let src0_abs = 0;
+ let src1 = 0;
+ let src1_rel = 0;
+ let src1_neg = 0;
+ let src1_abs = 0;
+ let write = 0;
+ let omod = 0;
+ let clamp = 0;
+ let last = 1;
+ let bank_swizzle = 0;
+ let pred_sel = 0;
+ let update_exec_mask = 0;
+ let update_pred = 0;
+
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+
+ let ALUInst = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// LDS Instructions
+//===----------------------------------------------------------------------===//
+class R600_LDS <bits<6> op, dag outs, dag ins, string asm,
+ list<dag> pattern = []> :
+
+ InstR600 <outs, ins, asm, pattern, XALU>,
+ R600_ALU_LDS_Word0,
+ R600LDS_Word1 {
+
+ bits<6> offset = 0;
+ let lds_op = op;
+
+ let Word1{27} = offset{0};
+ let Word1{12} = offset{1};
+ let Word1{28} = offset{2};
+ let Word1{31} = offset{3};
+ let Word0{12} = offset{4};
+ let Word0{25} = offset{5};
+
+
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
+
+ let ALUInst = 1;
+ let HasNativeOperands = 1;
+ let UseNamedOperandTable = 1;
+}
+
+class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
+ lds_op,
+ (outs R600_Reg32:$dst),
+ (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+ LAST:$last, R600_Pred:$pred_sel,
+ BANK_SWIZZLE:$bank_swizzle),
+ " "#name#" $last OQAP, $src0$src0_rel $pred_sel",
+ pattern
+ > {
+
+ let src1 = 0;
+ let src1_rel = 0;
+ let src2 = 0;
+ let src2_rel = 0;
+
+ let usesCustomInserter = 1;
+ let LDS_1A = 1;
+ let DisableEncoding = "$dst";
+}
+
+class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
+ string dst =""> :
+ R600_LDS <
+ lds_op, outs,
+ (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+ R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+ LAST:$last, R600_Pred:$pred_sel,
+ BANK_SWIZZLE:$bank_swizzle),
+ " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel",
+ pattern
+ > {
+
+ field string BaseOp;
+
+ let src2 = 0;
+ let src2_rel = 0;
+ let LDS_1A1D = 1;
+}
+
+class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS_1A1D <lds_op, (outs), name, pattern> {
+ let BaseOp = name;
+}
+
+class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> {
+
+ let BaseOp = name;
+ let usesCustomInserter = 1;
+ let DisableEncoding = "$dst";
+}
+
+class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS <
+ lds_op,
+ (outs),
+ (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+ R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+ R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel,
+ LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle),
+ " "#name# "$last $src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel",
+ pattern> {
+ let LDS_1A2D = 1;
+}
+
+def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
+def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >;
+def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE",
+ [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
+>;
+def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE",
+ [(truncstorei8_local i32:$src1, i32:$src0)]
+>;
+def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
+ [(truncstorei16_local i32:$src1, i32:$src0)]
+>;
+def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
+ [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))]
+>;
+def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
+ [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
+>;
+def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
+ [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
+>;
+def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET",
+ [(set i32:$dst, (sextloadi8_local i32:$src0))]
+>;
+def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET",
+ [(set i32:$dst, (az_extloadi8_local i32:$src0))]
+>;
+def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET",
+ [(set i32:$dst, (sextloadi16_local i32:$src0))]
+>;
+def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET",
+ [(set i32:$dst, (az_extloadi16_local i32:$src0))]
+>;
+
+// TRUNC is used for the FLT_TO_INT instructions to work around a
+// perceived problem where the rounding modes are applied differently
+// depending on the instruction and the slot they are in.
+// See:
+// https://bugs.freedesktop.org/show_bug.cgi?id=50232
+// Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c
+//
+// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes,
+// which do not need to be truncated since the fp values are 0.0f or 1.0f.
+// We should look into handling these cases separately.
+def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>;
+
+def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
+
+// SHA-256 Patterns
+def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
+
+def : FROUNDPat <CNDGE_eg>;
+
+def EG_ExportSwz : ExportSwzInst {
+ let Word1{19-16} = 0; // BURST_COUNT
+ let Word1{20} = 0; // VALID_PIXEL_MODE
+ let Word1{21} = eop;
+ let Word1{29-22} = inst;
+ let Word1{30} = 0; // MARK
+ let Word1{31} = 1; // BARRIER
+}
+defm : ExportPattern<EG_ExportSwz, 83>;
+
+def EG_ExportBuf : ExportBufInst {
+ let Word1{19-16} = 0; // BURST_COUNT
+ let Word1{20} = 0; // VALID_PIXEL_MODE
+ let Word1{21} = eop;
+ let Word1{29-22} = inst;
+ let Word1{30} = 0; // MARK
+ let Word1{31} = 1; // BARRIER
+}
+defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>;
+
+def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT),
+ "TEX $COUNT @$ADDR"> {
+ let POP_COUNT = 0;
+}
+def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT),
+ "VTX $COUNT @$ADDR"> {
+ let POP_COUNT = 0;
+}
+def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR),
+ "LOOP_START_DX10 @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+}
+def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+}
+def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR),
+ "LOOP_BREAK @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+}
+def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR),
+ "CONTINUE @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+}
+def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "JUMP @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+}
+def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "PUSH @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+}
+def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "ELSE @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+}
+def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> {
+ let ADDR = 0;
+ let COUNT = 0;
+ let POP_COUNT = 0;
+}
+def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "POP @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+}
+def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> {
+ let COUNT = 0;
+ let POP_COUNT = 0;
+ let ADDR = 0;
+ let END_OF_PROGRAM = 1;
+}
+
+} // End Predicates = [isEGorCayman]
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 99e1377..7105879 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -316,6 +316,37 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned SImm16 = MI->getOperand(OpNo).getImm();
+ unsigned Msg = SImm16 & 0xF;
+ if (Msg == 2 || Msg == 3) {
+ unsigned Op = (SImm16 >> 4) & 0xF;
+ if (Msg == 3)
+ O << "Gs_done(";
+ else
+ O << "Gs(";
+ if (Op == 0) {
+ O << "nop";
+ } else {
+ unsigned Stream = (SImm16 >> 8) & 0x3;
+ if (Op == 1)
+ O << "cut";
+ else if (Op == 2)
+ O << "emit";
+ else if (Op == 3)
+ O << "emit-cut";
+ O << " stream " << Stream;
+ }
+ O << "), [m0] ";
+ } else if (Msg == 1)
+ O << "interrupt ";
+ else if (Msg == 15)
+ O << "system ";
+ else
+ O << "unknown(" << Msg << ") ";
+}
+
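Editor's illustration, not part of the patch (the struct and helper are hypothetical): printSendMsg above reads three bit fields out of its simm16 operand, which can be summarised as follows.

#include <cstdint>

// Field layout mirrored from printSendMsg: bits [3:0] = message type,
// bits [7:4] = GS op, bits [9:8] = stream id.
struct SendMsgFields {
  unsigned Msg;
  unsigned Op;
  unsigned Stream;
};

static SendMsgFields decodeSendMsg(uint16_t SImm16) {
  SendMsgFields F;
  F.Msg = SImm16 & 0xF;
  F.Op = (SImm16 >> 4) & 0xF;
  F.Stream = (SImm16 >> 8) & 0x3;
  return F;
}

// Example: 0x122 decodes as Msg = 2 (GS), Op = 2 (emit), Stream = 1,
// which the printer above renders as "Gs(emit stream 1), [m0] ".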
void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
// Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index 77af942..1d24680 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -32,28 +32,30 @@ public:
virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
private:
- void printRegOperand(unsigned RegNo, raw_ostream &O);
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
- StringRef Asm, StringRef Default = "");
- void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printRegOperand(unsigned RegNo, raw_ostream &O);
+ static void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ static void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+ StringRef Asm, StringRef Default = "");
+ static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O);
+ static void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O);
};
} // End namespace llvm
diff --git a/lib/Target/R600/InstPrinter/CMakeLists.txt b/lib/Target/R600/InstPrinter/CMakeLists.txt
index 069c55b..dcd8703 100644
--- a/lib/Target/R600/InstPrinter/CMakeLists.txt
+++ b/lib/Target/R600/InstPrinter/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMR600AsmPrinter
AMDGPUInstPrinter.cpp
)
-
-add_dependencies(LLVMR600AsmPrinter AMDGPUCommonTableGen)
diff --git a/lib/Target/R600/LLVMBuild.txt b/lib/Target/R600/LLVMBuild.txt
index f2a7554..408ed75 100644
--- a/lib/Target/R600/LLVMBuild.txt
+++ b/lib/Target/R600/LLVMBuild.txt
@@ -28,5 +28,5 @@ has_asmprinter = 1
type = Library
name = R600CodeGen
parent = R600
-required_libraries = AsmPrinter CodeGen Core SelectionDAG Support Target MC R600AsmPrinter R600Desc R600Info
+required_libraries = Analysis AsmPrinter CodeGen Core MC R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target TransformUtils
add_to_library_groups = R600
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index 29d0acf..a6bb59f 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -27,11 +27,10 @@ public:
const MCAsmLayout &Layout) {
//XXX: Implement if necessary.
}
- virtual void RecordRelocation(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFragment *Fragment,
- const MCFixup &Fixup,
- MCValue Target, uint64_t &FixedValue) {
+ void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, bool &IsPCRel,
+ uint64_t &FixedValue) override {
assert(!"Not implemented");
}
@@ -46,7 +45,7 @@ public:
virtual unsigned getNumFixupKinds() const { return 0; };
virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value) const;
+ uint64_t Value, bool IsPCRel) const;
virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const {
@@ -71,7 +70,8 @@ void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm,
}
void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value) const {
+ unsigned DataSize, uint64_t Value,
+ bool IsPCRel) const {
uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
assert(Fixup.getKind() == FK_PCRel_4);
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 48fac9f..53b0e85 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -19,9 +19,8 @@ class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
public:
AMDGPUELFObjectWriter();
protected:
- virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel, bool IsRelocWithSymbol,
- int64_t Addend) const {
+ unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel) const override {
llvm_unreachable("Not implemented");
}
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 4a8e1b0..aee9bd1 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -13,7 +13,6 @@
using namespace llvm;
AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
HasSingleParameterDotFile = false;
- WeakDefDirective = 0;
//===------------------------------------------------------------------===//
HasSubsectionsViaSymbols = true;
HasMachoZeroFillDirective = false;
@@ -22,12 +21,8 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
LinkerRequiresNonEmptyDwarfLines = true;
MaxInstLength = 16;
SeparatorString = "\n";
- CommentColumn = 40;
CommentString = ";";
LabelSuffix = ":";
- GlobalPrefix = "@";
- PrivateGlobalPrefix = ";.";
- LinkerPrivateGlobalPrefix = "!";
InlineAsmStart = ";#ASMSTART";
InlineAsmEnd = ";#ASMEND";
AssemblerDialect = 0;
@@ -43,10 +38,8 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
GPRel32Directive = 0;
SunStyleELFSectionSwitchSyntax = true;
UsesELFSectionDirectiveForBSS = true;
- HasMicrosoftFastStdCallMangling = false;
//===--- Alignment Information ----------------------------------------===//
- AlignDirective = ".align\t";
AlignmentIsInBytes = true;
TextAlignFillValue = 0;
@@ -58,7 +51,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
HasDotTypeDotSizeDirective = false;
HasNoDeadStrip = true;
WeakRefDirective = ".weakref\t";
- LinkOnceDirective = 0;
//===--- Dwarf Emission Directives -----------------------------------===//
HasLEB128 = true;
SupportsDebugInformation = true;
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
index d8cf64a..6a5cd67 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===//
+//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -22,16 +22,19 @@ namespace llvm {
class MCInst;
class MCOperand;
+class MCSubtargetInfo;
class AMDGPUMCCodeEmitter : public MCCodeEmitter {
virtual void anchor();
public:
uint64_t getBinaryCodeForInstr(const MCInst &MI,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 0;
}
};
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index a1bec28..6592b0e 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -86,9 +86,10 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
MCContext &Ctx, MCAsmBackend &MAB,
raw_ostream &_OS,
MCCodeEmitter *_Emitter,
+ const MCSubtargetInfo &STI,
bool RelaxAll,
bool NoExecStack) {
- return createELFStreamer(Ctx, 0, MAB, _OS, _Emitter, false, false);
+ return createELFStreamer(Ctx, MAB, _OS, _Emitter, false, false);
}
extern "C" void LLVMInitializeR600TargetMC() {
diff --git a/lib/Target/R600/MCTargetDesc/CMakeLists.txt b/lib/Target/R600/MCTargetDesc/CMakeLists.txt
index 98f6925..801c905 100644
--- a/lib/Target/R600/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/R600/MCTargetDesc/CMakeLists.txt
@@ -8,5 +8,3 @@ add_llvm_library(LLVMR600Desc
R600MCCodeEmitter.cpp
SIMCCodeEmitter.cpp
)
-
-add_dependencies(LLVMR600Desc AMDGPUCommonTableGen)
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index dd8df65..286c7d1 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -34,21 +34,21 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
void operator=(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION;
const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
- const MCSubtargetInfo &STI;
public:
- R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
- const MCSubtargetInfo &sti)
- : MCII(mcii), MRI(mri), STI(sti) { }
+ R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri)
+ : MCII(mcii), MRI(mri) { }
/// \brief Encode the instruction and write it to the OS.
virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// \returns the encoding for an MCOperand.
virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
private:
void EmitByte(unsigned int byte, raw_ostream &OS) const;
@@ -83,11 +83,12 @@ enum FCInstr {
MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI) {
- return new R600MCCodeEmitter(MCII, MRI, STI);
+ return new R600MCCodeEmitter(MCII, MRI);
}
void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
if (MI.getOpcode() == AMDGPU::RETURN ||
MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
@@ -96,7 +97,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
MI.getOpcode() == AMDGPU::KILL) {
return;
} else if (IS_VTX(Desc)) {
- uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
+ uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI);
uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
if (!(STI.getFeatureBits() & AMDGPU::FeatureCaymanISA)) {
InstWord2 |= 1 << 19; // Mega-Fetch bit
@@ -120,7 +121,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
MI.getOperand(8).getImm() & 0x1F
};
- uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups);
+ uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI);
uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 |
SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 |
SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
@@ -130,7 +131,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
Emit(Word2, OS);
Emit((uint32_t) 0, OS);
} else {
- uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
+ uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI);
if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) &&
((Desc.TSFlags & R600_InstFlag::OP1) ||
Desc.TSFlags & R600_InstFlag::OP2)) {
@@ -168,7 +169,8 @@ unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixup) const {
+ SmallVectorImpl<MCFixup> &Fixup,
+ const MCSubtargetInfo &STI) const {
if (MO.isReg()) {
if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) {
return MRI.getEncodingValue(MO.getReg());
diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
index 5af8320..f42e978 100644
--- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -48,18 +48,20 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
public:
SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
- const MCSubtargetInfo &sti, MCContext &ctx)
+ MCContext &ctx)
: MCII(mcii), MRI(mri) { }
~SIMCCodeEmitter() { }
- /// \breif Encode the instruction and write it to the OS.
+ /// \brief Encode the instruction and write it to the OS.
virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
/// \returns the encoding for an MCOperand.
virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
};
} // End anonymous namespace
@@ -68,7 +70,7 @@ MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
- return new SIMCCodeEmitter(MCII, MRI, STI, Ctx);
+ return new SIMCCodeEmitter(MCII, MRI, Ctx);
}
bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc,
@@ -125,9 +127,10 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const {
}
void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
- uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups);
+ uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI);
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
unsigned bytes = Desc.getSize();
@@ -168,7 +171,8 @@ void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
if (MO.isReg())
return MRI.getEncodingValue(MO.getReg());
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index ee190e4..fde4481 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -9,46 +9,100 @@
class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
: Processor<Name, itin, Features>;
+
+//===----------------------------------------------------------------------===//
+// R600
+//===----------------------------------------------------------------------===//
def : Proc<"", R600_VLIW5_Itin,
[FeatureR600, FeatureVertexCache]>;
+
def : Proc<"r600", R600_VLIW5_Itin,
- [FeatureR600 , FeatureVertexCache]>;
+ [FeatureR600, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+def : Proc<"r630", R600_VLIW5_Itin,
+ [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>;
+
def : Proc<"rs880", R600_VLIW5_Itin,
- [FeatureR600]>;
+ [FeatureR600, FeatureWavefrontSize16]>;
+
def : Proc<"rv670", R600_VLIW5_Itin,
- [FeatureR600, FeatureFP64, FeatureVertexCache]>;
+ [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+//===----------------------------------------------------------------------===//
+// R700
+//===----------------------------------------------------------------------===//
+
def : Proc<"rv710", R600_VLIW5_Itin,
- [FeatureR700, FeatureVertexCache]>;
+ [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
+
def : Proc<"rv730", R600_VLIW5_Itin,
- [FeatureR700, FeatureVertexCache]>;
+ [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
+
def : Proc<"rv770", R600_VLIW5_Itin,
- [FeatureR700, FeatureFP64, FeatureVertexCache]>;
+ [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+//===----------------------------------------------------------------------===//
+// Evergreen
+//===----------------------------------------------------------------------===//
+
def : Proc<"cedar", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureVertexCache]>;
+ [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32,
+ FeatureCFALUBug]>;
+
def : Proc<"redwood", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureVertexCache]>;
+ [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64,
+ FeatureCFALUBug]>;
+
def : Proc<"sumo", R600_VLIW5_Itin,
- [FeatureEvergreen]>;
+ [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>;
+
def : Proc<"juniper", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureVertexCache]>;
+ [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
+
def : Proc<"cypress", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureFP64, FeatureVertexCache]>;
+ [FeatureEvergreen, FeatureFP64, FeatureVertexCache,
+ FeatureWavefrontSize64]>;
+
+//===----------------------------------------------------------------------===//
+// Northern Islands
+//===----------------------------------------------------------------------===//
+
def : Proc<"barts", R600_VLIW5_Itin,
- [FeatureNorthernIslands, FeatureVertexCache]>;
+ [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
+
def : Proc<"turks", R600_VLIW5_Itin,
- [FeatureNorthernIslands, FeatureVertexCache]>;
+ [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
+
def : Proc<"caicos", R600_VLIW5_Itin,
- [FeatureNorthernIslands]>;
+ [FeatureNorthernIslands, FeatureCFALUBug]>;
+
def : Proc<"cayman", R600_VLIW4_Itin,
[FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>;
+//===----------------------------------------------------------------------===//
+// Southern Islands
+//===----------------------------------------------------------------------===//
+
def : Proc<"SI", SI_Itin, [FeatureSouthernIslands]>;
+
def : Proc<"tahiti", SI_Itin, [FeatureSouthernIslands]>;
+
def : Proc<"pitcairn", SI_Itin, [FeatureSouthernIslands]>;
+
def : Proc<"verde", SI_Itin, [FeatureSouthernIslands]>;
+
def : Proc<"oland", SI_Itin, [FeatureSouthernIslands]>;
+
def : Proc<"hainan", SI_Itin, [FeatureSouthernIslands]>;
+
+//===----------------------------------------------------------------------===//
+// Sea Islands
+//===----------------------------------------------------------------------===//
+
def : Proc<"bonaire", SI_Itin, [FeatureSeaIslands]>;
+
def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>;
+
def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>;
+
def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>;
diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp
index 33d2ca3..3d9015c 100644
--- a/lib/Target/R600/R600ClauseMergePass.cpp
+++ b/lib/Target/R600/R600ClauseMergePass.cpp
@@ -50,7 +50,7 @@ private:
/// IfCvt pass can generate "disabled" ALU clause marker that need to be
/// removed and their content affected to the previous alu clause.
- /// This function parse instructions after CFAlu untill it find a disabled
+ /// This function parses instructions after CFAlu until it finds a disabled
/// CFAlu and merge the content, or an enabled CFAlu.
void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index ac3d8f6..f74bef3 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -28,6 +28,172 @@ using namespace llvm;
namespace {
+struct CFStack {
+
+ enum StackItem {
+ ENTRY = 0,
+ SUB_ENTRY = 1,
+ FIRST_NON_WQM_PUSH = 2,
+ FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
+ };
+
+ const AMDGPUSubtarget &ST;
+ std::vector<StackItem> BranchStack;
+ std::vector<StackItem> LoopStack;
+ unsigned MaxStackSize;
+ unsigned CurrentEntries;
+ unsigned CurrentSubEntries;
+
+ CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
+ // We need to reserve a stack entry for CALL_FS in vertex shaders.
+ MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
+ CurrentEntries(0), CurrentSubEntries(0) { }
+
+ unsigned getLoopDepth();
+ bool branchStackContains(CFStack::StackItem);
+ bool requiresWorkAroundForInst(unsigned Opcode);
+ unsigned getSubEntrySize(CFStack::StackItem Item);
+ void updateMaxStackSize();
+ void pushBranch(unsigned Opcode, bool isWQM = false);
+ void pushLoop();
+ void popBranch();
+ void popLoop();
+};
+
+unsigned CFStack::getLoopDepth() {
+ return LoopStack.size();
+}
+
+bool CFStack::branchStackContains(CFStack::StackItem Item) {
+ for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
+ E = BranchStack.end(); I != E; ++I) {
+ if (*I == Item)
+ return true;
+ }
+ return false;
+}
+
+bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
+ if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
+ getLoopDepth() > 1)
+ return true;
+
+ if (!ST.hasCFAluBug())
+ return false;
+
+ switch(Opcode) {
+ default: return false;
+ case AMDGPU::CF_ALU_PUSH_BEFORE:
+ case AMDGPU::CF_ALU_ELSE_AFTER:
+ case AMDGPU::CF_ALU_BREAK:
+ case AMDGPU::CF_ALU_CONTINUE:
+ if (CurrentSubEntries == 0)
+ return false;
+ if (ST.getWavefrontSize() == 64) {
+ // We are being conservative here. We only require this work-around if
+ // CurrentSubEntries > 3 &&
+ // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
+ //
+ // We have to be conservative, because we don't know for certain that
+ // our stack allocation algorithm for Evergreen/NI is correct. Applying this
+ // work-around when CurrentSubEntries > 3 allows us to over-allocate stack
+ // resources without any problems.
+ return CurrentSubEntries > 3;
+ } else {
+ assert(ST.getWavefrontSize() == 32);
+ // We are being conservative here. We only require the work-around if
+ // CurrentSubEntries > 7 &&
+ // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
+ // See the comment on the wavefront size == 64 case for why we are
+ // being conservative.
+ return CurrentSubEntries > 7;
+ }
+ }
+}
+
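Editor's aside, illustration only (the free functions below are assumptions, not code from the patch): for wavefront size 64 the comment above gives the exact trigger condition, while requiresWorkAroundForInst deliberately uses a simpler superset of it.

// Exact condition quoted in the comment (wavefront size 64).
static bool cfAluBugExactCondition64(unsigned CurrentSubEntries) {
  return CurrentSubEntries > 3 &&
         (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0);
}

// Conservative condition actually returned above; every input satisfying the
// exact condition also satisfies this one, so the work-around can only
// over-trigger, which merely over-allocates stack resources.
static bool cfAluBugConservativeCondition64(unsigned CurrentSubEntries) {
  return CurrentSubEntries > 3;
}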
+unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
+ switch(Item) {
+ default:
+ return 0;
+ case CFStack::FIRST_NON_WQM_PUSH:
+ assert(!ST.hasCaymanISA());
+ if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
+ // +1 For the push operation.
+ // +2 Extra space required.
+ return 3;
+ } else {
+ // Some documentation says that this is not necessary on Evergreen,
+ // but experimentation has shown that we need to allocate 1 extra
+ // sub-entry for the first non-WQM push.
+ // +1 For the push operation.
+ // +1 Extra space required.
+ return 2;
+ }
+ case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
+ assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+ // +1 For the push operation.
+ // +1 Extra space required.
+ return 2;
+ case CFStack::SUB_ENTRY:
+ return 1;
+ }
+}
+
+void CFStack::updateMaxStackSize() {
+ unsigned CurrentStackSize = CurrentEntries +
+ (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
+ MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
+}
+
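Editor's note, illustrative only (the helper name is assumed): updateMaxStackSize packs four sub-entries into one full hardware stack entry, so the current size works out as shown below.

// Equivalent of the computation in CFStack::updateMaxStackSize():
// RoundUpToAlignment(SubEntries, 4) / 4 is the same as (SubEntries + 3) / 4.
static unsigned currentStackSize(unsigned Entries, unsigned SubEntries) {
  return Entries + (SubEntries + 3) / 4;
}
// Example: 2 full entries plus 5 sub-entries occupy 2 + 2 = 4 stack entries.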
+void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
+ CFStack::StackItem Item = CFStack::ENTRY;
+ switch(Opcode) {
+ case AMDGPU::CF_PUSH_EG:
+ case AMDGPU::CF_ALU_PUSH_BEFORE:
+ if (!isWQM) {
+ if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
+ Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI
+ // See comment in
+ // CFStack::getSubEntrySize()
+ else if (CurrentEntries > 0 &&
+ ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
+ !ST.hasCaymanISA() &&
+ !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
+ Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
+ else
+ Item = CFStack::SUB_ENTRY;
+ } else
+ Item = CFStack::ENTRY;
+ break;
+ }
+ BranchStack.push_back(Item);
+ if (Item == CFStack::ENTRY)
+ CurrentEntries++;
+ else
+ CurrentSubEntries += getSubEntrySize(Item);
+ updateMaxStackSize();
+}
+
+void CFStack::pushLoop() {
+ LoopStack.push_back(CFStack::ENTRY);
+ CurrentEntries++;
+ updateMaxStackSize();
+}
+
+void CFStack::popBranch() {
+ CFStack::StackItem Top = BranchStack.back();
+ if (Top == CFStack::ENTRY)
+ CurrentEntries--;
+ else
+ CurrentSubEntries-= getSubEntrySize(Top);
+ BranchStack.pop_back();
+}
+
+void CFStack::popLoop() {
+ CurrentEntries--;
+ LoopStack.pop_back();
+}
+
class R600ControlFlowFinalizer : public MachineFunctionPass {
private:
@@ -300,24 +466,6 @@ private:
}
}
- unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const {
- switch (ST.getGeneration()) {
- case AMDGPUSubtarget::R600:
- case AMDGPUSubtarget::R700:
- if (hasPush)
- StackSubEntry += 2;
- break;
- case AMDGPUSubtarget::EVERGREEN:
- if (hasPush)
- StackSubEntry ++;
- case AMDGPUSubtarget::NORTHERN_ISLANDS:
- StackSubEntry += 2;
- break;
- default: llvm_unreachable("Not a VLIW4/VLIW5 GPU");
- }
- return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4
- }
-
public:
R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
TII (0), TRI(0),
@@ -329,22 +477,19 @@ public:
virtual bool runOnMachineFunction(MachineFunction &MF) {
TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
- unsigned MaxStack = 0;
- unsigned CurrentStack = 0;
- bool HasPush = false;
+ CFStack CFStack(ST, MFI->ShaderType);
for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
++MB) {
MachineBasicBlock &MBB = *MB;
unsigned CfCount = 0;
std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
std::vector<MachineInstr * > IfThenElseStack;
- R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
if (MFI->ShaderType == 1) {
BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
getHWInstrDesc(CF_CALL_FS));
CfCount++;
- MaxStack = 1;
}
std::vector<ClauseFile> FetchClauses, AluClauses;
std::vector<MachineInstr *> LastAlu(1);
@@ -356,6 +501,7 @@ public:
DEBUG(dbgs() << CfCount << ":"; I->dump(););
FetchClauses.push_back(MakeFetchClause(MBB, I));
CfCount++;
+ LastAlu.back() = 0;
continue;
}
@@ -365,11 +511,21 @@ public:
if (MI->getOpcode() == AMDGPU::CF_ALU)
LastAlu.back() = MI;
I++;
+ bool RequiresWorkAround =
+ CFStack.requiresWorkAroundForInst(MI->getOpcode());
switch (MI->getOpcode()) {
case AMDGPU::CF_ALU_PUSH_BEFORE:
- CurrentStack++;
- MaxStack = std::max(MaxStack, CurrentStack);
- HasPush = true;
+ if (RequiresWorkAround) {
+ DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
+ .addImm(CfCount + 1)
+ .addImm(1);
+ MI->setDesc(TII->get(AMDGPU::CF_ALU));
+ CfCount++;
+ CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
+ } else
+ CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
+
case AMDGPU::CF_ALU:
I = MI;
AluClauses.push_back(MakeALUClause(MBB, I));
@@ -377,8 +533,7 @@ public:
CfCount++;
break;
case AMDGPU::WHILELOOP: {
- CurrentStack+=4;
- MaxStack = std::max(MaxStack, CurrentStack);
+ CFStack.pushLoop();
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_WHILE_LOOP))
.addImm(1);
@@ -391,7 +546,7 @@ public:
break;
}
case AMDGPU::ENDLOOP: {
- CurrentStack-=4;
+ CFStack.popLoop();
std::pair<unsigned, std::set<MachineInstr *> > Pair =
LoopStack.back();
LoopStack.pop_back();
@@ -429,7 +584,7 @@ public:
break;
}
case AMDGPU::ENDIF: {
- CurrentStack--;
+ CFStack.popBranch();
if (LastAlu.back()) {
ToPopAfter.push_back(LastAlu.back());
} else {
@@ -504,7 +659,7 @@ public:
.addImm(Alu->getOperand(8).getImm());
Alu->eraseFromParent();
}
- MFI->StackSize = getHWStackSize(MaxStack, HasPush);
+ MFI->StackSize = CFStack.MaxStackSize;
}
return false;
diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index 1781f2a..f2f28fe 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@@ -52,7 +52,7 @@ namespace R600_InstFlag {
#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS)
-/// \brief Defines for extracting register infomation from register encoding
+/// \brief Defines for extracting register information from register encoding
#define HW_REG_MASK 0x1ff
#define HW_CHAN_SHIFT 9
diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
index 1bbfd2b..5bd793a 100644
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -25,12 +25,15 @@
using namespace llvm;
+namespace llvm {
+ void initializeR600EmitClauseMarkersPass(PassRegistry&);
+}
+
namespace {
-class R600EmitClauseMarkersPass : public MachineFunctionPass {
+class R600EmitClauseMarkers : public MachineFunctionPass {
private:
- static char ID;
const R600InstrInfo *TII;
int Address;
@@ -287,8 +290,11 @@ private:
}
public:
- R600EmitClauseMarkersPass(TargetMachine &tm) : MachineFunctionPass(ID),
- TII(0), Address(0) { }
+ static char ID;
+ R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(0), Address(0) {
+
+ initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
+ }
virtual bool runOnMachineFunction(MachineFunction &MF) {
TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
@@ -314,12 +320,16 @@ public:
}
};
-char R600EmitClauseMarkersPass::ID = 0;
+char R600EmitClauseMarkers::ID = 0;
} // end anonymous namespace
+INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
+ "R600 Emit Clause Markters", false, false)
+INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
+ "R600 Emit Clause Markters", false, false)
-llvm::FunctionPass *llvm::createR600EmitClauseMarkers(TargetMachine &TM) {
- return new R600EmitClauseMarkersPass(TM);
+llvm::FunctionPass *llvm::createR600EmitClauseMarkers() {
+ return new R600EmitClauseMarkers();
}
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
index aeee4aa..ca1189d 100644
--- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -33,8 +33,8 @@ private:
static char ID;
const R600InstrInfo *TII;
- bool ExpandInputPerspective(MachineInstr& MI);
- bool ExpandInputConstant(MachineInstr& MI);
+ void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI,
+ unsigned Op);
public:
R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
@@ -55,6 +55,15 @@ FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
return new R600ExpandSpecialInstrsPass(TM);
}
+void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
+ const MachineInstr *OldMI, unsigned Op) {
+ int OpIdx = TII->getOperandIdx(*OldMI, Op);
+ if (OpIdx > -1) {
+ uint64_t Val = OldMI->getOperand(OpIdx).getImm();
+ TII->setImmOperand(NewMI, Op, Val);
+ }
+}
+
bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
@@ -66,7 +75,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock::iterator I = MBB.begin();
while (I != MBB.end()) {
MachineInstr &MI = *I;
- I = llvm::next(I);
+ I = std::next(I);
// Expand LDS_*_RET instructions
if (TII->isLDSRetInstr(MI.getOpcode())) {
@@ -325,6 +334,12 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
if (NotLast) {
TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
}
+ SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
+ SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
+ SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs);
+ SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs);
+ SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg);
+ SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg);
}
MI.eraseFromParent();
}
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 0fcb488..6405a82 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -207,7 +207,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
- unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
+ unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
.addOperand(MI->getOperand(0))
@@ -457,9 +457,9 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
// Instruction is left unmodified if it's not the last one of its type
bool isLastInstructionOfItsType = true;
unsigned InstExportType = MI->getOperand(1).getImm();
- for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
+ for (MachineBasicBlock::iterator NextExportInst = std::next(I),
EndBlock = BB->end(); NextExportInst != EndBlock;
- NextExportInst = llvm::next(NextExportInst)) {
+ NextExportInst = std::next(NextExportInst)) {
if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
unsigned CurrentInstExportType = NextExportInst->getOperand(1)
@@ -470,7 +470,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
}
}
}
- bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
+ bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
if (!EOP && !isLastInstructionOfItsType)
return BB;
unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
@@ -762,7 +762,9 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
switch (N->getOpcode()) {
- default: return;
+ default:
+ AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
+ return;
case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
return;
case ISD::LOAD: {
@@ -977,7 +979,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
HWFalse = DAG.getConstant(0, CompareVT);
}
else {
- assert(!"Unhandled value type in LowerSELECT_CC");
+ llvm_unreachable("Unhandled value type in LowerSELECT_CC");
}
// Lower this unsupported SELECT_CC into a combination of two supported
@@ -990,7 +992,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
DAG.getCondCode(ISD::SETNE));
}
-/// LLVM generates byte-addresed pointers. For indirect addressing, we need to
+/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
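(An illustrative sketch, not part of the patch: assuming only the first StackWidth channels of each 128-bit register are used, the byte-pointer to register-index mapping described in the comment above reduces to a right shift. The helper name and the exact shift choices are assumptions for illustration.)

    // Hypothetical helper: convert a byte-addressed pointer to a register index.
    // 4 bytes per 32-bit channel, StackWidth channels used per register.
    static unsigned ptrToRegIndex(unsigned BytePtr, unsigned StackWidth) {
      unsigned Shift = 2;                  // StackWidth == 1 -> 4 bytes per register
      if (StackWidth == 2) Shift = 3;      // 8 bytes per register
      else if (StackWidth == 4) Shift = 4; // 16 bytes per register
      return BytePtr >> Shift;
    }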
@@ -1099,7 +1101,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
Ptr, DAG.getConstant(2, MVT::i32)));
if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
- assert(!"Truncated and indexed stores not supported yet");
+ llvm_unreachable("Truncated and indexed stores not supported yet");
} else {
Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
}
@@ -1113,6 +1115,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+ SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+ if (Ret.getNode()) {
+ return Ret;
+ }
// Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
@@ -1204,6 +1210,15 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
SDValue Ptr = Op.getOperand(1);
SDValue LoweredLoad;
+ SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+ if (Ret.getNode()) {
+ SDValue Ops[2];
+ Ops[0] = Ret;
+ Ops[1] = Chain;
+ return DAG.getMergeValues(Ops, 2, DL);
+ }
+
+
if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
SDValue MergedValues[2] = {
SplitVectorLoad(Op, DAG),
@@ -1239,7 +1254,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
}
Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
} else {
- // non constant ptr cant be folded, keeps it as a v4f32 load
+ // non-constant ptr can't be folded, keeps it as a v4f32 load
Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
DAG.getConstant(LoadNode->getAddressSpace() -
@@ -1370,14 +1385,19 @@ SDValue R600TargetLowering::LowerFormalArguments(
PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
AMDGPUAS::CONSTANT_BUFFER_0);
+ // i64 isn't a legal type, so the register type used ends up as i32, which
+ // isn't expected here. It attempts to create this sextload, but it ends up
+ // being invalid. Somehow this seems to work with i64 arguments, but breaks
+ // for <1 x i64>.
+
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
MachinePointerInfo(UndefValue::get(PtrTy)),
MemVT, false, false, 4);
- // 4 is the prefered alignment for
- // the CONSTANT memory space.
+ // 4 is the preferred alignment for
+ // the CONSTANT memory space.
InVals.push_back(Arg);
}
return Chain;
@@ -1442,17 +1462,20 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
VectorEntry.getOperand(3)
};
bool isUnmovable[4] = { false, false, false, false };
- for (unsigned i = 0; i < 4; i++)
+ for (unsigned i = 0; i < 4; i++) {
RemapSwizzle[i] = i;
+ if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
+ ->getZExtValue();
+ if (i == Idx)
+ isUnmovable[Idx] = true;
+ }
+ }
for (unsigned i = 0; i < 4; i++) {
if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
->getZExtValue();
- if (i == Idx) {
- isUnmovable[Idx] = true;
- continue;
- }
if (isUnmovable[Idx])
continue;
// Swap i and Idx
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index c10257e..22ef728 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -28,9 +28,9 @@ public:
MachineBasicBlock * BB) const;
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- void ReplaceNodeResults(SDNode * N,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) const;
+ virtual void ReplaceNodeResults(SDNode * N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
virtual SDValue LowerFormalArguments(
SDValue Chain,
CallingConv::ID CallConv,
@@ -43,7 +43,7 @@ private:
unsigned Gen;
/// Each OpenCL kernel has nine implicit parameters that are stored in the
/// first nine dwords of a Vertex Buffer. These implicit parameters are
- /// lowered to load instructions which retreive the values from the Vertex
+ /// lowered to load instructions which retrieve the values from the Vertex
/// Buffer.
SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
SDLoc DL, unsigned DwordOffset) const;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index c0827fc..0281dd0 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -716,7 +716,13 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return false;
}
- // Get the last instruction in the block.
+ // Remove successive JUMP
+ while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) {
+ MachineBasicBlock::iterator PriorI = std::prev(I);
+ if (AllowModify)
+ I->removeFromParent();
+ I = PriorI;
+ }
MachineInstr *LastInst = I;
// If there is only one terminator instruction, process it.
@@ -778,7 +784,7 @@ MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
It != E; ++It) {
if (It->getOpcode() == AMDGPU::CF_ALU ||
It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
- return llvm::prior(It.base());
+ return std::prev(It.base());
}
return MBB.end();
}
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index 13d9810..d5ff4de 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -138,7 +138,7 @@ namespace llvm {
/// Same but using const index set instead of MI set.
bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
- /// \breif Vector instructions are instructions that must fill all
+ /// \brief Vector instructions are instructions that must fill all
/// instruction slots within an instruction group.
bool isVector(const MachineInstr &MI) const;
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 0346e24..d2075c0 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -7,7 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// R600 Tablegen instruction definitions
+// TableGen definitions for instructions which are available on R600 family
+// GPUs.
//
//===----------------------------------------------------------------------===//
@@ -335,17 +336,6 @@ def load_param_exti8 : LoadParamFrag<az_extloadi8>;
def load_param_exti16 : LoadParamFrag<az_extloadi16>;
def isR600 : Predicate<"Subtarget.getGeneration() <= AMDGPUSubtarget::R700">;
-def isR700 : Predicate<"Subtarget.getGeneration() == AMDGPUSubtarget::R700">;
-def isEG : Predicate<
- "Subtarget.getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
- "Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "!Subtarget.hasCaymanISA()">;
-
-def isCayman : Predicate<"Subtarget.hasCaymanISA()">;
-def isEGorCayman : Predicate<"Subtarget.getGeneration() == "
- "AMDGPUSubtarget::EVERGREEN"
- "|| Subtarget.getGeneration() =="
- "AMDGPUSubtarget::NORTHERN_ISLANDS">;
def isR600toCayman : Predicate<
"Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
@@ -642,6 +632,9 @@ ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG {
def CF_ALU : ALU_CLAUSE<8, "ALU">;
def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">;
def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">;
+def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">;
+def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">;
+def CF_ALU_ELSE_AFTER : ALU_CLAUSE<15, "ALU_ELSE_AFTER">;
def FETCH_CLAUSE : AMDGPUInst <(outs),
(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > {
@@ -733,6 +726,9 @@ def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
+// Also add an ftrunc intrinsic pattern
+def : Pat<(ftrunc f32:$src0), (TRUNC $src0)>;
+
def MOV : R600_1OP <0x19, "MOV", []>;
let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
@@ -1235,6 +1231,10 @@ let Predicates = [isR600] in {
"JUMP @$ADDR POP:$POP_COUNT"> {
let CNT = 0;
}
+ def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR),
+ "PUSH_ELSE @$ADDR"> {
+ let CNT = 0;
+ }
def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
"ELSE @$ADDR POP:$POP_COUNT"> {
let CNT = 0;
@@ -1257,561 +1257,6 @@ let Predicates = [isR600] in {
}
-//===----------------------------------------------------------------------===//
-// R700 Only instructions
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isR700] in {
- def SIN_r700 : SIN_Common<0x6E>;
- def COS_r700 : COS_Common<0x6F>;
-}
-
-//===----------------------------------------------------------------------===//
-// Evergreen / Cayman store instructions
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isEGorCayman] in {
-
-class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins,
- string name, list<dag> pattern>
- : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins,
- "MEM_RAT_CACHELESS "#name, pattern>;
-
-class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name,
- list<dag> pattern>
- : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins,
- "MEM_RAT "#name, pattern>;
-
-def RAT_MSKOR : CF_MEM_RAT <0x11, 0,
- (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
- "MSKOR $rw_gpr.XW, $index_gpr",
- [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)]
-> {
- let eop = 0;
-}
-
-} // End Predicates = [isEGorCayman]
-
-
-//===----------------------------------------------------------------------===//
-// Evergreen Only instructions
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isEG] in {
-
-def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>;
-defm DIV_eg : DIV_Common<RECIP_IEEE_eg>;
-
-def MULLO_INT_eg : MULLO_INT_Common<0x8F>;
-def MULHI_INT_eg : MULHI_INT_Common<0x90>;
-def MULLO_UINT_eg : MULLO_UINT_Common<0x91>;
-def MULHI_UINT_eg : MULHI_UINT_Common<0x92>;
-def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
-def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;
-def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
-def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
-def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
-def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
-def SIN_eg : SIN_Common<0x8D>;
-def COS_eg : COS_Common<0x8E>;
-
-def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
-def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
-
-//===----------------------------------------------------------------------===//
-// Memory read/write instructions
-//===----------------------------------------------------------------------===//
-
-let usesCustomInserter = 1 in {
-
-// 32-bit store
-def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1,
- (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
- "STORE_RAW $rw_gpr, $index_gpr, $eop",
- [(global_store i32:$rw_gpr, i32:$index_gpr)]
->;
-
-// 64-bit store
-def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3,
- (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
- "STORE_RAW $rw_gpr.XY, $index_gpr, $eop",
- [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
->;
-
-//128-bit store
-def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf,
- (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
- "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop",
- [(global_store v4i32:$rw_gpr, i32:$index_gpr)]
->;
-
-} // End usesCustomInserter = 1
-
-class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
- : VTX_WORD0_eg, VTX_READ<name, buffer_id, outs, pattern> {
-
- // Static fields
- let VC_INST = 0;
- let FETCH_TYPE = 2;
- let FETCH_WHOLE_QUAD = 0;
- let BUFFER_ID = buffer_id;
- let SRC_REL = 0;
- // XXX: We can infer this field based on the SRC_GPR. This would allow us
- // to store vertex addresses in any channel, not just X.
- let SRC_SEL_X = 0;
-
- let Inst{31-0} = Word0;
-}
-
-class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_TReg32_X:$dst_gpr), pattern> {
-
- let MEGA_FETCH_COUNT = 1;
- let DST_SEL_X = 0;
- let DST_SEL_Y = 7; // Masked
- let DST_SEL_Z = 7; // Masked
- let DST_SEL_W = 7; // Masked
- let DATA_FORMAT = 1; // FMT_8
-}
-
-class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_TReg32_X:$dst_gpr), pattern> {
- let MEGA_FETCH_COUNT = 2;
- let DST_SEL_X = 0;
- let DST_SEL_Y = 7; // Masked
- let DST_SEL_Z = 7; // Masked
- let DST_SEL_W = 7; // Masked
- let DATA_FORMAT = 5; // FMT_16
-
-}
-
-class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_TReg32_X:$dst_gpr), pattern> {
-
- let MEGA_FETCH_COUNT = 4;
- let DST_SEL_X = 0;
- let DST_SEL_Y = 7; // Masked
- let DST_SEL_Z = 7; // Masked
- let DST_SEL_W = 7; // Masked
- let DATA_FORMAT = 0xD; // COLOR_32
-
- // This is not really necessary, but there were some GPU hangs that appeared
- // to be caused by ALU instructions in the next instruction group that wrote
- // to the $src_gpr registers of the VTX_READ.
- // e.g.
- // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
- // %T2_X<def> = MOV %ZERO
- //Adding this constraint prevents this from happening.
- let Constraints = "$src_gpr.ptr = $dst_gpr";
-}
-
-class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id,
- (outs R600_Reg64:$dst_gpr), pattern> {
-
- let MEGA_FETCH_COUNT = 8;
- let DST_SEL_X = 0;
- let DST_SEL_Y = 1;
- let DST_SEL_Z = 7;
- let DST_SEL_W = 7;
- let DATA_FORMAT = 0x1D; // COLOR_32_32
-}
-
-class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
- (outs R600_Reg128:$dst_gpr), pattern> {
-
- let MEGA_FETCH_COUNT = 16;
- let DST_SEL_X = 0;
- let DST_SEL_Y = 1;
- let DST_SEL_Z = 2;
- let DST_SEL_W = 3;
- let DATA_FORMAT = 0x22; // COLOR_32_32_32_32
-
- // XXX: Need to force VTX_READ_128 instructions to write to the same register
- // that holds its buffer address to avoid potential hangs. We can't use
- // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst
- // registers are different sizes.
-}
-
-//===----------------------------------------------------------------------===//
-// VTX Read from parameter memory space
-//===----------------------------------------------------------------------===//
-
-def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
- [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
- [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
- [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
- [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
- [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
->;
-
-//===----------------------------------------------------------------------===//
-// VTX Read from global memory space
-//===----------------------------------------------------------------------===//
-
-// 8-bit reads
-def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
- [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1,
- [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))]
->;
-
-// 32-bit reads
-def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
- [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 64-bit reads
-def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
- [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 128-bit reads
-def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
- [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
->;
-
-} // End Predicates = [isEG]
-
-//===----------------------------------------------------------------------===//
-// Evergreen / Cayman Instructions
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isEGorCayman] in {
-
- // BFE_UINT - bit_extract, an optimization for mask and shift
- // Src0 = Input
- // Src1 = Offset
- // Src2 = Width
- //
- // bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width)
- //
- // Example Usage:
- // (Offset, Width)
- //
- // (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0
- // (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8
- // (16,8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16
- // (24,8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24
- def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
- [(set i32:$dst, (int_AMDIL_bit_extract_u32 i32:$src0, i32:$src1,
- i32:$src2))],
- VecALU
- >;
- def : BFEPattern <BFE_UINT_eg>;
-
- def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>;
- defm : BFIPatterns <BFI_INT_eg>;
-
- def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
- [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))], VecALU
- >;
- def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
- def : ROTRPattern <BIT_ALIGN_INT_eg>;
-
- def MULADD_eg : MULADD_Common<0x14>;
- def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
- def ASHR_eg : ASHR_Common<0x15>;
- def LSHR_eg : LSHR_Common<0x16>;
- def LSHL_eg : LSHL_Common<0x17>;
- def CNDE_eg : CNDE_Common<0x19>;
- def CNDGT_eg : CNDGT_Common<0x1A>;
- def CNDGE_eg : CNDGE_Common<0x1B>;
- def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
- def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
- def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
- [(set i32:$dst, (mul U24:$src0, U24:$src1))], VecALU
- >;
- def DOT4_eg : DOT4_Common<0xBE>;
- defm CUBE_eg : CUBE_Common<0xC0>;
-
-let hasSideEffects = 1 in {
- def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", []>;
-}
-
- def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>;
-
- def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
- let Pattern = [];
- let Itinerary = AnyALU;
- }
-
- def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
-
- def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
- let Pattern = [];
- }
-
- def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
-
-def GROUP_BARRIER : InstR600 <
- (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local)], AnyALU>,
- R600ALU_Word0,
- R600ALU_Word1_OP2 <0x54> {
-
- let dst = 0;
- let dst_rel = 0;
- let src0 = 0;
- let src0_rel = 0;
- let src0_neg = 0;
- let src0_abs = 0;
- let src1 = 0;
- let src1_rel = 0;
- let src1_neg = 0;
- let src1_abs = 0;
- let write = 0;
- let omod = 0;
- let clamp = 0;
- let last = 1;
- let bank_swizzle = 0;
- let pred_sel = 0;
- let update_exec_mask = 0;
- let update_pred = 0;
-
- let Inst{31-0} = Word0;
- let Inst{63-32} = Word1;
-
- let ALUInst = 1;
-}
-
-//===----------------------------------------------------------------------===//
-// LDS Instructions
-//===----------------------------------------------------------------------===//
-class R600_LDS <bits<6> op, dag outs, dag ins, string asm,
- list<dag> pattern = []> :
-
- InstR600 <outs, ins, asm, pattern, XALU>,
- R600_ALU_LDS_Word0,
- R600LDS_Word1 {
-
- bits<6> offset = 0;
- let lds_op = op;
-
- let Word1{27} = offset{0};
- let Word1{12} = offset{1};
- let Word1{28} = offset{2};
- let Word1{31} = offset{3};
- let Word0{12} = offset{4};
- let Word0{25} = offset{5};
-
-
- let Inst{31-0} = Word0;
- let Inst{63-32} = Word1;
-
- let ALUInst = 1;
- let HasNativeOperands = 1;
- let UseNamedOperandTable = 1;
-}
-
-class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
- lds_op,
- (outs R600_Reg32:$dst),
- (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
- LAST:$last, R600_Pred:$pred_sel,
- BANK_SWIZZLE:$bank_swizzle),
- " "#name#" $last OQAP, $src0$src0_rel $pred_sel",
- pattern
- > {
-
- let src1 = 0;
- let src1_rel = 0;
- let src2 = 0;
- let src2_rel = 0;
-
- let Defs = [OQAP];
- let usesCustomInserter = 1;
- let LDS_1A = 1;
- let DisableEncoding = "$dst";
-}
-
-class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
- string dst =""> :
- R600_LDS <
- lds_op, outs,
- (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
- R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
- LAST:$last, R600_Pred:$pred_sel,
- BANK_SWIZZLE:$bank_swizzle),
- " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel",
- pattern
- > {
-
- field string BaseOp;
-
- let src2 = 0;
- let src2_rel = 0;
- let LDS_1A1D = 1;
-}
-
-class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
- R600_LDS_1A1D <lds_op, (outs), name, pattern> {
- let BaseOp = name;
-}
-
-class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
- R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> {
-
- let BaseOp = name;
- let usesCustomInserter = 1;
- let DisableEncoding = "$dst";
- let Defs = [OQAP];
-}
-
-class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> :
- R600_LDS <
- lds_op,
- (outs),
- (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
- R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
- R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel,
- LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle),
- " "#name# "$last $src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel",
- pattern> {
- let LDS_1A2D = 1;
-}
-
-def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
-def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >;
-def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE",
- [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
->;
-def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE",
- [(truncstorei8_local i32:$src1, i32:$src0)]
->;
-def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
- [(truncstorei16_local i32:$src1, i32:$src0)]
->;
-def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
- [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))]
->;
-def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
- [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
->;
-def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
- [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
->;
-def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET",
- [(set i32:$dst, (sextloadi8_local i32:$src0))]
->;
-def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET",
- [(set i32:$dst, (az_extloadi8_local i32:$src0))]
->;
-def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET",
- [(set i32:$dst, (sextloadi16_local i32:$src0))]
->;
-def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET",
- [(set i32:$dst, (az_extloadi16_local i32:$src0))]
->;
-
- // TRUNC is used for the FLT_TO_INT instructions to work around a
- // perceived problem where the rounding modes are applied differently
- // depending on the instruction and the slot they are in.
- // See:
- // https://bugs.freedesktop.org/show_bug.cgi?id=50232
- // Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c
- //
- // XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes,
- // which do not need to be truncated since the fp values are 0.0f or 1.0f.
- // We should look into handling these cases separately.
- def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>;
-
- def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
-
- // SHA-256 Patterns
- def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
-
- def : FROUNDPat <CNDGE_eg>;
-
- def EG_ExportSwz : ExportSwzInst {
- let Word1{19-16} = 0; // BURST_COUNT
- let Word1{20} = 0; // VALID_PIXEL_MODE
- let Word1{21} = eop;
- let Word1{29-22} = inst;
- let Word1{30} = 0; // MARK
- let Word1{31} = 1; // BARRIER
- }
- defm : ExportPattern<EG_ExportSwz, 83>;
-
- def EG_ExportBuf : ExportBufInst {
- let Word1{19-16} = 0; // BURST_COUNT
- let Word1{20} = 0; // VALID_PIXEL_MODE
- let Word1{21} = eop;
- let Word1{29-22} = inst;
- let Word1{30} = 0; // MARK
- let Word1{31} = 1; // BARRIER
- }
- defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>;
-
- def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT),
- "TEX $COUNT @$ADDR"> {
- let POP_COUNT = 0;
- }
- def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT),
- "VTX $COUNT @$ADDR"> {
- let POP_COUNT = 0;
- }
- def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR),
- "LOOP_START_DX10 @$ADDR"> {
- let POP_COUNT = 0;
- let COUNT = 0;
- }
- def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
- let POP_COUNT = 0;
- let COUNT = 0;
- }
- def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR),
- "LOOP_BREAK @$ADDR"> {
- let POP_COUNT = 0;
- let COUNT = 0;
- }
- def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR),
- "CONTINUE @$ADDR"> {
- let POP_COUNT = 0;
- let COUNT = 0;
- }
- def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
- "JUMP @$ADDR POP:$POP_COUNT"> {
- let COUNT = 0;
- }
- def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
- "ELSE @$ADDR POP:$POP_COUNT"> {
- let COUNT = 0;
- }
- def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> {
- let ADDR = 0;
- let COUNT = 0;
- let POP_COUNT = 0;
- }
- def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
- "POP @$ADDR POP:$POP_COUNT"> {
- let COUNT = 0;
- }
- def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> {
- let COUNT = 0;
- let POP_COUNT = 0;
- let ADDR = 0;
- let END_OF_PROGRAM = 1;
- }
-
-} // End Predicates = [isEGorCayman]
//===----------------------------------------------------------------------===//
// Register loads and stores - for indirect addressing
@@ -1819,215 +1264,11 @@ def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET",
defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
-//===----------------------------------------------------------------------===//
-// Cayman Instructions
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isCayman] in {
-
-def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24",
- [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))], VecALU
->;
-def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24",
- [(set i32:$dst, (mul I24:$src0, I24:$src1))], VecALU
->;
-
-let isVector = 1 in {
-
-def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
-
-def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
-def MULHI_INT_cm : MULHI_INT_Common<0x90>;
-def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
-def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
-def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
-def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
-def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
-def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
-def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
-def SIN_cm : SIN_Common<0x8D>;
-def COS_cm : COS_Common<0x8E>;
-} // End isVector = 1
-
-def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
-
-defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
-
-// RECIP_UINT emulation for Cayman
-// The multiplication scales from [0,1] to the unsigned integer range
-def : Pat <
- (AMDGPUurecip i32:$src0),
- (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)),
- (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1)))
->;
-
- def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
- let ADDR = 0;
- let POP_COUNT = 0;
- let COUNT = 0;
- }
-
-def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
-
-class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
- CF_MEM_RAT_CACHELESS <0x14, 0, mask,
- (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
- "STORE_DWORD $rw_gpr, $index_gpr",
- [(global_store vt:$rw_gpr, i32:$index_gpr)]> {
- let eop = 0; // This bit is not used on Cayman.
-}
-
-def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>;
-def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>;
-def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>;
-
-class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
- : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> {
-
- // Static fields
- let VC_INST = 0;
- let FETCH_TYPE = 2;
- let FETCH_WHOLE_QUAD = 0;
- let BUFFER_ID = buffer_id;
- let SRC_REL = 0;
- // XXX: We can infer this field based on the SRC_GPR. This would allow us
- // to store vertex addresses in any channel, not just X.
- let SRC_SEL_X = 0;
- let SRC_SEL_Y = 0;
- let STRUCTURED_READ = 0;
- let LDS_REQ = 0;
- let COALESCED_READ = 0;
-
- let Inst{31-0} = Word0;
-}
-
-class VTX_READ_8_cm <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_TReg32_X:$dst_gpr), pattern> {
-
- let DST_SEL_X = 0;
- let DST_SEL_Y = 7; // Masked
- let DST_SEL_Z = 7; // Masked
- let DST_SEL_W = 7; // Masked
- let DATA_FORMAT = 1; // FMT_8
-}
-
-class VTX_READ_16_cm <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_TReg32_X:$dst_gpr), pattern> {
- let DST_SEL_X = 0;
- let DST_SEL_Y = 7; // Masked
- let DST_SEL_Z = 7; // Masked
- let DST_SEL_W = 7; // Masked
- let DATA_FORMAT = 5; // FMT_16
-
-}
-
-class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_TReg32_X:$dst_gpr), pattern> {
-
- let DST_SEL_X = 0;
- let DST_SEL_Y = 7; // Masked
- let DST_SEL_Z = 7; // Masked
- let DST_SEL_W = 7; // Masked
- let DATA_FORMAT = 0xD; // COLOR_32
-
- // This is not really necessary, but there were some GPU hangs that appeared
- // to be caused by ALU instructions in the next instruction group that wrote
- // to the $src_gpr registers of the VTX_READ.
- // e.g.
- // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
- // %T2_X<def> = MOV %ZERO
- //Adding this constraint prevents this from happening.
- let Constraints = "$src_gpr.ptr = $dst_gpr";
-}
-
-class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id,
- (outs R600_Reg64:$dst_gpr), pattern> {
-
- let DST_SEL_X = 0;
- let DST_SEL_Y = 1;
- let DST_SEL_Z = 7;
- let DST_SEL_W = 7;
- let DATA_FORMAT = 0x1D; // COLOR_32_32
-}
-
-class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
- (outs R600_Reg128:$dst_gpr), pattern> {
-
- let DST_SEL_X = 0;
- let DST_SEL_Y = 1;
- let DST_SEL_Z = 2;
- let DST_SEL_W = 3;
- let DATA_FORMAT = 0x22; // COLOR_32_32_32_32
-
- // XXX: Need to force VTX_READ_128 instructions to write to the same register
- // that holds its buffer address to avoid potential hangs. We can't use
- // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst
- // registers are different sizes.
-}
-
-//===----------------------------------------------------------------------===//
-// VTX Read from parameter memory space
-//===----------------------------------------------------------------------===//
-def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0,
- [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0,
- [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0,
- [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0,
- [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0,
- [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
->;
-
-//===----------------------------------------------------------------------===//
-// VTX Read from global memory space
-//===----------------------------------------------------------------------===//
-
-// 8-bit reads
-def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1,
- [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))]
->;
-
-def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1,
- [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))]
->;
-
-// 32-bit reads
-def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1,
- [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 64-bit reads
-def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1,
- [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
->;
-
-// 128-bit reads
-def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1,
- [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
->;
-
-} // End isCayman
//===----------------------------------------------------------------------===//
// Branch Instructions
//===----------------------------------------------------------------------===//
-
def IF_PREDICATE_SET : ILFormat<(outs), (ins GPRI32:$src),
"IF_PREDICATE_SET $src", []>;
@@ -2246,7 +1487,7 @@ let Inst{63-32} = Word1;
//===--------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
// Custom Inserter for Branches and returns, this eventually will be a
-// seperate pass
+// separate pass
//===---------------------------------------------------------------------===//
let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
def BRANCH : ILFormat<(outs), (ins brtarget:$target),
@@ -2358,9 +1599,6 @@ def : Insert_Element <i32, v4i32, 1, sub1>;
def : Insert_Element <i32, v4i32, 2, sub2>;
def : Insert_Element <i32, v4i32, 3, sub3>;
-def : Vector4_Build <v4f32, f32>;
-def : Vector4_Build <v4i32, i32>;
-
def : Extract_Element <f32, v2f32, 0, sub0>;
def : Extract_Element <f32, v2f32, 1, sub1>;
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index da2a4d8..d3ffb50 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -24,8 +24,8 @@
using namespace llvm;
void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
-
- DAG = dag;
+ assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
+ DAG = static_cast<ScheduleDAGMILive*>(dag);
TII = static_cast<const R600InstrInfo*>(DAG->TII);
TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
VLIW5 = !DAG->MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
@@ -72,7 +72,7 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
// OpenCL Programming Guide :
// The approx. number of WF that allows TEX inst to hide ALU inst is :
// 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
- float ALUFetchRationEstimate =
+ float ALUFetchRationEstimate =
(AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
(FetchInstCount + Available[IDFetch].size());
unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
@@ -464,4 +464,3 @@ SUnit* R600SchedStrategy::pickOther(int QID) {
}
return SU;
}
-
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index 97c8cde..b909ff7 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -26,7 +26,7 @@ namespace llvm {
class R600SchedStrategy : public MachineSchedStrategy {
- const ScheduleDAGMI *DAG;
+ const ScheduleDAGMILive *DAG;
const R600InstrInfo *TII;
const R600RegisterInfo *TRI;
MachineRegisterInfo *MRI;
diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
index cf719c0..767e5e3 100644
--- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
@@ -34,11 +34,11 @@
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
@@ -46,8 +46,8 @@ namespace {
static bool
isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
- for (MachineRegisterInfo::def_iterator It = MRI.def_begin(Reg),
- E = MRI.def_end(); It != E; ++It) {
+ for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg),
+ E = MRI.def_instr_end(); It != E; ++It) {
return (*It).isImplicitDef();
}
if (MRI.isReserved(Reg)) {
@@ -63,7 +63,7 @@ public:
DenseMap<unsigned, unsigned> RegToChan;
std::vector<unsigned> UndefReg;
RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
- assert (MI->getOpcode() == AMDGPU::REG_SEQUENCE);
+ assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE);
for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
MachineOperand &MO = Instr->getOperand(i);
unsigned Chan = Instr->getOperand(i + 1).getImm();
@@ -213,8 +213,8 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
DEBUG(dbgs() << " ->"; Pos->dump(););
DEBUG(dbgs() << " Updating Swizzle:\n");
- for (MachineRegisterInfo::use_iterator It = MRI->use_begin(Reg),
- E = MRI->use_end(); It != E; ++It) {
+ for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
+ E = MRI->use_instr_end(); It != E; ++It) {
DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->");
SwizzleInput(*It, RemapChan);
DEBUG((*It).dump());
@@ -261,8 +261,8 @@ void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
}
bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const {
- for (MachineRegisterInfo::use_iterator It = MRI->use_begin(Reg),
- E = MRI->use_end(); It != E; ++It) {
+ for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
+ E = MRI->use_instr_end(); It != E; ++It) {
if (!canSwizzle(*It))
return false;
}
@@ -328,8 +328,9 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) {
if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
unsigned Reg = MI->getOperand(1).getReg();
- for (MachineRegisterInfo::def_iterator It = MRI->def_begin(Reg),
- E = MRI->def_end(); It != E; ++It) {
+ for (MachineRegisterInfo::def_instr_iterator
+ It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end();
+ It != E; ++It) {
RemoveMI(&(*It));
}
}
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index cd9b6ea..b7b7610 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -66,7 +66,7 @@ private:
}
/// \returns register to PV chan mapping for bundle/single instructions that
- /// immediatly precedes I.
+ /// immediately precedes I.
DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I)
const {
DenseMap<unsigned, unsigned> Result;
@@ -311,7 +311,7 @@ public:
substitutePV(MI, PV);
MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI);
if (isTransSlot) {
- endPacket(llvm::next(It)->getParent(), llvm::next(It));
+ endPacket(std::next(It)->getParent(), std::next(It));
}
return It;
}
@@ -371,20 +371,20 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
// instruction stream until we find the nearest boundary.
MachineBasicBlock::iterator I = RegionEnd;
for(;I != MBB->begin(); --I, --RemainingCount) {
- if (TII->isSchedulingBoundary(llvm::prior(I), MBB, Fn))
+ if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn))
break;
}
I = MBB->begin();
// Skip empty scheduling regions.
if (I == RegionEnd) {
- RegionEnd = llvm::prior(RegionEnd);
+ RegionEnd = std::prev(RegionEnd);
--RemainingCount;
continue;
}
// Skip regions with one instruction.
- if (I == llvm::prior(RegionEnd)) {
- RegionEnd = llvm::prior(RegionEnd);
+ if (I == std::prev(RegionEnd)) {
+ RegionEnd = std::prev(RegionEnd);
continue;
}
diff --git a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
index 3258894..9d24404 100644
--- a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
+++ b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
@@ -18,7 +18,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/InstVisitor.h"
+#include "llvm/IR/InstVisitor.h"
using namespace llvm;
diff --git a/lib/Target/R600/R700Instructions.td b/lib/Target/R600/R700Instructions.td
new file mode 100644
index 0000000..9aad85d
--- /dev/null
+++ b/lib/Target/R600/R700Instructions.td
@@ -0,0 +1,21 @@
+//===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are:
+// - Available to R700 and newer VLIW4/VLIW5 GPUs
+// - Available only on R700 family GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+def isR700 : Predicate<"Subtarget.getGeneration() == AMDGPUSubtarget::R700">;
+
+let Predicates = [isR700] in {
+ def SIN_r700 : SIN_Common<0x6E>;
+ def COS_r700 : COS_Common<0x6F>;
+}
diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp
index 6bbdf59..f9214a8 100644
--- a/lib/Target/R600/SIAnnotateControlFlow.cpp
+++ b/lib/Target/R600/SIAnnotateControlFlow.cpp
@@ -12,10 +12,12 @@
//
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "si-annotate-control-flow"
+
#include "AMDGPU.h"
#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Analysis/Dominators.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
@@ -98,8 +100,8 @@ public:
}
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<DominatorTree>();
- AU.addPreserved<DominatorTree>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
@@ -205,7 +207,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) {
if (PHINode *Phi = dyn_cast<PHINode>(Cond)) {
- // Handle all non constant incoming values first
+ // Handle all non-constant incoming values first
for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
Value *Incoming = Phi->getIncomingValue(i);
if (isa<ConstantInt>(Incoming))
@@ -253,7 +255,7 @@ void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) {
PhiInserter.AddAvailableValue(Parent, Ret);
} else {
- assert(0 && "Unhandled loop condition!");
+ llvm_unreachable("Unhandled loop condition!");
}
}
@@ -289,7 +291,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
/// \brief Annotate the control flow with intrinsics so the backend can
/// recognize if/then/else and loops.
bool SIAnnotateControlFlow::runOnFunction(Function &F) {
- DT = &getAnalysis<DominatorTree>();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
E = df_end(&F.getEntryBlock()); I != E; ++I) {
diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp
index 3370c79..402f1f4 100644
--- a/lib/Target/R600/SIFixSGPRCopies.cpp
+++ b/lib/Target/R600/SIFixSGPRCopies.cpp
@@ -141,8 +141,8 @@ const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses(
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
RC = TRI->getSubRegClass(RC, SubReg);
- for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
- E = MRI.use_end(); I != E; ++I) {
+ for (MachineRegisterInfo::use_instr_iterator
+ I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) {
switch (I->getOpcode()) {
case AMDGPU::COPY:
RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI,
@@ -187,7 +187,7 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
DstRC == &AMDGPU::M0RegRegClass)
return false;
- SrcRC = inferRegClassFromDef(TRI, MRI, SrcReg, SrcSubReg);
+ SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg);
return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC);
}
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index d5d2b68..0b55411 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -14,6 +14,7 @@
#include "SIISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
@@ -24,13 +25,10 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"
-const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
-
using namespace llvm;
SITargetLowering::SITargetLowering(TargetMachine &TM) :
AMDGPUTargetLowering(TM) {
-
addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);
@@ -76,7 +74,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
- setOperationAction(ISD::ADD, MVT::i64, Legal);
setOperationAction(ISD::ADD, MVT::i32, Legal);
setOperationAction(ISD::ADDC, MVT::i32, Legal);
setOperationAction(ISD::ADDE, MVT::i32, Legal);
@@ -97,13 +94,18 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::i64, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
+ setOperationAction(ISD::STORE, MVT::i1, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
setOperationAction(ISD::STORE, MVT::i64, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::f64, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@@ -125,11 +127,20 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+ setTruncStoreAction(MVT::i32, MVT::i16, Custom);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i128, MVT::i64, Expand);
@@ -137,10 +148,50 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
- setOperationAction(ISD::FrameIndex, MVT::i64, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
- setTargetDAGCombine(ISD::SELECT_CC);
+ // We only support LOAD/STORE and vector manipulation ops for vectors
+ // with > 4 elements.
+ MVT VecTypes[] = {
+ MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
+ };
+
+ const size_t NumVecTypes = array_lengthof(VecTypes);
+ for (unsigned Type = 0; Type < NumVecTypes; ++Type) {
+ for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
+ switch(Op) {
+ case ISD::LOAD:
+ case ISD::STORE:
+ case ISD::BUILD_VECTOR:
+ case ISD::BITCAST:
+ case ISD::EXTRACT_VECTOR_ELT:
+ case ISD::INSERT_VECTOR_ELT:
+ case ISD::CONCAT_VECTORS:
+ case ISD::INSERT_SUBVECTOR:
+ case ISD::EXTRACT_SUBVECTOR:
+ break;
+ default:
+ setOperationAction(Op, VecTypes[Type], Expand);
+ break;
+ }
+ }
+ }
+
+ for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) {
+ MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I);
+ setOperationAction(ISD::FTRUNC, VT, Expand);
+ setOperationAction(ISD::FCEIL, VT, Expand);
+ setOperationAction(ISD::FFLOOR, VT, Expand);
+ }
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+ setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+ }
+
+ setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
setSchedulingPreference(Sched::RegPressure);
@@ -151,6 +202,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
//===----------------------------------------------------------------------===//
bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+ unsigned AddrSpace,
bool *IsFast) const {
// XXX: This depends on the address space and also we may want to revisit
// the alignment values we specify in the DataLayout.
@@ -159,8 +211,15 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
return VT.bitsGT(MVT::i32);
}
-bool SITargetLowering::shouldSplitVectorElementType(EVT VT) const {
- return VT.bitsLE(MVT::i16);
+bool SITargetLowering::shouldSplitVectorType(EVT VT) const {
+ return VT.getScalarType().bitsLE(MVT::i16);
+}
+
+bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const {
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ return TII->isInlineConstant(Imm);
}
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
@@ -346,16 +405,16 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
unsigned SuperReg = MI->getOperand(0).getReg();
- unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
+ unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
+ unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
.addOperand(MI->getOperand(1));
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
.addImm(0);
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
- .addImm(RSRC_DATA_FORMAT >> 32);
+ .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
.addReg(SubRegHiLo)
.addImm(AMDGPU::sub0)
@@ -439,13 +498,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
- case ISD::ADD: return LowerADD(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
- if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
- Op.getValueType().isVector()) {
+ if (Op.getValueType().isVector() &&
+ (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
+ (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ Op.getValueType().getVectorNumElements() > 4))) {
SDValue MergedValues[2] = {
SplitVectorLoad(Op, DAG),
Load->getChain()
@@ -456,6 +516,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
}
}
+ case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
@@ -576,33 +637,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue SITargetLowering::LowerADD(SDValue Op,
- SelectionDAG &DAG) const {
- if (Op.getValueType() != MVT::i64)
- return SDValue();
-
- SDLoc DL(Op);
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
-
- SDValue Zero = DAG.getConstant(0, MVT::i32);
- SDValue One = DAG.getConstant(1, MVT::i32);
-
- SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero);
- SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One);
-
- SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero);
- SDValue Hi1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, One);
-
- SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Glue);
-
- SDValue AddLo = DAG.getNode(ISD::ADDC, DL, VTList, Lo0, Lo1);
- SDValue Carry = AddLo.getValue(1);
- SDValue AddHi = DAG.getNode(ISD::ADDE, DL, VTList, Hi0, Hi1, Carry);
-
- return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddLo, AddHi.getValue(0));
-}
-
/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {
@@ -700,23 +734,42 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
+ SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+ SDValue MergedValues[2];
+ MergedValues[1] = Load->getChain();
+ if (Ret.getNode()) {
+ MergedValues[0] = Ret;
+ return DAG.getMergeValues(MergedValues, 2, DL);
+ }
- if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+ if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
return SDValue();
+ }
+
+ EVT MemVT = Load->getMemoryVT();
+
+ assert(!MemVT.isVector() && "Private loads should be scalarized");
+ assert(!MemVT.isFloatingPoint() && "FP loads should be promoted to int");
- SDValue TruncPtr = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
- Load->getBasePtr(), DAG.getConstant(0, MVT::i32));
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
DAG.getConstant(2, MVT::i32));
+ Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Load->getChain(), Ptr,
+ DAG.getTargetConstant(0, MVT::i32),
+ Op.getOperand(2));
+ if (MemVT.getSizeInBits() == 64) {
+ SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+ DAG.getConstant(1, MVT::i32));
+
+ SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Load->getChain(), IncPtr,
+ DAG.getTargetConstant(0, MVT::i32),
+ Op.getOperand(2));
+
+ Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper);
+ }
- SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
- Load->getChain(), Ptr,
- DAG.getTargetConstant(0, MVT::i32),
- Op.getOperand(2));
- SDValue MergedValues[2] = {
- Ret,
- Load->getChain()
- };
+ MergedValues[0] = Ret;
return DAG.getMergeValues(MergedValues, 2, DL);
}
@@ -744,6 +797,33 @@ SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
Op.getOperand(4));
}
+SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+ if (Op.getValueType() != MVT::i64)
+ return SDValue();
+
+ SDLoc DL(Op);
+ SDValue Cond = Op.getOperand(0);
+
+ SDValue Zero = DAG.getConstant(0, MVT::i32);
+ SDValue One = DAG.getConstant(1, MVT::i32);
+
+ SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
+ SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
+
+ SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
+ SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
+
+ SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
+
+ SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
+ SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
+
+ SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
+
+ SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
+ return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
+}
+
SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -790,16 +870,47 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if (VT.isVector() && VT.getVectorNumElements() >= 8)
return SplitVectorStore(Op, DAG);
+ if (VT == MVT::i1)
+ return DAG.getTruncStore(Store->getChain(), DL,
+ DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
+ Store->getBasePtr(), MVT::i1, Store->getMemOperand());
+
if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
return SDValue();
- SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32);
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Store->getBasePtr(),
DAG.getConstant(2, MVT::i32));
SDValue Chain = Store->getChain();
SmallVector<SDValue, 8> Values;
- if (VT == MVT::i64) {
+ if (Store->isTruncatingStore()) {
+ unsigned Mask = 0;
+ if (Store->getMemoryVT() == MVT::i8) {
+ Mask = 0xff;
+ } else if (Store->getMemoryVT() == MVT::i16) {
+ Mask = 0xffff;
+ }
+ SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, Store->getBasePtr(),
+ DAG.getConstant(0, MVT::i32));
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getBasePtr(),
+ DAG.getConstant(0x3, MVT::i32));
+ SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+ DAG.getConstant(3, MVT::i32));
+ SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getValue(),
+ DAG.getConstant(Mask, MVT::i32));
+ SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ MaskedValue, ShiftAmt);
+ SDValue RotrAmt = DAG.getNode(ISD::SUB, DL, MVT::i32,
+ DAG.getConstant(32, MVT::i32), ShiftAmt);
+ SDValue DstMask = DAG.getNode(ISD::ROTR, DL, MVT::i32,
+ DAG.getConstant(Mask, MVT::i32),
+ RotrAmt);
+ Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+ Dst = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+
+ Values.push_back(Dst);
+ } else if (VT == MVT::i64) {
for (unsigned i = 0; i < 2; ++i) {
Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
Store->getValue(), DAG.getConstant(i, MVT::i32)));
@@ -1047,7 +1158,7 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
else
return;
- // Nothing todo if they fit naturaly
+ // Nothing to do if they fit naturally
if (fitsRegClass(DAG, Operand, RegClass))
return;
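
The LowerSELECT hunk above handles an i64 select by bitcasting both operands to v2i32, selecting each 32-bit half under the same condition, and recombining the halves. A minimal standalone sketch of that per-half semantics (plain C++; the helper name is illustrative and not part of the LLVM API):

    #include <cassert>
    #include <cstdint>

    // Select a 64-bit value by selecting its low and high 32-bit halves
    // independently, mirroring the v2i32 bitcast + EXTRACT_VECTOR_ELT lowering.
    static uint64_t select_i64_via_halves(bool cond, uint64_t tval, uint64_t fval) {
      uint32_t lo = cond ? static_cast<uint32_t>(tval) : static_cast<uint32_t>(fval);
      uint32_t hi = cond ? static_cast<uint32_t>(tval >> 32)
                         : static_cast<uint32_t>(fval >> 32);
      return (static_cast<uint64_t>(hi) << 32) | lo;
    }

    int main() {
      assert(select_i64_via_halves(true, 0x1122334455667788ULL, 0) ==
             0x1122334455667788ULL);
      assert(select_i64_via_halves(false, 0, 0xCAFEBABEDEADBEEFULL) ==
             0xCAFEBABEDEADBEEFULL);
      return 0;
    }
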
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 9933ece..ca73f53 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -26,11 +26,11 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue ResourceDescriptorToi128(SDValue Op, SelectionDAG &DAG) const;
@@ -49,8 +49,11 @@ class SITargetLowering : public AMDGPUTargetLowering {
public:
SITargetLowering(TargetMachine &tm);
- bool allowsUnalignedMemoryAccesses(EVT VT, bool *IsFast) const;
- virtual bool shouldSplitVectorElementType(EVT VT) const;
+ bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, bool *IsFast) const;
+ virtual bool shouldSplitVectorType(EVT VT) const override;
+
+ virtual bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 7ef662e..695ec40 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -314,6 +314,12 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
Counters Result = ZeroCounts;
+ // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
+ // but we also want to wait for any other outstanding transfers before
+ // signalling other hardware blocks
+ if (MI.getOpcode() == AMDGPU::S_SENDMSG)
+ return LastIssued;
+
// For each register affected by this
// instruction increase the result sequence
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
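
The SIInsertWaits change above makes S_SENDMSG require a wait on everything issued so far rather than only on the counters implied by its operands. A rough model of that rule (the struct and function names are illustrative, not the pass's actual types):

    // Per-category counts of memory operations issued so far.
    struct Counters { unsigned VM, EXP, LGKM; };

    // Most instructions only wait for the transfers their operands depend on;
    // a message to another hardware block conservatively waits for all of them.
    static Counters requiredWaits(bool IsSendMsg, const Counters &FromOperands,
                                  const Counters &LastIssued) {
      return IsSendMsg ? LastIssued : FromOperands;
    }
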
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 53ebaaf..aa2c22c 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -369,6 +369,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let EXP_CNT = 1;
let neverHasSideEffects = 1;
+ let UseNamedOperandTable = 1;
}
class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index ab55c1b..ab2fe09 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -16,6 +16,7 @@
#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -24,12 +25,7 @@ using namespace llvm;
SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm)
: AMDGPUInstrInfo(tm),
- RI(tm)
- { }
-
-const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const {
- return RI;
-}
+ RI(tm) { }
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
@@ -185,6 +181,67 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
return Opcode;
}
+void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill,
+ int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ unsigned KillFlag = isKill ? RegState::Kill : 0;
+
+ if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
+ unsigned Lane = MFI->SpillTracker.getNextLane(MRI);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32),
+ MFI->SpillTracker.LaneVGPR)
+ .addReg(SrcReg, KillFlag)
+ .addImm(Lane);
+ MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR,
+ Lane);
+ } else {
+ for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) {
+ unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), get(AMDGPU::COPY), SubReg)
+ .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
+ storeRegToStackSlot(MBB, MI, SubReg, isKill, FrameIndex + i,
+ &AMDGPU::SReg_32RegClass, TRI);
+ }
+ }
+}
+
+void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) {
+ SIMachineFunctionInfo::SpilledReg Spill =
+ MFI->SpillTracker.getSpilledReg(FrameIndex);
+ assert(Spill.VGPR);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg)
+ .addReg(Spill.VGPR)
+ .addImm(Spill.Lane);
+ } else {
+ for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) {
+ unsigned Flags = RegState::Define;
+ if (i == 0) {
+ Flags |= RegState::Undef;
+ }
+ unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ loadRegFromStackSlot(MBB, MI, SubReg, FrameIndex + i,
+ &AMDGPU::SReg_32RegClass, TRI);
+ BuildMI(MBB, MI, DL, get(AMDGPU::COPY))
+ .addReg(DestReg, Flags, RI.getSubRegFromChannel(i))
+ .addReg(SubReg);
+ }
+ }
+}
+
MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
bool NewMI) const {
@@ -213,8 +270,10 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
return 0;
unsigned Reg = MI->getOperand(1).getReg();
+ unsigned SubReg = MI->getOperand(1).getSubReg();
MI->getOperand(1).ChangeToImmediate(MI->getOperand(2).getImm());
MI->getOperand(2).ChangeToRegister(Reg, false);
+ MI->getOperand(2).setSubReg(SubReg);
} else {
MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
}
@@ -249,6 +308,30 @@ SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
return RC != &AMDGPU::EXECRegRegClass;
}
+bool
+SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
+ AliasAnalysis *AA) const {
+ switch(MI->getOpcode()) {
+ default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA);
+ case AMDGPU::S_MOV_B32:
+ case AMDGPU::S_MOV_B64:
+ case AMDGPU::V_MOV_B32_e32:
+ return MI->getOperand(1).isImm();
+ }
+}
+
+namespace llvm {
+namespace AMDGPU {
+// Helper function generated by tablegen. We are wrapping this with
+// an SIInstrInfo function that returns bool rather than int.
+int isDS(uint16_t Opcode);
+}
+}
+
+bool SIInstrInfo::isDS(uint16_t Opcode) const {
+ return ::AMDGPU::isDS(Opcode) != -1;
+}
+
int SIInstrInfo::isMIMG(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::MIMG;
}
@@ -277,21 +360,40 @@ bool SIInstrInfo::isSALUInstr(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & SIInstrFlags::SALU;
}
+bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
+ int32_t Val = Imm.getSExtValue();
+ if (Val >= -16 && Val <= 64)
+ return true;
+
+ // The actual type of the operand does not seem to matter as long
+ // as the bits match one of the inline immediate values. For example:
+ //
+ // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
+ // so it is a legal inline immediate.
+ //
+ // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
+ // floating-point, so it is a legal inline immediate.
+
+ return (APInt::floatToBits(0.0f) == Imm) ||
+ (APInt::floatToBits(1.0f) == Imm) ||
+ (APInt::floatToBits(-1.0f) == Imm) ||
+ (APInt::floatToBits(0.5f) == Imm) ||
+ (APInt::floatToBits(-0.5f) == Imm) ||
+ (APInt::floatToBits(2.0f) == Imm) ||
+ (APInt::floatToBits(-2.0f) == Imm) ||
+ (APInt::floatToBits(4.0f) == Imm) ||
+ (APInt::floatToBits(-4.0f) == Imm);
+}
+
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO) const {
- if(MO.isImm()) {
- return MO.getImm() >= -16 && MO.getImm() <= 64;
- }
+ if (MO.isImm())
+ return isInlineConstant(APInt(32, MO.getImm(), true));
+
if (MO.isFPImm()) {
- return MO.getFPImm()->isExactlyValue(0.0) ||
- MO.getFPImm()->isExactlyValue(0.5) ||
- MO.getFPImm()->isExactlyValue(-0.5) ||
- MO.getFPImm()->isExactlyValue(1.0) ||
- MO.getFPImm()->isExactlyValue(-1.0) ||
- MO.getFPImm()->isExactlyValue(2.0) ||
- MO.getFPImm()->isExactlyValue(-2.0) ||
- MO.getFPImm()->isExactlyValue(4.0) ||
- MO.getFPImm()->isExactlyValue(-4.0);
+ APFloat FpImm = MO.getFPImm()->getValueAPF();
+ return isInlineConstant(FpImm.bitcastToAPInt());
}
+
return false;
}
@@ -306,6 +408,47 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+ // Make sure the number of operands is correct.
+ const MCInstrDesc &Desc = get(Opcode);
+ if (!Desc.isVariadic() &&
+ Desc.getNumOperands() != MI->getNumExplicitOperands()) {
+ ErrInfo = "Instruction has wrong number of operands.";
+ return false;
+ }
+
+ // Make sure the register classes are correct
+ for (unsigned i = 0, e = Desc.getNumOperands(); i != e; ++i) {
+ switch (Desc.OpInfo[i].OperandType) {
+ case MCOI::OPERAND_REGISTER:
+ break;
+ case MCOI::OPERAND_IMMEDIATE:
+ if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) {
+ ErrInfo = "Expected immediate, but got non-immediate";
+ return false;
+ }
+ // Fall-through
+ default:
+ continue;
+ }
+
+ if (!MI->getOperand(i).isReg())
+ continue;
+
+ int RegClass = Desc.OpInfo[i].RegClass;
+ if (RegClass != -1) {
+ unsigned Reg = MI->getOperand(i).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+
+ const TargetRegisterClass *RC = RI.getRegClass(RegClass);
+ if (!RC->contains(Reg)) {
+ ErrInfo = "Operand has incorrect register class.";
+ return false;
+ }
+ }
+ }
+
+
// Verify VOP*
if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
unsigned ConstantBusCount = 0;
@@ -373,10 +516,20 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
case AMDGPU::COPY: return AMDGPU::COPY;
case AMDGPU::PHI: return AMDGPU::PHI;
+ case AMDGPU::S_MOV_B32:
+ return MI.getOperand(1).isReg() ?
+ AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
case AMDGPU::S_ADD_I32: return AMDGPU::V_ADD_I32_e32;
case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
case AMDGPU::S_SUB_I32: return AMDGPU::V_SUB_I32_e32;
case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
+ case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
+ case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
+ case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
+ case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
+ case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
+ case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
+ case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
@@ -432,6 +585,84 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
MO.ChangeToRegister(Reg, false);
}
+unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
+ MachineRegisterInfo &MRI,
+ MachineOperand &SuperReg,
+ const TargetRegisterClass *SuperRC,
+ unsigned SubIdx,
+ const TargetRegisterClass *SubRC)
+ const {
+ assert(SuperReg.isReg());
+
+ unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
+ unsigned SubReg = MRI.createVirtualRegister(SubRC);
+
+ // Just in case the super register is itself a sub-register, copy it to a new
+  // value so we don't need to worry about merging its subreg index with the
+ // SubIdx passed to this function. The register coalescer should be able to
+ // eliminate this extra copy.
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
+ NewSuperReg)
+ .addOperand(SuperReg);
+
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
+ SubReg)
+ .addReg(NewSuperReg, 0, SubIdx);
+ return SubReg;
+}
+
+MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
+ MachineBasicBlock::iterator MII,
+ MachineRegisterInfo &MRI,
+ MachineOperand &Op,
+ const TargetRegisterClass *SuperRC,
+ unsigned SubIdx,
+ const TargetRegisterClass *SubRC) const {
+ if (Op.isImm()) {
+ // XXX - Is there a better way to do this?
+ if (SubIdx == AMDGPU::sub0)
+ return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
+ if (SubIdx == AMDGPU::sub1)
+ return MachineOperand::CreateImm(Op.getImm() >> 32);
+
+ llvm_unreachable("Unhandled register index for immediate");
+ }
+
+ unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
+ SubIdx, SubRC);
+ return MachineOperand::CreateReg(SubReg, false);
+}
+
+unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineBasicBlock::iterator MI,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC,
+ const MachineOperand &Op) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned Dst = MRI.createVirtualRegister(RC);
+
+ MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+ LoDst)
+ .addImm(Op.getImm() & 0xFFFFFFFF);
+ MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+ HiDst)
+ .addImm(Op.getImm() >> 32);
+
+ BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst)
+ .addReg(LoDst)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiDst)
+ .addImm(AMDGPU::sub1);
+
+ Worklist.push_back(Lo);
+ Worklist.push_back(Hi);
+
+ return Dst;
+}
+
void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
@@ -549,6 +780,110 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
MI->getOperand(i).setReg(DstReg);
}
}
+
+ // Legalize MUBUF* instructions
+ // FIXME: If we start using the non-addr64 instructions for compute, we
+ // may need to legalize them here.
+
+ int SRsrcIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::srsrc);
+ int VAddrIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::vaddr);
+ if (SRsrcIdx != -1 && VAddrIdx != -1) {
+ const TargetRegisterClass *VAddrRC =
+ RI.getRegClass(get(MI->getOpcode()).OpInfo[VAddrIdx].RegClass);
+
+ if(VAddrRC->getSize() == 8 &&
+ MRI.getRegClass(MI->getOperand(SRsrcIdx).getReg()) != VAddrRC) {
+ // We have a MUBUF instruction that uses a 64-bit vaddr register and
+ // srsrc has the incorrect register class. In order to fix this, we
+ // need to extract the pointer from the resource descriptor (srsrc),
+      // add it to the value of vaddr, then store the result in the vaddr
+ // operand. Then, we need to set the pointer field of the resource
+ // descriptor to zero.
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ MachineOperand &SRsrcOp = MI->getOperand(SRsrcIdx);
+ MachineOperand &VAddrOp = MI->getOperand(VAddrIdx);
+ unsigned SRsrcPtrLo, SRsrcPtrHi, VAddrLo, VAddrHi;
+ unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+ unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+ unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+
+ // SRsrcPtrLo = srsrc:sub0
+ SRsrcPtrLo = buildExtractSubReg(MI, MRI, SRsrcOp,
+ &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
+
+ // SRsrcPtrHi = srsrc:sub1
+ SRsrcPtrHi = buildExtractSubReg(MI, MRI, SRsrcOp,
+ &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
+
+ // VAddrLo = vaddr:sub0
+ VAddrLo = buildExtractSubReg(MI, MRI, VAddrOp,
+ &AMDGPU::VReg_64RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
+
+ // VAddrHi = vaddr:sub1
+ VAddrHi = buildExtractSubReg(MI, MRI, VAddrOp,
+ &AMDGPU::VReg_64RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
+
+ // NewVaddrLo = SRsrcPtrLo + VAddrLo
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
+ NewVAddrLo)
+ .addReg(SRsrcPtrLo)
+ .addReg(VAddrLo)
+ .addReg(AMDGPU::VCC, RegState::Define | RegState::Implicit);
+
+ // NewVaddrHi = SRsrcPtrHi + VAddrHi
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
+ NewVAddrHi)
+ .addReg(SRsrcPtrHi)
+ .addReg(VAddrHi)
+ .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
+ .addReg(AMDGPU::VCC, RegState::Implicit);
+
+ // NewVaddr = {NewVaddrHi, NewVaddrLo}
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+ NewVAddr)
+ .addReg(NewVAddrLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(NewVAddrHi)
+ .addImm(AMDGPU::sub1);
+
+ // Zero64 = 0
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
+ Zero64)
+ .addImm(0);
+
+ // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+ SRsrcFormatLo)
+ .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
+
+ // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+ SRsrcFormatHi)
+ .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
+
+ // NewSRsrc = {Zero64, SRsrcFormat}
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+ NewSRsrc)
+ .addReg(Zero64)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(SRsrcFormatLo)
+ .addImm(AMDGPU::sub2)
+ .addReg(SRsrcFormatHi)
+ .addImm(AMDGPU::sub3);
+
+ // Update the instruction to use NewVaddr
+ MI->getOperand(VAddrIdx).setReg(NewVAddr);
+ // Update the instruction to use NewSRsrc
+ MI->getOperand(SRsrcIdx).setReg(NewSRsrc);
+ }
+ }
}
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
@@ -557,11 +892,68 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
while (!Worklist.empty()) {
MachineInstr *Inst = Worklist.pop_back_val();
- unsigned NewOpcode = getVALUOp(*Inst);
- if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
+ MachineBasicBlock *MBB = Inst->getParent();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ // Handle some special cases
+ switch(Inst->getOpcode()) {
+ case AMDGPU::S_MOV_B64: {
+ DebugLoc DL = Inst->getDebugLoc();
+
+ // If the source operand is a register we can replace this with a
+ // copy.
+ if (Inst->getOperand(1).isReg()) {
+ MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY))
+ .addOperand(Inst->getOperand(0))
+ .addOperand(Inst->getOperand(1));
+ Worklist.push_back(Copy);
+ } else {
+ // Otherwise, we need to split this into two movs, because there is
+ // no 64-bit VALU move instruction.
+ unsigned Reg = Inst->getOperand(0).getReg();
+ unsigned Dst = split64BitImm(Worklist,
+ Inst,
+ MRI,
+ MRI.getRegClass(Reg),
+ Inst->getOperand(1));
+ MRI.replaceRegWith(Reg, Dst);
+ }
+ Inst->eraseFromParent();
+ continue;
+ }
+ case AMDGPU::S_AND_B64:
+ splitScalar64BitOp(Worklist, Inst, AMDGPU::S_AND_B32);
+ Inst->eraseFromParent();
continue;
- MachineRegisterInfo &MRI = Inst->getParent()->getParent()->getRegInfo();
+ case AMDGPU::S_OR_B64:
+ splitScalar64BitOp(Worklist, Inst, AMDGPU::S_OR_B32);
+ Inst->eraseFromParent();
+ continue;
+
+ case AMDGPU::S_XOR_B64:
+ splitScalar64BitOp(Worklist, Inst, AMDGPU::S_XOR_B32);
+ Inst->eraseFromParent();
+ continue;
+
+ case AMDGPU::S_NOT_B64:
+ splitScalar64BitOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+ Inst->eraseFromParent();
+ continue;
+
+ case AMDGPU::S_BFE_U64:
+ case AMDGPU::S_BFE_I64:
+ case AMDGPU::S_BFM_B64:
+ llvm_unreachable("Moving this op to VALU not implemented");
+ }
+
+ unsigned NewOpcode = getVALUOp(*Inst);
+ if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
+ // We cannot move this instruction to the VALU, so we should try to
+ // legalize its operands instead.
+ legalizeOperands(Inst);
+ continue;
+ }
// Use the new VALU Opcode.
const MCInstrDesc &NewDesc = get(NewOpcode);
@@ -620,7 +1012,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
E = MRI.use_end(); I != E; ++I) {
- MachineInstr &UseMI = *I;
+ MachineInstr &UseMI = *I->getParent();
if (!canReadVGPR(UseMI, I.getOperandNo())) {
Worklist.push_back(&UseMI);
}
@@ -642,6 +1034,69 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
return &AMDGPU::VReg_32RegClass;
}
+void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst->getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineOperand &Dest = Inst->getOperand(0);
+ MachineOperand &Src0 = Inst->getOperand(1);
+ MachineOperand &Src1 = Inst->getOperand(2);
+ DebugLoc DL = Inst->getDebugLoc();
+
+ MachineBasicBlock::iterator MII = Inst;
+
+ const MCInstrDesc &InstDesc = get(Opcode);
+ const TargetRegisterClass *Src0RC = Src0.isReg() ?
+ MRI.getRegClass(Src0.getReg()) :
+ &AMDGPU::SGPR_32RegClass;
+
+ const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *Src1RC = Src1.isReg() ?
+ MRI.getRegClass(Src1.getReg()) :
+ &AMDGPU::SGPR_32RegClass;
+
+ const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
+
+ MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub0, Src0SubRC);
+ MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+ AMDGPU::sub0, Src1SubRC);
+
+ const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+ const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
+
+ unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
+ MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+ .addOperand(SrcReg0Sub0)
+ .addOperand(SrcReg1Sub0);
+
+ MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub1, Src0SubRC);
+ MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+ AMDGPU::sub1, Src1SubRC);
+
+ unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
+ MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+ .addOperand(SrcReg0Sub1)
+ .addOperand(SrcReg1Sub1);
+
+ unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+ // Try to legalize the operands in case we need to swap the order to keep it
+ // valid.
+ Worklist.push_back(LoHalf);
+ Worklist.push_back(HiHalf);
+}
+
MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
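
The new isInlineConstant(const APInt &) above accepts small integers in [-16, 64] plus the bit patterns of the few float constants the hardware can encode inline. A standalone sketch of the same predicate on a raw 32-bit value (plain C++, assuming IEEE-754 single precision; not the APInt interface):

    #include <cstdint>
    #include <cstring>

    static uint32_t bitsOf(float F) {
      uint32_t B;
      std::memcpy(&B, &F, sizeof(B));
      return B;
    }

    // SI inline-constant rule: an integer in [-16, 64], or a bit pattern equal
    // to one of the encodable floating-point constants.
    static bool isInlineConstant32(uint32_t Bits) {
      int32_t Val = static_cast<int32_t>(Bits);
      if (Val >= -16 && Val <= 64)
        return true;
      const float Encodable[] = {0.0f, 0.5f, -0.5f, 1.0f, -1.0f,
                                 2.0f, -2.0f, 4.0f, -4.0f};
      for (float F : Encodable)
        if (bitsOf(F) == Bits)
          return true;
      return false;
    }
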
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 4af6348..c537038 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -25,42 +25,79 @@ class SIInstrInfo : public AMDGPUInstrInfo {
private:
const SIRegisterInfo RI;
- MachineInstrBuilder buildIndirectIndexLoop(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned OffsetVGPR,
- unsigned MovRelOp,
- unsigned Dst,
- unsigned Src0) const;
- // If you add or remove instructions from this function, you will
+ unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
+ MachineRegisterInfo &MRI,
+ MachineOperand &SuperReg,
+ const TargetRegisterClass *SuperRC,
+ unsigned SubIdx,
+ const TargetRegisterClass *SubRC) const;
+ MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI,
+ MachineRegisterInfo &MRI,
+ MachineOperand &SuperReg,
+ const TargetRegisterClass *SuperRC,
+ unsigned SubIdx,
+ const TargetRegisterClass *SubRC) const;
+
+ unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineBasicBlock::iterator MI,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC,
+ const MachineOperand &Op) const;
+
+ void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> & Worklist,
+ MachineInstr *Inst, unsigned Opcode) const;
+
public:
explicit SIInstrInfo(AMDGPUTargetMachine &tm);
- const SIRegisterInfo &getRegisterInfo() const;
+ const SIRegisterInfo &getRegisterInfo() const {
+ return RI;
+ }
virtual void copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+
unsigned commuteOpcode(unsigned Opcode) const;
virtual MachineInstr *commuteInstruction(MachineInstr *MI,
bool NewMI=false) const;
- virtual unsigned getIEQOpcode() const { assert(!"Implement"); return 0;}
+ bool isTriviallyReMaterializable(const MachineInstr *MI,
+ AliasAnalysis *AA = 0) const;
+
+ virtual unsigned getIEQOpcode() const {
+ llvm_unreachable("Unimplemented");
+ }
+
MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned DstReg, unsigned SrcReg) const;
virtual bool isMov(unsigned Opcode) const;
virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+ bool isDS(uint16_t Opcode) const;
int isMIMG(uint16_t Opcode) const;
int isSMRD(uint16_t Opcode) const;
bool isVOP1(uint16_t Opcode) const;
bool isVOP2(uint16_t Opcode) const;
bool isVOP3(uint16_t Opcode) const;
bool isVOPC(uint16_t Opcode) const;
+ bool isInlineConstant(const APInt &Imm) const;
bool isInlineConstant(const MachineOperand &MO) const;
bool isLiteralConstant(const MachineOperand &MO) const;
@@ -69,6 +106,7 @@ public:
bool isSALUInstr(const MachineInstr &MI) const;
static unsigned getVALUOp(const MachineInstr &MI);
+
bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
/// \brief Return the correct register class for \p OpNo. For target-specific
@@ -132,6 +170,9 @@ namespace AMDGPU {
int getCommuteRev(uint16_t Opcode);
int getCommuteOrig(uint16_t Opcode);
+ const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
+
+
} // End namespace AMDGPU
} // End namespace llvm
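
splitScalar64BitOp, declared above and defined in the SIInstrInfo.cpp hunks earlier, relies on bitwise operations having no cross-bit carries: a 64-bit AND/OR/XOR can be rewritten as the same 32-bit operation applied to the sub0 and sub1 halves and recombined with REG_SEQUENCE. A minimal sketch of that identity (plain C++; the helper is illustrative only):

    #include <cstdint>

    // Apply a 32-bit bitwise operation to both halves of 64-bit operands and
    // recombine, e.g.
    //   apply64ViaHalves(a, b, [](uint32_t x, uint32_t y) { return x & y; });
    template <typename Op32>
    static uint64_t apply64ViaHalves(uint64_t A, uint64_t B, Op32 Op) {
      uint32_t Lo = Op(static_cast<uint32_t>(A), static_cast<uint32_t>(B));
      uint32_t Hi = Op(static_cast<uint32_t>(A >> 32), static_cast<uint32_t>(B >> 32));
      return (static_cast<uint64_t>(Hi) << 32) | Lo;
    }
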
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 4cd0daa..e05ab65 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -75,15 +75,14 @@ def HI32f : SDNodeXForm<fpimm, [{
return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32);
}]>;
-def IMM8bitDWORD : ImmLeaf <
- i32, [{
- return (Imm & ~0x3FC) == 0;
- }], SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(
- N->getZExtValue() >> 2, MVT::i32);
- }]>
+def IMM8bitDWORD : PatLeaf <(imm),
+ [{return (N->getZExtValue() & ~0x3FC) == 0;}]
>;
+def as_dword_i32imm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() >> 2, MVT::i32);
+}]>;
+
def as_i1imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i1);
}]>;
@@ -96,10 +95,27 @@ def as_i16imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i16);
}]>;
+def as_i32imm: SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32);
+}]>;
+
+def IMM8bit : PatLeaf <(imm),
+ [{return isUInt<8>(N->getZExtValue());}]
+>;
+
def IMM12bit : PatLeaf <(imm),
[{return isUInt<12>(N->getZExtValue());}]
>;
+def IMM16bit : PatLeaf <(imm),
+ [{return isUInt<16>(N->getZExtValue());}]
+>;
+
+def mubuf_vaddr_offset : PatFrag<
+ (ops node:$ptr, node:$offset, node:$imm_offset),
+ (add (add node:$ptr, node:$offset), node:$imm_offset)
+>;
+
class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
return
(*(const SITargetLowering *)getTargetLowering()).analyzeImmediate(N) == 0;
@@ -121,7 +137,7 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{
return false;
}]>;
-def FRAMEri64 : Operand<iPTR> {
+def FRAMEri32 : Operand<iPTR> {
let MIOperandInfo = (ops SReg_32:$ptr, i32imm:$index);
}
@@ -290,10 +306,10 @@ multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern,
: VOP2_Helper <op, VReg_64, VSrc_64, opName, pattern, revOp>;
multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
- string revOp = opName> {
+ RegisterClass src0_rc, string revOp = opName> {
def _e32 : VOP2 <
- op, (outs VReg_32:$dst), (ins VSrc_32:$src0, VReg_32:$src1),
+ op, (outs VReg_32:$dst), (ins src0_rc:$src0, VReg_32:$src1),
opName#"_e32 $dst, $src0, $src1", pattern
>, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
@@ -375,39 +391,72 @@ class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
// Vector I/O classes
//===----------------------------------------------------------------------===//
-class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> :
+ DS <op, outs, ins, asm, pat> {
+ bits<16> offset;
+
+  // Single loads interpret the two i8imm operands as a single i16 offset.
+ let offset0 = offset{7-0};
+ let offset1 = offset{15-8};
+}
+
+class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
op,
(outs regClass:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, VReg_32:$data1,
- i8imm:$offset0, i8imm:$offset1),
- asm#" $vdst, $gds, $addr, $data0, $data1, $offset0, $offset1, [M0]",
+ (ins i1imm:$gds, VReg_32:$addr, i16imm:$offset),
+ asm#" $vdst, $addr, $offset, [M0]",
[]> {
+ let data0 = 0;
+ let data1 = 0;
let mayLoad = 1;
let mayStore = 0;
}
-class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+ op,
+ (outs regClass:$vdst),
+ (ins i1imm:$gds, VReg_32:$addr, i8imm:$offset0, i8imm:$offset1),
+ asm#" $gds, $vdst, $addr, $offset0, $offset1, [M0]",
+ []> {
+ let data0 = 0;
+ let data1 = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+}
+
+class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
op,
(outs),
- (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, VReg_32:$data1,
- i8imm:$offset0, i8imm:$offset1),
- asm#" $gds, $addr, $data0, $data1, $offset0, $offset1, [M0]",
+ (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, i16imm:$offset),
+ asm#" $addr, $data0, $offset [M0]",
[]> {
+ let data1 = 0;
let mayStore = 1;
let mayLoad = 0;
let vdst = 0;
}
-class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS <
+class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
+ op,
+ (outs),
+ (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, i8imm:$offset0, i8imm:$offset1),
+ asm#" $addr, $data0, $data1, $offset0, $offset1 [M0]",
+ []> {
+ let mayStore = 1;
+ let mayLoad = 0;
+ let vdst = 0;
+}
+
+class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
op,
(outs rc:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, i8imm:$offset0,
- i8imm:$offset1),
- asm#" $gds, $vdst, $addr, $data0, $offset0, $offset1, [M0]",
+ (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, i16imm:$offset),
+ asm#" $vdst, $addr, $data0, $offset, [M0]",
[]> {
+
+ let data1 = 0;
let mayStore = 1;
let mayLoad = 1;
- let data1 = 0;
}
class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
@@ -425,26 +474,48 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU
multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
- let glc = 0, lds = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */,
- mayLoad = 1 in {
-
- let offen = 1, idxen = 0, addr64 = 0, offset = 0 in {
- def _OFFEN : MUBUF <op, (outs regClass:$vdata),
- (ins SReg_128:$srsrc, VReg_32:$vaddr),
- asm#" $vdata, $srsrc + $vaddr", []>;
- }
-
- let offen = 0, idxen = 1, addr64 = 0 in {
- def _IDXEN : MUBUF <op, (outs regClass:$vdata),
- (ins SReg_128:$srsrc, VReg_32:$vaddr, i16imm:$offset),
- asm#" $vdata, $srsrc[$vaddr] + $offset", []>;
- }
+ let lds = 0, mayLoad = 1 in {
+
+ let addr64 = 0 in {
+
+ let offen = 0, idxen = 0 in {
+ def _OFFSET : MUBUF <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VReg_32:$vaddr,
+ i16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
+ i1imm:$slc, i1imm:$tfe),
+ asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+ }
+
+ let offen = 1, idxen = 0, offset = 0 in {
+ def _OFFEN : MUBUF <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VReg_32:$vaddr,
+ SSrc_32:$soffset, i1imm:$glc, i1imm:$slc,
+ i1imm:$tfe),
+ asm#" $vdata, $srsrc + $vaddr + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+ }
+
+ let offen = 0, idxen = 1 in {
+ def _IDXEN : MUBUF <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VReg_32:$vaddr,
+ i16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
+ i1imm:$slc, i1imm:$tfe),
+ asm#" $vdata, $srsrc[$vaddr] + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+ }
+
+ let offen = 1, idxen = 1 in {
+ def _BOTHEN : MUBUF <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VReg_64:$vaddr,
+ SSrc_32:$soffset, i1imm:$glc,
+ i1imm:$slc, i1imm:$tfe),
+ asm#" $vdata, $srsrc[$vaddr[0]] + $vaddr[1] + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+ }
+ }
- let offen = 0, idxen = 0, addr64 = 1 in {
- def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
- (ins SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset),
- asm#" $vdata, $srsrc + $vaddr + $offset", []>;
- }
+ let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in {
+ def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset),
+ asm#" $vdata, $srsrc + $vaddr + $offset", []>;
+ }
}
}
@@ -598,4 +669,12 @@ def getCommuteOrig : InstrMapping {
let ValueCols = [["1"]];
}
+def isDS : InstrMapping {
+ let FilterClass = "DS";
+ let RowFields = ["Inst"];
+ let ColFields = ["Size"];
+ let KeyCol = ["8"];
+ let ValueCols = [["8"]];
+}
+
include "SIInstructions.td"
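
The IMM8bitDWORD leaf above only matches offsets that are dword aligned and still fit in 8 bits once divided by 4; as_dword_i32imm then encodes the offset in dwords. A small standalone check of the same condition (plain C++, illustrative):

    #include <cstdint>

    // Encodable when the only set bits lie in [2, 9], i.e. a multiple of 4
    // no larger than 0x3FC.
    static bool isImm8bitDword(uint32_t Offset) {
      return (Offset & ~0x3FCu) == 0;
    }

    // The field actually encoded in the instruction is the offset in dwords.
    static uint32_t encodeDwordOffset(uint32_t Offset) {
      return Offset >> 2;
    }
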
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 76f05eb..5232139 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -22,9 +22,16 @@ def InterpSlot : Operand<i32> {
let PrintMethod = "printInterpSlot";
}
+def SendMsgImm : Operand<i32> {
+ let PrintMethod = "printSendMsg";
+}
+
def isSI : Predicate<"Subtarget.getGeneration() "
">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
+def isCI : Predicate<"Subtarget.getGeneration() "
+ ">= AMDGPUSubtarget::SEA_ISLANDS">;
+
def WAIT_FLAG : InstFlag<"printWaitFlag">;
let Predicates = [isSI] in {
@@ -401,11 +408,25 @@ def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>;
def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>;
def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>;
+def DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "DS_WRITE_B64", VReg_64>;
+
def DS_READ_B32 : DS_Load_Helper <0x00000036, "DS_READ_B32", VReg_32>;
def DS_READ_I8 : DS_Load_Helper <0x00000039, "DS_READ_I8", VReg_32>;
def DS_READ_U8 : DS_Load_Helper <0x0000003a, "DS_READ_U8", VReg_32>;
def DS_READ_I16 : DS_Load_Helper <0x0000003b, "DS_READ_I16", VReg_32>;
def DS_READ_U16 : DS_Load_Helper <0x0000003c, "DS_READ_U16", VReg_32>;
+def DS_READ_B64 : DS_Load_Helper <0x00000076, "DS_READ_B64", VReg_64>;
+
+// 2 forms.
+def DS_WRITE2_B32 : DS_Load2_Helper <0x0000000E, "DS_WRITE2_B32", VReg_64>;
+def DS_WRITE2_B64 : DS_Load2_Helper <0x0000004E, "DS_WRITE2_B64", VReg_128>;
+
+def DS_READ2_B32 : DS_Load2_Helper <0x00000037, "DS_READ2_B32", VReg_64>;
+def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "DS_READ2_B64", VReg_128>;
+
+// TODO: DS_READ2ST64_B32, DS_READ2ST64_B64,
+// DS_WRITE2ST64_B32, DS_WRITE2ST64_B64
+
//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
@@ -624,7 +645,18 @@ let neverHasSideEffects = 1, isMoveImm = 1 in {
defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
} // End neverHasSideEffects = 1, isMoveImm = 1
-defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>;
+let Uses = [EXEC] in {
+
+def V_READFIRSTLANE_B32 : VOP1 <
+ 0x00000002,
+ (outs SReg_32:$vdst),
+ (ins VReg_32:$src0),
+ "V_READFIRSTLANE_B32 $vdst, $src0",
+ []
+>;
+
+}
+
defm V_CVT_I32_F64 : VOP1_32_64 <0x00000003, "V_CVT_I32_F64",
[(set i32:$dst, (fp_to_sint f64:$src0))]
>;
@@ -826,17 +858,25 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER",
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16",
[]
>;
-} // End hasSideEffects
//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
-//def S_SENDMSG : SOPP_ <0x00000010, "S_SENDMSG", []>;
+
+let Uses = [EXEC] in {
+ def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16",
+ [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)]
+ > {
+ let DisableEncoding = "$m0";
+ }
+} // End Uses = [EXEC]
+
//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
+} // End hasSideEffects
def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
(ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc),
@@ -864,20 +904,21 @@ def : Pat <
(EXTRACT_SUBREG $val, sub0)
>;
-//use two V_CNDMASK_B32_e64 instructions for f64
-def : Pat <
- (f64 (select i1:$src2, f64:$src1, f64:$src0)),
- (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
- (V_CNDMASK_B32_e64 (EXTRACT_SUBREG $src0, sub0),
- (EXTRACT_SUBREG $src1, sub0),
- $src2), sub0),
- (V_CNDMASK_B32_e64 (EXTRACT_SUBREG $src0, sub1),
- (EXTRACT_SUBREG $src1, sub1),
- $src2), sub1)
+def V_READLANE_B32 : VOP2 <
+ 0x00000001,
+ (outs SReg_32:$vdst),
+ (ins VReg_32:$src0, SSrc_32:$vsrc1),
+ "V_READLANE_B32 $vdst, $src0, $vsrc1",
+ []
>;
-defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>;
-defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>;
+def V_WRITELANE_B32 : VOP2 <
+ 0x00000002,
+ (outs VReg_32:$vdst),
+ (ins SReg_32:$src0, SSrc_32:$vsrc1),
+ "V_WRITELANE_B32 $vdst, $src0, $vsrc1",
+ []
+>;
let isCommutable = 1 in {
defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32",
@@ -924,51 +965,32 @@ defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
-defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32",
- [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
->;
-defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32",
- [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
->;
-defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32",
- [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
->;
-defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32",
- [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
->;
+defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>;
+defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>;
+defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>;
+defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>;
-defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32",
- [(set i32:$dst, (srl i32:$src0, i32:$src1))]
->;
+defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>;
defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">;
-defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32",
- [(set i32:$dst, (sra i32:$src0, i32:$src1))]
->;
+defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>;
defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">;
let hasPostISelHook = 1 in {
-defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
- [(set i32:$dst, (shl i32:$src0, i32:$src1))]
->;
+defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>;
}
defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">;
-defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
- [(set i32:$dst, (and i32:$src0, i32:$src1))]
->;
-defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
- [(set i32:$dst, (or i32:$src0, i32:$src1))]
->;
-defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
- [(set i32:$dst, (xor i32:$src0, i32:$src1))]
->;
+defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", []>;
+defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", []>;
+defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", []>;
} // End isCommutable = 1
-defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>;
+defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32",
+ [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
@@ -979,14 +1001,16 @@ defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
-defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", []>;
-defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", []>;
-defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], "V_SUB_I32">;
+defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", [], VSrc_32>;
+defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", [], VSrc_32>;
+defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], VSrc_32,
+ "V_SUB_I32">;
let Uses = [VCC] in { // Carry-in comes from VCC
-defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", []>;
-defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", []>;
-defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], "V_SUBB_U32">;
+defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", [], VReg_32>;
+defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", [], VReg_32>;
+defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], VReg_32,
+ "V_SUBB_U32">;
} // End Uses = [VCC]
} // End isCommutable = 1, Defs = [VCC]
@@ -1033,9 +1057,16 @@ def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
-def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
-def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
-def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
+
+let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
+def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32",
+ [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>;
+def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32",
+ [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>;
+}
+
+def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32",
+ [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))]>;
defm : BFIPatterns <V_BFI_B32>;
def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
[(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))]
@@ -1154,10 +1185,18 @@ def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32",
} // End Uses = [SCC]
} // End Defs = [SCC]
-def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>;
-def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>;
-def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>;
-def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>;
+def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32",
+ [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
+>;
+def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32",
+ [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
+>;
+def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32",
+ [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
+>;
+def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32",
+ [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
+>;
def S_CSELECT_B32 : SOP2 <
0x0000000a, (outs SReg_32:$dst),
@@ -1167,7 +1206,9 @@ def S_CSELECT_B32 : SOP2 <
def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
-def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>;
+def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32",
+ [(set i32:$dst, (and i32:$src0, i32:$src1))]
+>;
def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",
[(set i64:$dst, (and i64:$src0, i64:$src1))]
@@ -1178,13 +1219,23 @@ def : Pat <
(S_AND_B64 $src0, $src1)
>;
-def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>;
-def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>;
+def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32",
+ [(set i32:$dst, (or i32:$src0, i32:$src1))]
+>;
+
+def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64",
+ [(set i64:$dst, (or i64:$src0, i64:$src1))]
+>;
+
def : Pat <
(i1 (or i1:$src0, i1:$src1)),
(S_OR_B64 $src0, $src1)
>;
-def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>;
+
+def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32",
+ [(set i32:$dst, (xor i32:$src0, i32:$src1))]
+>;
+
def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64",
[(set i1:$dst, (xor i1:$src0, i1:$src1))]
>;
@@ -1305,8 +1356,8 @@ def SI_END_CF : InstSI <
def SI_KILL : InstSI <
(outs),
- (ins VReg_32:$src),
- "SI_KIL $src",
+ (ins VSrc_32:$src),
+ "SI_KILL $src",
[(int_AMDGPU_kill f32:$src)]
>;
@@ -1315,13 +1366,13 @@ def SI_KILL : InstSI <
let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
-//defm SI_ : RegisterLoadStore <VReg_32, FRAMEri64, ADDRIndirect>;
+//defm SI_ : RegisterLoadStore <VReg_32, FRAMEri, ADDRIndirect>;
let UseNamedOperandTable = 1 in {
def SI_RegisterLoad : AMDGPUShaderInst <
(outs VReg_32:$dst, SReg_64:$temp),
- (ins FRAMEri64:$addr, i32imm:$chan),
+ (ins FRAMEri32:$addr, i32imm:$chan),
"", []
> {
let isRegisterLoad = 1;
@@ -1330,7 +1381,7 @@ def SI_RegisterLoad : AMDGPUShaderInst <
class SIRegStore<dag outs> : AMDGPUShaderInst <
outs,
- (ins VReg_32:$val, FRAMEri64:$addr, i32imm:$chan),
+ (ins VReg_32:$val, FRAMEri32:$addr, i32imm:$chan),
"", []
> {
let isRegisterStore = 1;
@@ -1397,13 +1448,13 @@ def : Pat<
def : Pat <
(int_AMDGPU_kilp),
- (SI_KILL (V_MOV_B32_e32 0xbf800000))
+ (SI_KILL 0xbf800000)
>;
/* int_SI_vs_load_input */
def : Pat<
(SIload_input i128:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
- (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset)
+ (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0)
>;
/* int_SI_export */
@@ -1637,17 +1688,25 @@ def : BitConvert <f64, i64, VReg_64>;
def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
def : BitConvert <v2i32, i64, VReg_64>;
+def : BitConvert <i64, v2i32, VReg_64>;
def : BitConvert <v4f32, v4i32, VReg_128>;
def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4i32, i128, VReg_128>;
def : BitConvert <i128, v4i32, VReg_128>;
+def : BitConvert <v8f32, v8i32, SReg_256>;
+def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8i32, v32i8, SReg_256>;
def : BitConvert <v32i8, v8i32, SReg_256>;
def : BitConvert <v8i32, v32i8, VReg_256>;
+def : BitConvert <v8i32, v8f32, VReg_256>;
+def : BitConvert <v8f32, v8i32, VReg_256>;
def : BitConvert <v32i8, v8i32, VReg_256>;
+def : BitConvert <v16i32, v16f32, VReg_512>;
+def : BitConvert <v16f32, v16i32, VReg_512>;
+
/********** =================== **********/
/********** Src & Dst modifiers **********/
/********** =================== **********/
@@ -1658,16 +1717,30 @@ def : Pat <
0 /* ABS */, 1 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */)
>;
+/********** ================================ **********/
+/********** Floating point absolute/negative **********/
+/********** ================================ **********/
+
+// Manipulate the sign bit directly, as e.g. using the source negation modifier
+// in V_ADD_F32_e64 $src, 0, [...] does not result in -0.0 for $src == +0.0,
+// breaking the piglit *s-floatBitsToInt-neg* tests
+
+// TODO: Look into not implementing isFNegFree/isFAbsFree for SI, and possibly
+// removing these patterns
+
+def : Pat <
+ (fneg (fabs f32:$src)),
+ (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
+>;
+
def : Pat <
(fabs f32:$src),
- (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */),
- 1 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */)
+ (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) /* Clear sign bit */
>;
def : Pat <
(fneg f32:$src),
- (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */),
- 0 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 1 /* NEG */)
+ (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Toggle sign bit */
>;
/********** ================== **********/
@@ -1794,10 +1867,18 @@ def : Pat <
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
>;
+class Ext32Pat <SDNode ext> : Pat <
+ (i32 (ext i1:$src0)),
+ (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0)
+>;
+
+def : Ext32Pat <zext>;
+def : Ext32Pat <anyext>;
+
// 1. Offset as 8bit DWORD immediate
def : Pat <
(SIload_constant i128:$sbase, IMM8bitDWORD:$offset),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, IMM8bitDWORD:$offset)
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
>;
// 2. Offset loaded in an 32bit SGPR
@@ -1809,7 +1890,7 @@ def : Pat <
// 3. Offset in an 32Bit VGPR
def : Pat <
(SIload_constant i128:$sbase, i32:$voff),
- (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff)
+ (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0)
>;
// The multiplication scales from [0,1] to the unsigned integer range
@@ -1839,35 +1920,47 @@ def : Pat <
/********** Load/Store Patterns **********/
/********** ======================= **********/
-class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
- (frag i32:$src0),
- (vt (inst 0, $src0, $src0, $src0, 0, 0))
->;
+multiclass DSReadPat <DS inst, ValueType vt, PatFrag frag> {
+ def : Pat <
+ (vt (frag (add i32:$ptr, (i32 IMM16bit:$offset)))),
+ (inst (i1 0), $ptr, (as_i16imm $offset))
+ >;
-def : DSReadPat <DS_READ_I8, i32, sextloadi8_local>;
-def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>;
-def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
-def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
-def : DSReadPat <DS_READ_B32, i32, local_load>;
-def : Pat <
- (local_load i32:$src0),
- (i32 (DS_READ_B32 0, $src0, $src0, $src0, 0, 0))
->;
+ def : Pat <
+ (frag i32:$src0),
+ (vt (inst 0, $src0, 0))
+ >;
+}
-class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
- (frag i32:$src1, i32:$src0),
- (inst 0, $src0, $src1, $src1, 0, 0)
->;
+defm : DSReadPat <DS_READ_I8, i32, sextloadi8_local>;
+defm : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>;
+defm : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
+defm : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
+defm : DSReadPat <DS_READ_B32, i32, local_load>;
+defm : DSReadPat <DS_READ_B64, i64, local_load>;
-def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
-def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
-def : DSWritePat <DS_WRITE_B32, i32, local_store>;
+multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> {
+ def : Pat <
+ (frag vt:$value, (add i32:$ptr, (i32 IMM16bit:$offset))),
+ (inst (i1 0), $ptr, $value, (as_i16imm $offset))
+ >;
+
+ def : Pat <
+ (frag vt:$src1, i32:$src0),
+ (inst 0, $src0, $src1, 0)
+ >;
+}
+
+defm : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
+defm : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
+defm : DSWritePat <DS_WRITE_B32, i32, local_store>;
+defm : DSWritePat <DS_WRITE_B64, i64, local_store>;
def : Pat <(atomic_load_add_local i32:$ptr, i32:$val),
- (DS_ADD_U32_RTN 0, $ptr, $val, 0, 0)>;
+ (DS_ADD_U32_RTN 0, $ptr, $val, 0)>;
def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val),
- (DS_SUB_U32_RTN 0, $ptr, $val, 0, 0)>;
+ (DS_SUB_U32_RTN 0, $ptr, $val, 0)>;
/********** ================== **********/
/********** SMRD Patterns **********/
@@ -1877,8 +1970,8 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
// 1. Offset as 8bit DWORD immediate
def : Pat <
- (constant_load (SIadd64bit32bit i64:$sbase, IMM8bitDWORD:$offset)),
- (vt (Instr_IMM $sbase, IMM8bitDWORD:$offset))
+ (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))),
+ (vt (Instr_IMM $sbase, (as_dword_i32imm $offset)))
>;
// 2. Offset loaded in an 32bit SGPR
@@ -1911,6 +2004,11 @@ defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
PatFrag global_ld, PatFrag constant_ld> {
def : Pat <
+ (vt (global_ld (mubuf_vaddr_offset i64:$ptr, i64:$offset, IMM12bit:$imm_offset))),
+ (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, (as_i16imm $imm_offset))
+ >;
+
+ def : Pat <
(vt (global_ld (add i64:$ptr, (i64 IMM12bit:$offset)))),
(Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset))
>;
@@ -1953,6 +2051,16 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32,
multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> {
def : Pat <
+ (st vt:$value, (mubuf_vaddr_offset i64:$ptr, i64:$offset, IMM12bit:$imm_offset)),
+ (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, (as_i16imm $imm_offset))
+ >;
+
+ def : Pat <
+ (st vt:$value, (add i64:$ptr, IMM12bit:$offset)),
+ (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset))
+ >;
+
+ def : Pat <
(st vt:$value, i64:$ptr),
(Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
>;
@@ -1970,6 +2078,50 @@ defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, i64, global_store>;
defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, v2i32, global_store>;
defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4, v4i32, global_store>;
+// BUFFER_LOAD_DWORD*, addr64=0
+multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
+ MUBUF bothen> {
+
+ def : Pat <
+ (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset,
+ imm:$offset, 0, 0, imm:$glc, imm:$slc,
+ imm:$tfe)),
+ (offset $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc),
+ (as_i1imm $slc), (as_i1imm $tfe))
+ >;
+
+ def : Pat <
+ (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset,
+ imm, 1, 0, imm:$glc, imm:$slc,
+ imm:$tfe)),
+ (offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $tfe))
+ >;
+
+ def : Pat <
+ (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset,
+ imm:$offset, 0, 1, imm:$glc, imm:$slc,
+ imm:$tfe)),
+ (idxen $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc),
+ (as_i1imm $slc), (as_i1imm $tfe))
+ >;
+
+ def : Pat <
+ (vt (int_SI_buffer_load_dword i128:$rsrc, v2i32:$vaddr, i32:$soffset,
+ imm, 1, 1, imm:$glc, imm:$slc,
+ imm:$tfe)),
+ (bothen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
+ (as_i1imm $tfe))
+ >;
+}
+
+defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN,
+ BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>;
+defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN,
+ BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
+defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
+ BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
+
//===----------------------------------------------------------------------===//
// MTBUF Patterns
//===----------------------------------------------------------------------===//
@@ -1991,11 +2143,60 @@ def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
+let Predicates = [isCI] in {
+
+// Sea Islands new arithmetic instructions
+let neverHasSideEffects = 1 in {
+defm V_TRUNC_F64 : VOP1_64 <0x00000017, "V_TRUNC_F64",
+ [(set f64:$dst, (ftrunc f64:$src0))]
+>;
+defm V_CEIL_F64 : VOP1_64 <0x00000018, "V_CEIL_F64",
+ [(set f64:$dst, (fceil f64:$src0))]
+>;
+defm V_FLOOR_F64 : VOP1_64 <0x0000001A, "V_FLOOR_F64",
+ [(set f64:$dst, (ffloor f64:$src0))]
+>;
+
+defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64", []>;
+
+def V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>;
+def V_MQSAD_U16_U8 : VOP3_32 <0x00000172, "V_MQSAD_U16_U8", []>;
+def V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>;
+def V_MAD_U64_U32 : VOP3_64 <0x00000176, "V_MAD_U64_U32", []>;
+
+// XXX - Does this set VCC?
+def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>;
+} // End neverHasSideEffects = 1
+
+// Remaining instructions:
+// FLAT_*
+// S_CBRANCH_CDBGUSER
+// S_CBRANCH_CDBGSYS
+// S_CBRANCH_CDBGSYS_OR_USER
+// S_CBRANCH_CDBGSYS_AND_USER
+// S_DCACHE_INV_VOL
+// V_EXP_LEGACY_F32
+// V_LOG_LEGACY_F32
+// DS_NOP
+// DS_GWS_SEMA_RELEASE_ALL
+// DS_WRAP_RTN_B32
+// DS_CNDXCHG32_RTN_B64
+// DS_WRITE_B96
+// DS_WRITE_B128
+// DS_CONDXCHG32_RTN_B128
+// DS_READ_B96
+// DS_READ_B128
+// BUFFER_LOAD_DWORDX3
+// BUFFER_STORE_DWORDX3
+
+} // End Predicates = [isCI]
+
+
/********** ====================== **********/
/********** Indirect addressing **********/
/********** ====================== **********/
-multiclass SI_INDIRECT_Pattern <ValueType vt, SI_INDIRECT_DST IndDst> {
+multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST IndDst> {
// 1. Extract with offset
def : Pat<
@@ -2011,21 +2212,26 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, SI_INDIRECT_DST IndDst> {
// 3. Insert with offset
def : Pat<
- (vector_insert vt:$vec, f32:$val, (add i32:$idx, imm:$off)),
+ (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
(IndDst (IMPLICIT_DEF), $vec, $idx, imm:$off, $val)
>;
// 4. Insert without offset
def : Pat<
- (vector_insert vt:$vec, f32:$val, i32:$idx),
+ (vector_insert vt:$vec, eltvt:$val, i32:$idx),
(IndDst (IMPLICIT_DEF), $vec, $idx, 0, $val)
>;
}
-defm : SI_INDIRECT_Pattern <v2f32, SI_INDIRECT_DST_V2>;
-defm : SI_INDIRECT_Pattern <v4f32, SI_INDIRECT_DST_V4>;
-defm : SI_INDIRECT_Pattern <v8f32, SI_INDIRECT_DST_V8>;
-defm : SI_INDIRECT_Pattern <v16f32, SI_INDIRECT_DST_V16>;
+defm : SI_INDIRECT_Pattern <v2f32, f32, SI_INDIRECT_DST_V2>;
+defm : SI_INDIRECT_Pattern <v4f32, f32, SI_INDIRECT_DST_V4>;
+defm : SI_INDIRECT_Pattern <v8f32, f32, SI_INDIRECT_DST_V8>;
+defm : SI_INDIRECT_Pattern <v16f32, f32, SI_INDIRECT_DST_V16>;
+
+defm : SI_INDIRECT_Pattern <v2i32, i32, SI_INDIRECT_DST_V2>;
+defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>;
+defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>;
+defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>;
/********** =============== **********/
/********** Conditions **********/
@@ -2057,6 +2263,11 @@ def : Pat <
(EXTRACT_SUBREG $a, sub0)
>;
+def : Pat <
+ (i1 (trunc i32:$a)),
+ (V_CMP_EQ_I32_e64 (V_AND_B32_e32 (i32 1), $a), 1)
+>;
+
// V_ADD_I32_e32/S_ADD_I32 produces carry in VCC/SCC. For the vector
// case, the sgpr-copies pass will fix this to use the vector version.
def : Pat <
@@ -2064,14 +2275,6 @@ def : Pat <
(S_ADD_I32 $src0, $src1)
>;
-def : Pat <
- (or i64:$a, i64:$b),
- (INSERT_SUBREG
- (INSERT_SUBREG (IMPLICIT_DEF),
- (V_OR_B32_e32 (EXTRACT_SUBREG $a, sub0), (EXTRACT_SUBREG $b, sub0)), sub0),
- (V_OR_B32_e32 (EXTRACT_SUBREG $a, sub1), (EXTRACT_SUBREG $b, sub1)), sub1)
->;
-
//============================================================================//
// Miscellaneous Optimization Patterns
//============================================================================//
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 7fcc964..00e32c0 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -38,6 +38,22 @@ let TargetPrefix = "SI", isTarget = 1 in {
llvm_i32_ty], // tfe(imm)
[]>;
+ // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed
+ def int_SI_buffer_load_dword : Intrinsic <
+ [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+ [llvm_anyint_ty, // rsrc(SGPR)
+ llvm_anyint_ty, // vaddr(VGPR)
+ llvm_i32_ty, // soffset(SGPR)
+ llvm_i32_ty, // inst_offset(imm)
+ llvm_i32_ty, // offen(imm)
+ llvm_i32_ty, // idxen(imm)
+ llvm_i32_ty, // glc(imm)
+ llvm_i32_ty, // slc(imm)
+ llvm_i32_ty], // tfe(imm)
+ [IntrReadArgMem]>;
+
+ def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+
class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
def int_SI_sample : Sample;
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 958763d..c2f8696 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -55,6 +55,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
using namespace llvm;
@@ -67,7 +68,7 @@ private:
static char ID;
const TargetRegisterInfo *TRI;
- const TargetInstrInfo *TII;
+ const SIInstrInfo *TII;
bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
@@ -145,7 +146,9 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
- if (!shouldSkip(&MBB, &MBB.getParent()->back()))
+ if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType !=
+ ShaderType::PIXEL ||
+ !shouldSkip(&MBB, &MBB.getParent()->back()))
return;
MachineBasicBlock::iterator Insert = &MI;
@@ -283,27 +286,36 @@ void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
}
void SILowerControlFlowPass::Branch(MachineInstr &MI) {
- MachineBasicBlock *Next = MI.getParent()->getNextNode();
- MachineBasicBlock *Target = MI.getOperand(0).getMBB();
- if (Target == Next)
+ if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
MI.eraseFromParent();
- else
- assert(0);
+
+ // If these aren't equal, this is probably an infinite loop.
}
void SILowerControlFlowPass::Kill(MachineInstr &MI) {
-
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
+ const MachineOperand &Op = MI.getOperand(0);
- // Kill is only allowed in pixel shaders
+ // Kill is only allowed in pixel / geometry shaders
assert(MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType ==
- ShaderType::PIXEL);
-
- // Clear this pixel from the exec mask if the operand is negative
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
- .addImm(0)
- .addOperand(MI.getOperand(0));
+ ShaderType::PIXEL ||
+ MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType ==
+ ShaderType::GEOMETRY);
+
+ // Clear this thread from the exec mask if the operand is negative
+ if ((Op.isImm() || Op.isFPImm())) {
+ // Constant operand: Set exec mask to 0 or do nothing
+ if (Op.isImm() ? (Op.getImm() & 0x80000000) :
+ Op.getFPImm()->isNegative()) {
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addImm(0);
+ }
+ } else {
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
+ .addImm(0)
+ .addOperand(Op);
+ }
MI.eraseFromParent();
}
@@ -333,12 +345,13 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
.addReg(AMDGPU::EXEC);
// Read the next variant into VCC (lower 32 bits) <- also loop target
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32_e32), AMDGPU::VCC)
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ AMDGPU::VCC_LO)
.addReg(Idx);
// Move index from VCC into M0
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(AMDGPU::VCC);
+ .addReg(AMDGPU::VCC_LO);
// Compare the just read M0 value to all possible Idx values
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
@@ -413,7 +426,7 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
}
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
- TII = MF.getTarget().getInstrInfo();
+ TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
TRI = MF.getTarget().getRegisterInfo();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -426,11 +439,16 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
- for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
- I != MBB.end(); I = Next) {
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
- Next = llvm::next(I);
MachineInstr &MI = *I;
+ if (TII->isDS(MI.getOpcode())) {
+ NeedM0 = true;
+ NeedWQM = true;
+ }
+
switch (MI.getOpcode()) {
default: break;
case AMDGPU::SI_IF:
@@ -491,14 +509,6 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
IndirectDst(MI);
break;
- case AMDGPU::DS_READ_B32:
- NeedWQM = true;
- // Fall through
- case AMDGPU::DS_WRITE_B32:
- case AMDGPU::DS_ADD_U32_RTN:
- NeedM0 = true;
- break;
-
case AMDGPU::V_INTERP_P1_F32:
case AMDGPU::V_INTERP_P2_F32:
case AMDGPU::V_INTERP_MOV_F32:
@@ -517,7 +527,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
AMDGPU::M0).addImm(0xffffffff);
}
- if (NeedWQM && MFI->ShaderType != ShaderType::COMPUTE) {
+ if (NeedWQM && MFI->ShaderType == ShaderType::PIXEL) {
MachineBasicBlock &MBB = MF.front();
BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index 071f9fa..ea04346 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -10,6 +10,10 @@
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define MAX_LANES 64
using namespace llvm;
@@ -19,4 +23,33 @@ void SIMachineFunctionInfo::anchor() {}
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
- PSInputAddr(0) { }
+ PSInputAddr(0),
+ SpillTracker() { }
+
+static unsigned createLaneVGPR(MachineRegisterInfo &MRI) {
+ return MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+}
+
+unsigned SIMachineFunctionInfo::RegSpillTracker::getNextLane(MachineRegisterInfo &MRI) {
+ if (!LaneVGPR) {
+ LaneVGPR = createLaneVGPR(MRI);
+ } else {
+ CurrentLane++;
+ if (CurrentLane == MAX_LANES) {
+ CurrentLane = 0;
+ LaneVGPR = createLaneVGPR(MRI);
+ }
+ }
+ return CurrentLane;
+}
+
+void SIMachineFunctionInfo::RegSpillTracker::addSpilledReg(unsigned FrameIndex,
+ unsigned Reg,
+ int Lane) {
+ SpilledRegisters[FrameIndex] = SpilledReg(Reg, Lane);
+}
+
+const SIMachineFunctionInfo::SpilledReg&
+SIMachineFunctionInfo::RegSpillTracker::getSpilledReg(unsigned FrameIndex) {
+ return SpilledRegisters[FrameIndex];
+}
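
As a hypothetical usage sketch (illustration only; spillSGPR/reloadSGPR are made-up helper names, not part of the patch): each LaneVGPR provides MAX_LANES (64) lanes, getNextLane rolls over to a freshly created VGPR once all 64 are used, and the (VGPR, lane) pair recorded per frame index is what a later reload would consult.

    #include "SIMachineFunctionInfo.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    using namespace llvm;

    // Hypothetical helpers, for illustration only.
    void spillSGPR(SIMachineFunctionInfo &MFI, MachineRegisterInfo &MRI,
                   unsigned FrameIndex) {
      unsigned Lane = MFI.SpillTracker.getNextLane(MRI); // 0..63, fresh VGPR on wrap
      MFI.SpillTracker.addSpilledReg(FrameIndex, MFI.SpillTracker.LaneVGPR, Lane);
    }

    void reloadSGPR(SIMachineFunctionInfo &MFI, unsigned FrameIndex) {
      const SIMachineFunctionInfo::SpilledReg &Spill =
          MFI.SpillTracker.getSpilledReg(FrameIndex);
      // Spill.VGPR / Spill.Lane identify where the value lives; Lane == -1
      // means no lane was recorded for this frame index.
      (void)Spill;
    }
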
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 2f1961c..8dc82a0 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -16,16 +16,44 @@
#define SIMACHINEFUNCTIONINFO_H_
#include "AMDGPUMachineFunction.h"
+#include <map>
namespace llvm {
+class MachineRegisterInfo;
+
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo : public AMDGPUMachineFunction {
virtual void anchor();
public:
+
+ struct SpilledReg {
+ unsigned VGPR;
+ int Lane;
+ SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { }
+ SpilledReg() : VGPR(0), Lane(-1) { }
+ bool hasLane() { return Lane != -1;}
+ };
+
+ struct RegSpillTracker {
+ private:
+ unsigned CurrentLane;
+ std::map<unsigned, SpilledReg> SpilledRegisters;
+ public:
+ unsigned LaneVGPR;
+ RegSpillTracker() : CurrentLane(0), SpilledRegisters(), LaneVGPR(0) { }
+ unsigned getNextLane(MachineRegisterInfo &MRI);
+ void addSpilledReg(unsigned FrameIndex, unsigned Reg, int Lane = -1);
+ const SpilledReg& getSpilledReg(unsigned FrameIndex);
+ bool programSpillsRegisters() { return !SpilledRegisters.empty(); }
+ };
+
+ // SIMachineFunctionInfo definition
+
SIMachineFunctionInfo(const MachineFunction &MF);
unsigned PSInputAddr;
+ struct RegSpillTracker SpillTracker;
};
} // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index ed0bbaf..6cef195 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -56,7 +56,7 @@ const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
}
unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
- return getEncodingValue(Reg);
+ return getEncodingValue(Reg) & 0xff;
}
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
@@ -122,7 +122,7 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
return RC;
// If this register has a sub-register, we can safely assume it is a 32-bit
- // register, becuase all of SI's sub-registers are 32-bit.
+ // register, because all of SI's sub-registers are 32-bit.
if (isSGPRClass(RC)) {
return &AMDGPU::SGPR_32RegClass;
} else {
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 49bdbc9..65cf311 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -17,7 +17,16 @@ class SIReg <string n, bits<16> encoding = 0> : Register<n> {
}
// Special Registers
-def VCC : SIReg<"VCC", 106>;
+def VCC_LO : SIReg<"vcc_lo", 106>;
+def VCC_HI : SIReg<"vcc_hi", 107>;
+
+// VCC for 64-bit instructions
+def VCC : RegisterWithSubRegs<"VCC", [VCC_LO, VCC_HI]> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 106;
+}
+
def EXEC : SIReg<"EXEC", 126>;
def SCC : SIReg<"SCC", 253>;
def M0 : SIReg <"M0", 124>;
@@ -150,7 +159,7 @@ def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>;
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
- (add SGPR_32, M0Reg)
+ (add SGPR_32, M0Reg, VCC_LO)
>;
def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>;
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
index f194d8b..9bf2caf 100644
--- a/lib/Target/R600/SITypeRewriter.cpp
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -22,9 +22,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-
#include "llvm/IR/IRBuilder.h"
-#include "llvm/InstVisitor.h"
+#include "llvm/IR/InstVisitor.h"
using namespace llvm;
@@ -70,11 +69,11 @@ bool SITypeRewriter::runOnFunction(Function &F) {
StringRef Str = A.getValueAsString();
Str.getAsInteger(0, ShaderType);
}
- if (ShaderType != ShaderType::COMPUTE) {
- visit(F);
- }
+ if (ShaderType == ShaderType::COMPUTE)
+ return false;
visit(F);
+ visit(F);
return false;
}
diff --git a/lib/Target/R600/TargetInfo/CMakeLists.txt b/lib/Target/R600/TargetInfo/CMakeLists.txt
index 3d1584e..c3bd26c 100644
--- a/lib/Target/R600/TargetInfo/CMakeLists.txt
+++ b/lib/Target/R600/TargetInfo/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMR600Info
AMDGPUTargetInfo.cpp
)
-
-add_dependencies(LLVMR600Info AMDGPUCommonTableGen intrinsics_gen)
diff --git a/lib/Target/R600/TargetInfo/LLVMBuild.txt b/lib/Target/R600/TargetInfo/LLVMBuild.txt
index 4c6fea4..c3d3cf5 100644
--- a/lib/Target/R600/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/R600/TargetInfo/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = R600Info
parent = R600
-required_libraries = MC Support
+required_libraries = Support
add_to_library_groups = R600
diff --git a/lib/Target/Sparc/AsmParser/CMakeLists.txt b/lib/Target/Sparc/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..7579bfe
--- /dev/null
+++ b/lib/Target/Sparc/AsmParser/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMSparcAsmParser
+ SparcAsmParser.cpp
+ )
diff --git a/lib/Target/Sparc/AsmParser/LLVMBuild.txt b/lib/Target/Sparc/AsmParser/LLVMBuild.txt
new file mode 100644
index 0000000..c3ddf5a
--- /dev/null
+++ b/lib/Target/Sparc/AsmParser/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/Sparc/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = SparcAsmParser
+parent = Sparc
+required_libraries = MC MCParser Support SparcDesc SparcInfo
+add_to_library_groups = Sparc
diff --git a/lib/Target/Sparc/AsmParser/Makefile b/lib/Target/Sparc/AsmParser/Makefile
new file mode 100644
index 0000000..46b3e45
--- /dev/null
+++ b/lib/Target/Sparc/AsmParser/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Sparc/AsmParser/Makefile ------------------*- Makefile-*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSparcAsmParser
+
+# Hack: we need to include the 'main' Sparc target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
new file mode 100644
index 0000000..2ff6cdd
--- /dev/null
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -0,0 +1,951 @@
+//===-- SparcAsmParser.cpp - Parse Sparc assembly to MCInst instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// The generated AsmMatcher SparcGenAsmMatcher uses "Sparc" as the target
+// namespace, but the SPARC backend uses "SP" as its namespace.
+namespace llvm {
+ namespace Sparc {
+ using namespace SP;
+ }
+}
+
+namespace {
+class SparcOperand;
+class SparcAsmParser : public MCTargetAsmParser {
+
+ MCSubtargetInfo &STI;
+ MCAsmParser &Parser;
+
+ /// @name Auto-generated Match Functions
+ /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "SparcGenAsmMatcher.inc"
+
+ /// }
+
+ // public interface of the MCTargetAsmParser.
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm);
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ bool ParseDirective(AsmToken DirectiveID);
+
+ virtual unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
+ unsigned Kind);
+
+ // Custom parse functions for Sparc specific operands.
+ OperandMatchResultTy
+ parseMEMOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+ OperandMatchResultTy
+ parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ StringRef Name);
+
+ OperandMatchResultTy
+ parseSparcAsmOperand(SparcOperand *&Operand, bool isCall = false);
+
+ OperandMatchResultTy
+ parseBranchModifiers(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  // Returns true if Tok matches a register; the register number is returned in RegNo.
+ bool matchRegisterName(const AsmToken &Tok, unsigned &RegNo,
+ unsigned &RegKind);
+
+ bool matchSparcAsmModifiers(const MCExpr *&EVal, SMLoc &EndLoc);
+ bool parseDirectiveWord(unsigned Size, SMLoc L);
+
+ bool is64Bit() const { return STI.getTargetTriple().startswith("sparcv9"); }
+public:
+ SparcAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+ const MCInstrInfo &MII)
+ : MCTargetAsmParser(), STI(sti), Parser(parser) {
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ }
+
+};
+
+ static unsigned IntRegs[32] = {
+ Sparc::G0, Sparc::G1, Sparc::G2, Sparc::G3,
+ Sparc::G4, Sparc::G5, Sparc::G6, Sparc::G7,
+ Sparc::O0, Sparc::O1, Sparc::O2, Sparc::O3,
+ Sparc::O4, Sparc::O5, Sparc::O6, Sparc::O7,
+ Sparc::L0, Sparc::L1, Sparc::L2, Sparc::L3,
+ Sparc::L4, Sparc::L5, Sparc::L6, Sparc::L7,
+ Sparc::I0, Sparc::I1, Sparc::I2, Sparc::I3,
+ Sparc::I4, Sparc::I5, Sparc::I6, Sparc::I7 };
+
+ static unsigned FloatRegs[32] = {
+ Sparc::F0, Sparc::F1, Sparc::F2, Sparc::F3,
+ Sparc::F4, Sparc::F5, Sparc::F6, Sparc::F7,
+ Sparc::F8, Sparc::F9, Sparc::F10, Sparc::F11,
+ Sparc::F12, Sparc::F13, Sparc::F14, Sparc::F15,
+ Sparc::F16, Sparc::F17, Sparc::F18, Sparc::F19,
+ Sparc::F20, Sparc::F21, Sparc::F22, Sparc::F23,
+ Sparc::F24, Sparc::F25, Sparc::F26, Sparc::F27,
+ Sparc::F28, Sparc::F29, Sparc::F30, Sparc::F31 };
+
+ static unsigned DoubleRegs[32] = {
+ Sparc::D0, Sparc::D1, Sparc::D2, Sparc::D3,
+ Sparc::D4, Sparc::D5, Sparc::D6, Sparc::D7,
+    Sparc::D8,  Sparc::D9,  Sparc::D10, Sparc::D11,
+ Sparc::D12, Sparc::D13, Sparc::D14, Sparc::D15,
+ Sparc::D16, Sparc::D17, Sparc::D18, Sparc::D19,
+ Sparc::D20, Sparc::D21, Sparc::D22, Sparc::D23,
+ Sparc::D24, Sparc::D25, Sparc::D26, Sparc::D27,
+ Sparc::D28, Sparc::D29, Sparc::D30, Sparc::D31 };
+
+ static unsigned QuadFPRegs[32] = {
+ Sparc::Q0, Sparc::Q1, Sparc::Q2, Sparc::Q3,
+ Sparc::Q4, Sparc::Q5, Sparc::Q6, Sparc::Q7,
+ Sparc::Q8, Sparc::Q9, Sparc::Q10, Sparc::Q11,
+ Sparc::Q12, Sparc::Q13, Sparc::Q14, Sparc::Q15 };
+
+
+/// SparcOperand - Instances of this class represent a parsed Sparc machine
+/// instruction.
+class SparcOperand : public MCParsedAsmOperand {
+public:
+ enum RegisterKind {
+ rk_None,
+ rk_IntReg,
+ rk_FloatReg,
+ rk_DoubleReg,
+ rk_QuadReg,
+ rk_CCReg,
+ rk_Y
+ };
+private:
+ enum KindTy {
+ k_Token,
+ k_Register,
+ k_Immediate,
+ k_MemoryReg,
+ k_MemoryImm
+ } Kind;
+
+ SMLoc StartLoc, EndLoc;
+
+ SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+ struct Token {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct RegOp {
+ unsigned RegNum;
+ RegisterKind Kind;
+ };
+
+ struct ImmOp {
+ const MCExpr *Val;
+ };
+
+ struct MemOp {
+ unsigned Base;
+ unsigned OffsetReg;
+ const MCExpr *Off;
+ };
+
+ union {
+ struct Token Tok;
+ struct RegOp Reg;
+ struct ImmOp Imm;
+ struct MemOp Mem;
+ };
+public:
+ bool isToken() const { return Kind == k_Token; }
+ bool isReg() const { return Kind == k_Register; }
+ bool isImm() const { return Kind == k_Immediate; }
+ bool isMem() const { return isMEMrr() || isMEMri(); }
+ bool isMEMrr() const { return Kind == k_MemoryReg; }
+ bool isMEMri() const { return Kind == k_MemoryImm; }
+
+ bool isFloatReg() const {
+ return (Kind == k_Register && Reg.Kind == rk_FloatReg);
+ }
+
+ bool isFloatOrDoubleReg() const {
+ return (Kind == k_Register && (Reg.Kind == rk_FloatReg
+ || Reg.Kind == rk_DoubleReg));
+ }
+
+
+ StringRef getToken() const {
+ assert(Kind == k_Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ unsigned getReg() const {
+ assert((Kind == k_Register) && "Invalid access!");
+ return Reg.RegNum;
+ }
+
+ const MCExpr *getImm() const {
+ assert((Kind == k_Immediate) && "Invalid access!");
+ return Imm.Val;
+ }
+
+ unsigned getMemBase() const {
+ assert((Kind == k_MemoryReg || Kind == k_MemoryImm) && "Invalid access!");
+ return Mem.Base;
+ }
+
+ unsigned getMemOffsetReg() const {
+ assert((Kind == k_MemoryReg) && "Invalid access!");
+ return Mem.OffsetReg;
+ }
+
+ const MCExpr *getMemOff() const {
+ assert((Kind == k_MemoryImm) && "Invalid access!");
+ return Mem.Off;
+ }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const {
+ return StartLoc;
+ }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const {
+ return EndLoc;
+ }
+
+ virtual void print(raw_ostream &OS) const {
+ switch (Kind) {
+ case k_Token: OS << "Token: " << getToken() << "\n"; break;
+ case k_Register: OS << "Reg: #" << getReg() << "\n"; break;
+ case k_Immediate: OS << "Imm: " << getImm() << "\n"; break;
+ case k_MemoryReg: OS << "Mem: " << getMemBase() << "+"
+ << getMemOffsetReg() << "\n"; break;
+ case k_MemoryImm: assert(getMemOff() != 0);
+ OS << "Mem: " << getMemBase()
+ << "+" << *getMemOff()
+ << "\n"; break;
+ }
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCExpr *Expr = getImm();
+ addExpr(Inst, Expr);
+ }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const{
+ // Add as immediate when possible. Null MCExpr = 0.
+ if (Expr == 0)
+ Inst.addOperand(MCOperand::CreateImm(0));
+ else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(Expr));
+ }
+
+ void addMEMrrOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::CreateReg(getMemBase()));
+
+ assert(getMemOffsetReg() != 0 && "Invalid offset");
+ Inst.addOperand(MCOperand::CreateReg(getMemOffsetReg()));
+ }
+
+ void addMEMriOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::CreateReg(getMemBase()));
+
+ const MCExpr *Expr = getMemOff();
+ addExpr(Inst, Expr);
+ }
+
+ static SparcOperand *CreateToken(StringRef Str, SMLoc S) {
+ SparcOperand *Op = new SparcOperand(k_Token);
+ Op->Tok.Data = Str.data();
+ Op->Tok.Length = Str.size();
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static SparcOperand *CreateReg(unsigned RegNum,
+ unsigned Kind,
+ SMLoc S, SMLoc E) {
+ SparcOperand *Op = new SparcOperand(k_Register);
+ Op->Reg.RegNum = RegNum;
+ Op->Reg.Kind = (SparcOperand::RegisterKind)Kind;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static SparcOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+ SparcOperand *Op = new SparcOperand(k_Immediate);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static SparcOperand *MorphToDoubleReg(SparcOperand *Op) {
+ unsigned Reg = Op->getReg();
+ assert(Op->Reg.Kind == rk_FloatReg);
+ unsigned regIdx = Reg - Sparc::F0;
+ if (regIdx % 2 || regIdx > 31)
+ return 0;
+ Op->Reg.RegNum = DoubleRegs[regIdx / 2];
+ Op->Reg.Kind = rk_DoubleReg;
+ return Op;
+ }
+
+ static SparcOperand *MorphToQuadReg(SparcOperand *Op) {
+ unsigned Reg = Op->getReg();
+ unsigned regIdx = 0;
+ switch (Op->Reg.Kind) {
+ default: assert(0 && "Unexpected register kind!");
+ case rk_FloatReg:
+ regIdx = Reg - Sparc::F0;
+ if (regIdx % 4 || regIdx > 31)
+ return 0;
+ Reg = QuadFPRegs[regIdx / 4];
+ break;
+ case rk_DoubleReg:
+ regIdx = Reg - Sparc::D0;
+ if (regIdx % 2 || regIdx > 31)
+ return 0;
+ Reg = QuadFPRegs[regIdx / 2];
+ break;
+ }
+ Op->Reg.RegNum = Reg;
+ Op->Reg.Kind = rk_QuadReg;
+ return Op;
+ }
+
+ static SparcOperand *MorphToMEMrr(unsigned Base, SparcOperand *Op) {
+ unsigned offsetReg = Op->getReg();
+ Op->Kind = k_MemoryReg;
+ Op->Mem.Base = Base;
+ Op->Mem.OffsetReg = offsetReg;
+ Op->Mem.Off = 0;
+ return Op;
+ }
+
+ static SparcOperand *CreateMEMri(unsigned Base,
+ const MCExpr *Off,
+ SMLoc S, SMLoc E) {
+ SparcOperand *Op = new SparcOperand(k_MemoryImm);
+ Op->Mem.Base = Base;
+ Op->Mem.OffsetReg = 0;
+ Op->Mem.Off = Off;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static SparcOperand *MorphToMEMri(unsigned Base, SparcOperand *Op) {
+ const MCExpr *Imm = Op->getImm();
+ Op->Kind = k_MemoryImm;
+ Op->Mem.Base = Base;
+ Op->Mem.OffsetReg = 0;
+ Op->Mem.Off = Imm;
+ return Op;
+ }
+};
+
+} // end namespace
+
+bool SparcAsmParser::
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
+ MCInst Inst;
+ SmallVector<MCInst, 8> Instructions;
+ unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+ MatchingInlineAsm);
+ switch (MatchResult) {
+ default:
+ break;
+
+ case Match_Success: {
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst, STI);
+ return false;
+ }
+
+ case Match_MissingFeature:
+ return Error(IDLoc,
+ "instruction requires a CPU feature not currently enabled");
+
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = ((SparcOperand*) Operands[ErrorInfo])->getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+ case Match_MnemonicFail:
+ return Error(IDLoc, "invalid instruction mnemonic");
+ }
+ return true;
+}
+
+bool SparcAsmParser::
+ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc)
+{
+ const AsmToken &Tok = Parser.getTok();
+ StartLoc = Tok.getLoc();
+ EndLoc = Tok.getEndLoc();
+ RegNo = 0;
+ if (getLexer().getKind() != AsmToken::Percent)
+ return false;
+ Parser.Lex();
+ unsigned regKind = SparcOperand::rk_None;
+ if (matchRegisterName(Tok, RegNo, regKind)) {
+ Parser.Lex();
+ return false;
+ }
+
+ return Error(StartLoc, "invalid register name");
+}
+
+static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features,
+ unsigned VariantID);
+
+bool SparcAsmParser::
+ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands)
+{
+
+ // First operand in MCInst is instruction mnemonic.
+ Operands.push_back(SparcOperand::CreateToken(Name, NameLoc));
+
+ // apply mnemonic aliases, if any, so that we can parse operands correctly.
+ applyMnemonicAliases(Name, getAvailableFeatures(), 0);
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ // Read the first operand.
+ if (getLexer().is(AsmToken::Comma)) {
+ if (parseBranchModifiers(Operands) != MatchOperand_Success) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "unexpected token");
+ }
+ }
+ if (parseOperand(Operands, Name) != MatchOperand_Success) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "unexpected token");
+ }
+
+ while (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma.
+ // Parse and remember the operand.
+ if (parseOperand(Operands, Name) != MatchOperand_Success) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "unexpected token");
+ }
+ }
+ }
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "unexpected token");
+ }
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
+bool SparcAsmParser::
+ParseDirective(AsmToken DirectiveID)
+{
+ StringRef IDVal = DirectiveID.getString();
+
+ if (IDVal == ".byte")
+ return parseDirectiveWord(1, DirectiveID.getLoc());
+
+ if (IDVal == ".half")
+ return parseDirectiveWord(2, DirectiveID.getLoc());
+
+ if (IDVal == ".word")
+ return parseDirectiveWord(4, DirectiveID.getLoc());
+
+ if (IDVal == ".nword")
+ return parseDirectiveWord(is64Bit() ? 8 : 4, DirectiveID.getLoc());
+
+ if (is64Bit() && IDVal == ".xword")
+ return parseDirectiveWord(8, DirectiveID.getLoc());
+
+ if (IDVal == ".register") {
+ // For now, ignore .register directive.
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+  // Let the MC layer handle other directives.
+ return true;
+}
+
+bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) {
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ for (;;) {
+ const MCExpr *Value;
+ if (getParser().parseExpression(Value))
+ return true;
+
+ getParser().getStreamer().EmitValue(Value, Size);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ // FIXME: Improve diagnostic.
+ if (getLexer().isNot(AsmToken::Comma))
+ return Error(L, "unexpected token in directive");
+ Parser.Lex();
+ }
+ }
+ Parser.Lex();
+ return false;
+}
+
+SparcAsmParser::OperandMatchResultTy SparcAsmParser::
+parseMEMOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands)
+{
+
+ SMLoc S, E;
+ unsigned BaseReg = 0;
+
+ if (ParseRegister(BaseReg, S, E)) {
+ return MatchOperand_NoMatch;
+ }
+
+ switch (getLexer().getKind()) {
+ default: return MatchOperand_NoMatch;
+
+ case AsmToken::Comma:
+ case AsmToken::RBrac:
+ case AsmToken::EndOfStatement:
+ Operands.push_back(SparcOperand::CreateMEMri(BaseReg, 0, S, E));
+ return MatchOperand_Success;
+
+ case AsmToken:: Plus:
+ Parser.Lex(); // Eat the '+'
+ break;
+ case AsmToken::Minus:
+ break;
+ }
+
+ SparcOperand *Offset = 0;
+ OperandMatchResultTy ResTy = parseSparcAsmOperand(Offset);
+ if (ResTy != MatchOperand_Success || !Offset)
+ return MatchOperand_NoMatch;
+
+ Offset = (Offset->isImm()
+ ? SparcOperand::MorphToMEMri(BaseReg, Offset)
+ : SparcOperand::MorphToMEMrr(BaseReg, Offset));
+
+ Operands.push_back(Offset);
+ return MatchOperand_Success;
+}
+
+SparcAsmParser::OperandMatchResultTy SparcAsmParser::
+parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ StringRef Mnemonic)
+{
+
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+ // If there wasn't a custom match, try the generic matcher below. Otherwise,
+ // there was a match, but an error occurred, in which case, just return that
+ // the operand parsing failed.
+ if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail)
+ return ResTy;
+
+ if (getLexer().is(AsmToken::LBrac)) {
+ // Memory operand
+ Operands.push_back(SparcOperand::CreateToken("[",
+ Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat the [
+
+ if (Mnemonic == "cas" || Mnemonic == "casx") {
+ SMLoc S = Parser.getTok().getLoc();
+ if (getLexer().getKind() != AsmToken::Percent)
+ return MatchOperand_NoMatch;
+ Parser.Lex(); // eat %
+
+ unsigned RegNo, RegKind;
+ if (!matchRegisterName(Parser.getTok(), RegNo, RegKind))
+ return MatchOperand_NoMatch;
+
+ Parser.Lex(); // Eat the identifier token.
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer()-1);
+ Operands.push_back(SparcOperand::CreateReg(RegNo, RegKind, S, E));
+ ResTy = MatchOperand_Success;
+ } else {
+ ResTy = parseMEMOperand(Operands);
+ }
+
+ if (ResTy != MatchOperand_Success)
+ return ResTy;
+
+ if (!getLexer().is(AsmToken::RBrac))
+ return MatchOperand_ParseFail;
+
+ Operands.push_back(SparcOperand::CreateToken("]",
+ Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat the ]
+ return MatchOperand_Success;
+ }
+
+ SparcOperand *Op = 0;
+
+ ResTy = parseSparcAsmOperand(Op, (Mnemonic == "call"));
+ if (ResTy != MatchOperand_Success || !Op)
+ return MatchOperand_ParseFail;
+
+ // Push the parsed operand into the list of operands
+ Operands.push_back(Op);
+
+ return MatchOperand_Success;
+}
+
+SparcAsmParser::OperandMatchResultTy
+SparcAsmParser::parseSparcAsmOperand(SparcOperand *&Op, bool isCall)
+{
+
+ SMLoc S = Parser.getTok().getLoc();
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ const MCExpr *EVal;
+
+ Op = 0;
+ switch (getLexer().getKind()) {
+ default: break;
+
+ case AsmToken::Percent:
+ Parser.Lex(); // Eat the '%'.
+ unsigned RegNo;
+ unsigned RegKind;
+ if (matchRegisterName(Parser.getTok(), RegNo, RegKind)) {
+ StringRef name = Parser.getTok().getString();
+ Parser.Lex(); // Eat the identifier token.
+ E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ switch (RegNo) {
+ default:
+ Op = SparcOperand::CreateReg(RegNo, RegKind, S, E);
+ break;
+ case Sparc::Y:
+ Op = SparcOperand::CreateToken("%y", S);
+ break;
+
+ case Sparc::ICC:
+ if (name == "xcc")
+ Op = SparcOperand::CreateToken("%xcc", S);
+ else
+ Op = SparcOperand::CreateToken("%icc", S);
+ break;
+ }
+ break;
+ }
+ if (matchSparcAsmModifiers(EVal, E)) {
+ E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ Op = SparcOperand::CreateImm(EVal, S, E);
+ }
+ break;
+
+ case AsmToken::Minus:
+ case AsmToken::Integer:
+ if (!getParser().parseExpression(EVal, E))
+ Op = SparcOperand::CreateImm(EVal, S, E);
+ break;
+
+ case AsmToken::Identifier: {
+ StringRef Identifier;
+ if (!getParser().parseIdentifier(Identifier)) {
+ E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier);
+
+ const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+ getContext());
+ if (isCall &&
+ getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_)
+ Res = SparcMCExpr::Create(SparcMCExpr::VK_Sparc_WPLT30, Res,
+ getContext());
+ Op = SparcOperand::CreateImm(Res, S, E);
+ }
+ break;
+ }
+ }
+ return (Op) ? MatchOperand_Success : MatchOperand_ParseFail;
+}
+
+SparcAsmParser::OperandMatchResultTy SparcAsmParser::
+parseBranchModifiers(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+
+ // parse (,a|,pn|,pt)+
+
+ while (getLexer().is(AsmToken::Comma)) {
+
+ Parser.Lex(); // Eat the comma
+
+ if (!getLexer().is(AsmToken::Identifier))
+ return MatchOperand_ParseFail;
+ StringRef modName = Parser.getTok().getString();
+ if (modName == "a" || modName == "pn" || modName == "pt") {
+ Operands.push_back(SparcOperand::CreateToken(modName,
+ Parser.getTok().getLoc()));
+ Parser.Lex(); // eat the identifier.
+ }
+ }
+ return MatchOperand_Success;
+}
+
+bool SparcAsmParser::matchRegisterName(const AsmToken &Tok,
+ unsigned &RegNo,
+ unsigned &RegKind)
+{
+ int64_t intVal = 0;
+ RegNo = 0;
+ RegKind = SparcOperand::rk_None;
+ if (Tok.is(AsmToken::Identifier)) {
+ StringRef name = Tok.getString();
+
+ // %fp
+ if (name.equals("fp")) {
+ RegNo = Sparc::I6;
+ RegKind = SparcOperand::rk_IntReg;
+ return true;
+ }
+ // %sp
+ if (name.equals("sp")) {
+ RegNo = Sparc::O6;
+ RegKind = SparcOperand::rk_IntReg;
+ return true;
+ }
+
+ if (name.equals("y")) {
+ RegNo = Sparc::Y;
+ RegKind = SparcOperand::rk_Y;
+ return true;
+ }
+
+ if (name.equals("icc")) {
+ RegNo = Sparc::ICC;
+ RegKind = SparcOperand::rk_CCReg;
+ return true;
+ }
+
+ if (name.equals("xcc")) {
+      // FIXME: check 64bit.
+ RegNo = Sparc::ICC;
+ RegKind = SparcOperand::rk_CCReg;
+ return true;
+ }
+
+ // %fcc0 - %fcc3
+ if (name.substr(0, 3).equals_lower("fcc")
+ && !name.substr(3).getAsInteger(10, intVal)
+ && intVal < 4) {
+ // FIXME: check 64bit and handle %fcc1 - %fcc3
+ RegNo = Sparc::FCC0 + intVal;
+ RegKind = SparcOperand::rk_CCReg;
+ return true;
+ }
+
+ // %g0 - %g7
+ if (name.substr(0, 1).equals_lower("g")
+ && !name.substr(1).getAsInteger(10, intVal)
+ && intVal < 8) {
+ RegNo = IntRegs[intVal];
+ RegKind = SparcOperand::rk_IntReg;
+ return true;
+ }
+ // %o0 - %o7
+ if (name.substr(0, 1).equals_lower("o")
+ && !name.substr(1).getAsInteger(10, intVal)
+ && intVal < 8) {
+ RegNo = IntRegs[8 + intVal];
+ RegKind = SparcOperand::rk_IntReg;
+ return true;
+ }
+ if (name.substr(0, 1).equals_lower("l")
+ && !name.substr(1).getAsInteger(10, intVal)
+ && intVal < 8) {
+ RegNo = IntRegs[16 + intVal];
+ RegKind = SparcOperand::rk_IntReg;
+ return true;
+ }
+ if (name.substr(0, 1).equals_lower("i")
+ && !name.substr(1).getAsInteger(10, intVal)
+ && intVal < 8) {
+ RegNo = IntRegs[24 + intVal];
+ RegKind = SparcOperand::rk_IntReg;
+ return true;
+ }
+ // %f0 - %f31
+ if (name.substr(0, 1).equals_lower("f")
+ && !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 32) {
+ RegNo = FloatRegs[intVal];
+ RegKind = SparcOperand::rk_FloatReg;
+ return true;
+ }
+ // %f32 - %f62
+ if (name.substr(0, 1).equals_lower("f")
+ && !name.substr(1, 2).getAsInteger(10, intVal)
+ && intVal >= 32 && intVal <= 62 && (intVal % 2 == 0)) {
+ // FIXME: Check V9
+ RegNo = DoubleRegs[intVal/2];
+ RegKind = SparcOperand::rk_DoubleReg;
+ return true;
+ }
+
+ // %r0 - %r31
+ if (name.substr(0, 1).equals_lower("r")
+ && !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 31) {
+ RegNo = IntRegs[intVal];
+ RegKind = SparcOperand::rk_IntReg;
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool hasGOTReference(const MCExpr *Expr) {
+ switch (Expr->getKind()) {
+ case MCExpr::Target:
+ if (const SparcMCExpr *SE = dyn_cast<SparcMCExpr>(Expr))
+ return hasGOTReference(SE->getSubExpr());
+ break;
+
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+ return hasGOTReference(BE->getLHS()) || hasGOTReference(BE->getRHS());
+ }
+
+ case MCExpr::SymbolRef: {
+ const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+ return (SymRef.getSymbol().getName() == "_GLOBAL_OFFSET_TABLE_");
+ }
+
+ case MCExpr::Unary:
+ return hasGOTReference(cast<MCUnaryExpr>(Expr)->getSubExpr());
+ }
+ return false;
+}
+
+bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
+ SMLoc &EndLoc)
+{
+ AsmToken Tok = Parser.getTok();
+ if (!Tok.is(AsmToken::Identifier))
+ return false;
+
+ StringRef name = Tok.getString();
+
+ SparcMCExpr::VariantKind VK = SparcMCExpr::parseVariantKind(name);
+
+ if (VK == SparcMCExpr::VK_Sparc_None)
+ return false;
+
+ Parser.Lex(); // Eat the identifier.
+ if (Parser.getTok().getKind() != AsmToken::LParen)
+ return false;
+
+ Parser.Lex(); // Eat the LParen token.
+ const MCExpr *subExpr;
+ if (Parser.parseParenExpression(subExpr, EndLoc))
+ return false;
+
+ bool isPIC = getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_;
+
+ switch(VK) {
+ default: break;
+ case SparcMCExpr::VK_Sparc_LO:
+ VK = (hasGOTReference(subExpr)
+ ? SparcMCExpr::VK_Sparc_PC10
+ : (isPIC ? SparcMCExpr::VK_Sparc_GOT10 : VK));
+ break;
+ case SparcMCExpr::VK_Sparc_HI:
+ VK = (hasGOTReference(subExpr)
+ ? SparcMCExpr::VK_Sparc_PC22
+ : (isPIC ? SparcMCExpr::VK_Sparc_GOT22 : VK));
+ break;
+ }
+
+ EVal = SparcMCExpr::Create(VK, subExpr, getContext());
+ return true;
+}
+
+
+extern "C" void LLVMInitializeSparcAsmParser() {
+ RegisterMCAsmParser<SparcAsmParser> A(TheSparcTarget);
+ RegisterMCAsmParser<SparcAsmParser> B(TheSparcV9Target);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "SparcGenAsmMatcher.inc"
+
+
+
+unsigned SparcAsmParser::
+validateTargetOperandClass(MCParsedAsmOperand *GOp,
+ unsigned Kind)
+{
+ SparcOperand *Op = (SparcOperand*)GOp;
+ if (Op->isFloatOrDoubleReg()) {
+ switch (Kind) {
+ default: break;
+ case MCK_DFPRegs:
+ if (!Op->isFloatReg() || SparcOperand::MorphToDoubleReg(Op))
+ return MCTargetAsmParser::Match_Success;
+ break;
+ case MCK_QFPRegs:
+ if (SparcOperand::MorphToQuadReg(Op))
+ return MCTargetAsmParser::Match_Success;
+ break;
+ }
+ }
+ return Match_InvalidOperand;
+}
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index 6339394..cebda92 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -3,7 +3,10 @@ set(LLVM_TARGET_DEFINITIONS Sparc.td)
tablegen(LLVM SparcGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM SparcGenInstrInfo.inc -gen-instr-info)
tablegen(LLVM SparcGenCodeEmitter.inc -gen-emitter)
+tablegen(LLVM SparcGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM SparcGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
tablegen(LLVM SparcGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM SparcGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM SparcGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM SparcGenSubtargetInfo.inc -gen-subtarget)
tablegen(LLVM SparcGenCallingConv.inc -gen-callingconv)
@@ -23,9 +26,12 @@ add_llvm_target(SparcCodeGen
SparcSelectionDAGInfo.cpp
SparcJITInfo.cpp
SparcCodeEmitter.cpp
+ SparcMCInstLower.cpp
+ SparcTargetObjectFile.cpp
)
-add_dependencies(LLVMSparcCodeGen SparcCommonTableGen intrinsics_gen)
-
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
+add_subdirectory(InstPrinter)
+add_subdirectory(AsmParser)
+add_subdirectory(Disassembler)
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index 9a0466a..88fba39 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
@@ -55,15 +56,17 @@ namespace {
bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
bool runOnMachineFunction(MachineFunction &F) {
bool Changed = false;
+
+ // This pass invalidates liveness information when it reorders
+ // instructions to fill delay slot.
+ F.getRegInfo().invalidateLiveness();
+
for (MachineFunction::iterator FI = F.begin(), FE = F.end();
FI != FE; ++FI)
Changed |= runOnMachineBasicBlock(*FI);
return Changed;
}
- bool isDelayFiller(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator candidate);
-
void insertCallDefsUses(MachineBasicBlock::iterator MI,
SmallSet<unsigned, 32>& RegDefs,
SmallSet<unsigned, 32>& RegUses);
@@ -152,6 +155,10 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
assert (J != MBB.end() && "MI needs a delay instruction.");
BuildMI(MBB, ++J, MI->getDebugLoc(),
TII->get(SP::UNIMP)).addImm(structSize);
+ // Bundle the delay filler and unimp with the instruction.
+ MIBundleBuilder(MBB, MachineBasicBlock::iterator(MI), J);
+ } else {
+ MIBundleBuilder(MBB, MachineBasicBlock::iterator(MI), I);
}
}
return Changed;
@@ -204,12 +211,8 @@ Filler::findDelayInstr(MachineBasicBlock &MBB,
if (I->isDebugValue())
continue;
-
- if (I->hasUnmodeledSideEffects()
- || I->isInlineAsm()
- || I->isLabel()
- || I->hasDelaySlot()
- || isDelayFiller(MBB, I))
+ if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isPosition() ||
+ I->hasDelaySlot() || I->isBundledWithSucc())
break;
if (delayHasHazard(I, sawLoad, sawStore, RegDefs, RegUses)) {
@@ -278,19 +281,19 @@ void Filler::insertCallDefsUses(MachineBasicBlock::iterator MI,
switch(MI->getOpcode()) {
default: llvm_unreachable("Unknown opcode.");
case SP::CALL: break;
- case SP::JMPLrr:
- case SP::JMPLri:
+ case SP::CALLrr:
+ case SP::CALLri:
assert(MI->getNumOperands() >= 2);
const MachineOperand &Reg = MI->getOperand(0);
- assert(Reg.isReg() && "JMPL first operand is not a register.");
- assert(Reg.isUse() && "JMPL first operand is not a use.");
+ assert(Reg.isReg() && "CALL first operand is not a register.");
+ assert(Reg.isUse() && "CALL first operand is not a use.");
RegUses.insert(Reg.getReg());
const MachineOperand &RegOrImm = MI->getOperand(1);
if (RegOrImm.isImm())
break;
- assert(RegOrImm.isReg() && "JMPLrr second operand is not a register.");
- assert(RegOrImm.isUse() && "JMPLrr second operand is not a use.");
+ assert(RegOrImm.isReg() && "CALLrr second operand is not a register.");
+ assert(RegOrImm.isUse() && "CALLrr second operand is not a use.");
RegUses.insert(RegOrImm.getReg());
break;
}
@@ -332,18 +335,6 @@ bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg)
return false;
}
-// return true if the candidate is a delay filler.
-bool Filler::isDelayFiller(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator candidate)
-{
- if (candidate == MBB.begin())
- return false;
- if (candidate->getOpcode() == SP::UNIMP)
- return true;
- --candidate;
- return candidate->hasDelaySlot();
-}
-
bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize)
{
if (!I->isCall())
@@ -353,8 +344,8 @@ bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize)
switch (I->getOpcode()) {
default: llvm_unreachable("Unknown call opcode.");
case SP::CALL: structSizeOpNum = 1; break;
- case SP::JMPLrr:
- case SP::JMPLri: structSizeOpNum = 2; break;
+ case SP::CALLrr:
+ case SP::CALLri: structSizeOpNum = 2; break;
case SP::TLS_CALL: return false;
}
@@ -484,10 +475,10 @@ bool Filler::tryCombineRestoreWithPrevInst(MachineBasicBlock &MBB,
&& MBBI->getOperand(1).getReg() == SP::G0
&& MBBI->getOperand(2).getReg() == SP::G0);
- MachineBasicBlock::iterator PrevInst = MBBI; --PrevInst;
+ MachineBasicBlock::iterator PrevInst = std::prev(MBBI);
- // It cannot combine with a delay filler.
- if (isDelayFiller(MBB, PrevInst))
+ // It cannot be combined with a bundled instruction.
+ if (PrevInst->isBundledWithSucc())
return false;
const TargetInstrInfo *TII = TM.getInstrInfo();
diff --git a/lib/Target/Sparc/Disassembler/CMakeLists.txt b/lib/Target/Sparc/Disassembler/CMakeLists.txt
new file mode 100644
index 0000000..7359c6a
--- /dev/null
+++ b/lib/Target/Sparc/Disassembler/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMSparcDisassembler
+ SparcDisassembler.cpp
+ )
diff --git a/lib/Target/Sparc/Disassembler/LLVMBuild.txt b/lib/Target/Sparc/Disassembler/LLVMBuild.txt
new file mode 100644
index 0000000..e7387cd
--- /dev/null
+++ b/lib/Target/Sparc/Disassembler/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/Sparc/Disassembler/LLVMBuild.txt ------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = SparcDisassembler
+parent = Sparc
+required_libraries = MC Support SparcInfo
+add_to_library_groups = Sparc
diff --git a/lib/Target/Sparc/Disassembler/Makefile b/lib/Target/Sparc/Disassembler/Makefile
new file mode 100644
index 0000000..bc17ddc
--- /dev/null
+++ b/lib/Target/Sparc/Disassembler/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/Sparc/Disassembler/Makefile --------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSparcDisassembler
+
+# Hack: we need to include 'main' Sparc target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
new file mode 100644
index 0000000..5cd99d6
--- /dev/null
+++ b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -0,0 +1,483 @@
+//===- SparcDisassembler.cpp - Disassembler for Sparc -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Sparc Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sparc-disassembler"
+
+#include "Sparc.h"
+#include "SparcRegisterInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// SparcDisassembler - a disassembler class for Sparc.
+class SparcDisassembler : public MCDisassembler {
+public:
+ /// Constructor - Initializes the disassembler.
+ ///
+ SparcDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info) :
+ MCDisassembler(STI), RegInfo(Info)
+ {}
+ virtual ~SparcDisassembler() {}
+
+ const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); }
+
+ /// getInstruction - See MCDisassembler.
+ virtual DecodeStatus getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const;
+private:
+ OwningPtr<const MCRegisterInfo> RegInfo;
+};
+
+}
+
+namespace llvm {
+ extern Target TheSparcTarget, TheSparcV9Target;
+}
+
+static MCDisassembler *createSparcDisassembler(
+ const Target &T,
+ const MCSubtargetInfo &STI) {
+ return new SparcDisassembler(STI, T.createMCRegInfo(""));
+}
+
+
+extern "C" void LLVMInitializeSparcDisassembler() {
+ // Register the disassembler.
+ TargetRegistry::RegisterMCDisassembler(TheSparcTarget,
+ createSparcDisassembler);
+ TargetRegistry::RegisterMCDisassembler(TheSparcV9Target,
+ createSparcDisassembler);
+}
+
+
+
+static const unsigned IntRegDecoderTable[] = {
+ SP::G0, SP::G1, SP::G2, SP::G3,
+ SP::G4, SP::G5, SP::G6, SP::G7,
+ SP::O0, SP::O1, SP::O2, SP::O3,
+ SP::O4, SP::O5, SP::O6, SP::O7,
+ SP::L0, SP::L1, SP::L2, SP::L3,
+ SP::L4, SP::L5, SP::L6, SP::L7,
+ SP::I0, SP::I1, SP::I2, SP::I3,
+ SP::I4, SP::I5, SP::I6, SP::I7 };
+
+static const unsigned FPRegDecoderTable[] = {
+ SP::F0, SP::F1, SP::F2, SP::F3,
+ SP::F4, SP::F5, SP::F6, SP::F7,
+ SP::F8, SP::F9, SP::F10, SP::F11,
+ SP::F12, SP::F13, SP::F14, SP::F15,
+ SP::F16, SP::F17, SP::F18, SP::F19,
+ SP::F20, SP::F21, SP::F22, SP::F23,
+ SP::F24, SP::F25, SP::F26, SP::F27,
+ SP::F28, SP::F29, SP::F30, SP::F31 };
+
+static const unsigned DFPRegDecoderTable[] = {
+ SP::D0, SP::D16, SP::D1, SP::D17,
+ SP::D2, SP::D18, SP::D3, SP::D19,
+ SP::D4, SP::D20, SP::D5, SP::D21,
+ SP::D6, SP::D22, SP::D7, SP::D23,
+ SP::D8, SP::D24, SP::D9, SP::D25,
+ SP::D10, SP::D26, SP::D11, SP::D27,
+ SP::D12, SP::D28, SP::D13, SP::D29,
+ SP::D14, SP::D30, SP::D15, SP::D31 };
+
+static const unsigned QFPRegDecoderTable[] = {
+ SP::Q0, SP::Q8, ~0U, ~0U,
+ SP::Q1, SP::Q9, ~0U, ~0U,
+ SP::Q2, SP::Q10, ~0U, ~0U,
+ SP::Q3, SP::Q11, ~0U, ~0U,
+ SP::Q4, SP::Q12, ~0U, ~0U,
+ SP::Q5, SP::Q13, ~0U, ~0U,
+ SP::Q6, SP::Q14, ~0U, ~0U,
+ SP::Q7, SP::Q15, ~0U, ~0U } ;
+
+static const unsigned FCCRegDecoderTable[] = {
+ SP::FCC0, SP::FCC1, SP::FCC2, SP::FCC3 };
+
+static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+ unsigned Reg = IntRegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeI64RegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+ unsigned Reg = IntRegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeFPRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+ unsigned Reg = FPRegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeDFPRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+ unsigned Reg = DFPRegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeQFPRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = QFPRegDecoderTable[RegNo];
+ if (Reg == ~0U)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFCCRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 3)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateReg(FCCRegDecoderTable[RegNo]));
+ return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreQFP(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCall(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSIMM13(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeJMPL(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeReturn(MCInst &MI, unsigned insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeSWAP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder);
+
+#include "SparcGenDisassemblerTables.inc"
+
+/// readInstruction32 - read four bytes from the MemoryObject
+/// and return the 32-bit word.
+static DecodeStatus readInstruction32(const MemoryObject &region,
+ uint64_t address,
+ uint64_t &size,
+ uint32_t &insn) {
+ uint8_t Bytes[4];
+
+ // We want to read exactly 4 Bytes of data.
+ if (region.readBytes(address, 4, Bytes) == -1) {
+ size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ // Encoded as a big-endian 32-bit word in the stream.
+ insn = (Bytes[3] << 0) |
+ (Bytes[2] << 8) |
+ (Bytes[1] << 16) |
+ (Bytes[0] << 24);
+
+ return MCDisassembler::Success;
+}
+
+
+DecodeStatus
+SparcDisassembler::getInstruction(MCInst &instr,
+ uint64_t &Size,
+ const MemoryObject &Region,
+ uint64_t Address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const {
+ uint32_t Insn;
+
+ DecodeStatus Result = readInstruction32(Region, Address, Size, Insn);
+ if (Result == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+
+
+ // Calling the auto-generated decoder function.
+ Result = decodeInstruction(DecoderTableSparc32, instr, Insn, Address,
+ this, STI);
+
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+
+ return MCDisassembler::Fail;
+}
+
+
+typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned insn, uint64_t Address,
+ const void *Decoder);
+
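+// Explanatory note (not part of the original patch): DecodeMem handles the
+// common SPARC format-3 load/store encoding, where rd sits in bits 29-25,
+// rs1 in bits 18-14, the i bit in bit 13 and either rs2 or simm13 in the low
+// bits. For a load such as "ld [%o0+4], %g1" the operands are appended to the
+// MCInst as rd, rs1, imm; for a store the rd register is appended after the
+// address operands instead.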
+static DecodeStatus DecodeMem(MCInst &MI, unsigned insn, uint64_t Address,
+ const void *Decoder,
+ bool isLoad, DecodeFunc DecodeRD) {
+ unsigned rd = fieldFromInstruction(insn, 25, 5);
+ unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+ bool isImm = fieldFromInstruction(insn, 13, 1);
+ unsigned rs2 = 0;
+ unsigned simm13 = 0;
+ if (isImm)
+ simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+ else
+ rs2 = fieldFromInstruction(insn, 0, 5);
+
+ DecodeStatus status;
+ if (isLoad) {
+ status = DecodeRD(MI, rd, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ }
+
+ // Decode rs1.
+ status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+
+ // Decode imm|rs2.
+ if (isImm)
+ MI.addOperand(MCOperand::CreateImm(simm13));
+ else {
+ status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ }
+
+ if (!isLoad) {
+ status = DecodeRD(MI, rd, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ }
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, true,
+ DecodeIntRegsRegisterClass);
+}
+
+static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, true,
+ DecodeFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, true,
+ DecodeDFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, true,
+ DecodeQFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, false,
+ DecodeIntRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, false,
+ DecodeFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, false,
+ DecodeDFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreQFP(MCInst &Inst, unsigned insn,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, false,
+ DecodeQFPRegsRegisterClass);
+}
+
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+ uint64_t Address, uint64_t Offset,
+ uint64_t Width, MCInst &MI,
+ const void *Decoder) {
+ const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+ return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+ Offset, Width);
+}
+
+static DecodeStatus DecodeCall(MCInst &MI, unsigned insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned tgt = fieldFromInstruction(insn, 0, 30);
+ tgt <<= 2;
+ if (!tryAddingSymbolicOperand(tgt+Address, false, Address,
+ 0, 30, MI, Decoder))
+ MI.addOperand(MCOperand::CreateImm(tgt));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned tgt = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+ MI.addOperand(MCOperand::CreateImm(tgt));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeJMPL(MCInst &MI, unsigned insn, uint64_t Address,
+ const void *Decoder) {
+
+ unsigned rd = fieldFromInstruction(insn, 25, 5);
+ unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+ unsigned isImm = fieldFromInstruction(insn, 13, 1);
+ unsigned rs2 = 0;
+ unsigned simm13 = 0;
+ if (isImm)
+ simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+ else
+ rs2 = fieldFromInstruction(insn, 0, 5);
+
+ // Decode RD.
+ DecodeStatus status = DecodeIntRegsRegisterClass(MI, rd, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+
+ // Decode RS1.
+ status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+
+  // Decode RS2 | SIMM13.
+ if (isImm)
+ MI.addOperand(MCOperand::CreateImm(simm13));
+ else {
+ status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ }
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeReturn(MCInst &MI, unsigned insn, uint64_t Address,
+ const void *Decoder) {
+
+ unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+ unsigned isImm = fieldFromInstruction(insn, 13, 1);
+ unsigned rs2 = 0;
+ unsigned simm13 = 0;
+ if (isImm)
+ simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+ else
+ rs2 = fieldFromInstruction(insn, 0, 5);
+
+ // Decode RS1.
+ DecodeStatus status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+
+ // Decode RS2 | SIMM13.
+ if (isImm)
+ MI.addOperand(MCOperand::CreateImm(simm13));
+ else {
+ status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ }
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSWAP(MCInst &MI, unsigned insn, uint64_t Address,
+ const void *Decoder) {
+
+ unsigned rd = fieldFromInstruction(insn, 25, 5);
+ unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+ unsigned isImm = fieldFromInstruction(insn, 13, 1);
+ unsigned rs2 = 0;
+ unsigned simm13 = 0;
+ if (isImm)
+ simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+ else
+ rs2 = fieldFromInstruction(insn, 0, 5);
+
+ // Decode RD.
+ DecodeStatus status = DecodeIntRegsRegisterClass(MI, rd, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+
+ // Decode RS1.
+ status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+
+  // Decode RS2 | SIMM13.
+ if (isImm)
+ MI.addOperand(MCOperand::CreateImm(simm13));
+ else {
+ status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ }
+ return MCDisassembler::Success;
+}
diff --git a/lib/Target/Sparc/InstPrinter/CMakeLists.txt b/lib/Target/Sparc/InstPrinter/CMakeLists.txt
new file mode 100644
index 0000000..a285a83
--- /dev/null
+++ b/lib/Target/Sparc/InstPrinter/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMSparcAsmPrinter
+ SparcInstPrinter.cpp
+ )
diff --git a/lib/Target/Sparc/InstPrinter/LLVMBuild.txt b/lib/Target/Sparc/InstPrinter/LLVMBuild.txt
new file mode 100644
index 0000000..b4c8802
--- /dev/null
+++ b/lib/Target/Sparc/InstPrinter/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/Sparc/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = SparcAsmPrinter
+parent = Sparc
+required_libraries = MC Support
+add_to_library_groups = Sparc
diff --git a/lib/Target/Sparc/InstPrinter/Makefile b/lib/Target/Sparc/InstPrinter/Makefile
new file mode 100644
index 0000000..2dabd82
--- /dev/null
+++ b/lib/Target/Sparc/InstPrinter/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/Sparc/InstPrinter/Makefile ---------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSparcAsmPrinter
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
new file mode 100644
index 0000000..fabc125
--- /dev/null
+++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
@@ -0,0 +1,176 @@
+//===-- SparcInstPrinter.cpp - Convert Sparc MCInst to assembly syntax -----==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Sparc MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "SparcInstPrinter.h"
+#include "Sparc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// The generated SparcGenAsmWriter uses "Sparc" as the target namespace, but
+// the SPARC backend uses "SP" as its namespace.
+namespace llvm {
+namespace Sparc {
+ using namespace SP;
+}
+}
+
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "SparcGenAsmWriter.inc"
+
+bool SparcInstPrinter::isV9() const {
+ return (STI.getFeatureBits() & Sparc::FeatureV9) != 0;
+}
+
+void SparcInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const
+{
+ OS << '%' << StringRef(getRegisterName(RegNo)).lower();
+}
+
+void SparcInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot)
+{
+ if (!printAliasInstr(MI, O) && !printSparcAliasInstr(MI, O))
+ printInstruction(MI, O);
+ printAnnotation(O, Annot);
+}
+
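+// Illustrative examples (not from the original patch): "jmpl %i7+8, %g0"
+// prints as "ret", "jmpl %o7+8, %g0" as "retl", any other jmpl to %g0 as
+// "jmp <addr>", and a jmpl that writes %o7 as "call <addr>".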
+bool SparcInstPrinter::printSparcAliasInstr(const MCInst *MI, raw_ostream &O)
+{
+ switch (MI->getOpcode()) {
+ default: return false;
+ case SP::JMPLrr:
+ case SP::JMPLri: {
+ if (MI->getNumOperands() != 3)
+ return false;
+ if (!MI->getOperand(0).isReg())
+ return false;
+ switch (MI->getOperand(0).getReg()) {
+ default: return false;
+ case SP::G0: // jmp $addr | ret | retl
+ if (MI->getOperand(2).isImm() &&
+ MI->getOperand(2).getImm() == 8) {
+ switch(MI->getOperand(1).getReg()) {
+ default: break;
+ case SP::I7: O << "\tret"; return true;
+ case SP::O7: O << "\tretl"; return true;
+ }
+ }
+ O << "\tjmp "; printMemOperand(MI, 1, O);
+ return true;
+ case SP::O7: // call $addr
+ O << "\tcall "; printMemOperand(MI, 1, O);
+ return true;
+ }
+ }
+ case SP::V9FCMPS: case SP::V9FCMPD: case SP::V9FCMPQ:
+ case SP::V9FCMPES: case SP::V9FCMPED: case SP::V9FCMPEQ: {
+ if (isV9()
+ || (MI->getNumOperands() != 3)
+ || (!MI->getOperand(0).isReg())
+ || (MI->getOperand(0).getReg() != SP::FCC0))
+ return false;
+ // if V8, skip printing %fcc0.
+ switch(MI->getOpcode()) {
+ default:
+ case SP::V9FCMPS: O << "\tfcmps "; break;
+ case SP::V9FCMPD: O << "\tfcmpd "; break;
+ case SP::V9FCMPQ: O << "\tfcmpq "; break;
+ case SP::V9FCMPES: O << "\tfcmpes "; break;
+ case SP::V9FCMPED: O << "\tfcmped "; break;
+ case SP::V9FCMPEQ: O << "\tfcmpeq "; break;
+ }
+ printOperand(MI, 1, O);
+ O << ", ";
+ printOperand(MI, 2, O);
+ return true;
+ }
+ }
+}
+
+void SparcInstPrinter::printOperand(const MCInst *MI, int opNum,
+ raw_ostream &O)
+{
+ const MCOperand &MO = MI->getOperand (opNum);
+
+ if (MO.isReg()) {
+ printRegName(O, MO.getReg());
+ return ;
+ }
+
+ if (MO.isImm()) {
+ O << (int)MO.getImm();
+ return;
+ }
+
+ assert(MO.isExpr() && "Unknown operand kind in printOperand");
+ MO.getExpr()->print(O);
+}
+
+void SparcInstPrinter::printMemOperand(const MCInst *MI, int opNum,
+ raw_ostream &O, const char *Modifier)
+{
+ printOperand(MI, opNum, O);
+
+ // If this is an ADD operand, emit it like normal operands.
+ if (Modifier && !strcmp(Modifier, "arith")) {
+ O << ", ";
+ printOperand(MI, opNum+1, O);
+ return;
+ }
+ const MCOperand &MO = MI->getOperand(opNum+1);
+
+ if (MO.isReg() && MO.getReg() == SP::G0)
+ return; // don't print "+%g0"
+ if (MO.isImm() && MO.getImm() == 0)
+ return; // don't print "+0"
+
+ O << "+";
+
+ printOperand(MI, opNum+1, O);
+}
+
+void SparcInstPrinter::printCCOperand(const MCInst *MI, int opNum,
+ raw_ostream &O)
+{
+ int CC = (int)MI->getOperand(opNum).getImm();
+ switch (MI->getOpcode()) {
+ default: break;
+ case SP::FBCOND:
+ case SP::FBCONDA:
+ case SP::BPFCC:
+ case SP::BPFCCA:
+ case SP::BPFCCNT:
+ case SP::BPFCCANT:
+ case SP::MOVFCCrr: case SP::V9MOVFCCrr:
+ case SP::MOVFCCri: case SP::V9MOVFCCri:
+ case SP::FMOVS_FCC: case SP::V9FMOVS_FCC:
+ case SP::FMOVD_FCC: case SP::V9FMOVD_FCC:
+ case SP::FMOVQ_FCC: case SP::V9FMOVQ_FCC:
+    // Make sure CC is a floating-point condition code.
+ CC = (CC < 16) ? (CC + 16) : CC;
+ break;
+ }
+ O << SPARCCondCodeToString((SPCC::CondCodes)CC);
+}
+
+bool SparcInstPrinter::printGetPCX(const MCInst *MI, unsigned opNum,
+ raw_ostream &O)
+{
+ assert(0 && "FIXME: Implement SparcInstPrinter::printGetPCX.");
+ return true;
+}
diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
new file mode 100644
index 0000000..45ee6c0
--- /dev/null
+++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
@@ -0,0 +1,52 @@
+//===-- SparcInstPrinter.h - Convert Sparc MCInst to assembly syntax ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Sparc MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SparcINSTPRINTER_H
+#define SparcINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class SparcInstPrinter : public MCInstPrinter {
+ const MCSubtargetInfo &STI;
+public:
+ SparcInstPrinter(const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &sti)
+ : MCInstPrinter(MAI, MII, MRI), STI(sti) {}
+
+ virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+ virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+ bool printSparcAliasInstr(const MCInst *MI, raw_ostream &OS);
+ bool isV9() const;
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printOperand(const MCInst *MI, int opNum, raw_ostream &OS);
+ void printMemOperand(const MCInst *MI, int opNum, raw_ostream &OS,
+ const char *Modifier = 0);
+ void printCCOperand(const MCInst *MI, int opNum, raw_ostream &OS);
+ bool printGetPCX(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/LLVMBuild.txt b/lib/Target/Sparc/LLVMBuild.txt
index fd8e5d9..765912b 100644
--- a/lib/Target/Sparc/LLVMBuild.txt
+++ b/lib/Target/Sparc/LLVMBuild.txt
@@ -16,19 +16,21 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = MCTargetDesc TargetInfo
+subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo
[component_0]
type = TargetGroup
name = Sparc
parent = Target
+has_asmparser = 1
has_asmprinter = 1
+has_disassembler = 1
has_jit = 1
[component_1]
type = Library
name = SparcCodeGen
parent = Sparc
-required_libraries = AsmPrinter CodeGen Core MC SelectionDAG SparcDesc
- SparcInfo Support Target
+required_libraries = AsmPrinter CodeGen Core MC SelectionDAG SparcAsmPrinter
+ SparcDesc SparcInfo Support Target
add_to_library_groups = Sparc
diff --git a/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt b/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt
index 9d4db4d..c4dcdb5 100644
--- a/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt
@@ -1,6 +1,9 @@
add_llvm_library(LLVMSparcDesc
- SparcMCTargetDesc.cpp
+ SparcAsmBackend.cpp
+ SparcELFObjectWriter.cpp
SparcMCAsmInfo.cpp
+ SparcMCCodeEmitter.cpp
+ SparcMCTargetDesc.cpp
+ SparcMCExpr.cpp
+ SparcTargetStreamer.cpp
)
-
-add_dependencies(LLVMSparcDesc SparcCommonTableGen)
diff --git a/lib/Target/Sparc/MCTargetDesc/LLVMBuild.txt b/lib/Target/Sparc/MCTargetDesc/LLVMBuild.txt
index 97f8f16..22515e6 100644
--- a/lib/Target/Sparc/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/Sparc/MCTargetDesc/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = SparcDesc
parent = Sparc
-required_libraries = MC SparcInfo Support
+required_libraries = MC SparcAsmPrinter SparcInfo Support
add_to_library_groups = Sparc
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
new file mode 100644
index 0000000..39c9996
--- /dev/null
+++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -0,0 +1,261 @@
+//===-- SparcAsmBackend.cpp - Sparc Assembler Backend ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmBackend.h"
+#include "MCTargetDesc/SparcFixupKinds.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
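+// Explanatory note (not part of the original patch): adjustFixupValue reduces
+// the resolved value to the bits the instruction field actually holds; for
+// instance a fixup_sparc_hi22 against the value 0x12345678 becomes
+// (0x12345678 >> 10) & 0x3fffff = 0x48d15, matching %hi(0x12345678).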
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_Data_8:
+ return Value;
+
+ case Sparc::fixup_sparc_wplt30:
+ case Sparc::fixup_sparc_call30:
+ return (Value >> 2) & 0x3fffffff;
+
+ case Sparc::fixup_sparc_br22:
+ return (Value >> 2) & 0x3fffff;
+
+ case Sparc::fixup_sparc_br19:
+ return (Value >> 2) & 0x7ffff;
+
+ case Sparc::fixup_sparc_br16_2:
+ return (Value >> 2) & 0xc000;
+
+ case Sparc::fixup_sparc_br16_14:
+ return (Value >> 2) & 0x3fff;
+
+ case Sparc::fixup_sparc_pc22:
+ case Sparc::fixup_sparc_got22:
+ case Sparc::fixup_sparc_tls_gd_hi22:
+ case Sparc::fixup_sparc_tls_ldm_hi22:
+ case Sparc::fixup_sparc_tls_ie_hi22:
+ case Sparc::fixup_sparc_hi22:
+ return (Value >> 10) & 0x3fffff;
+
+ case Sparc::fixup_sparc_pc10:
+ case Sparc::fixup_sparc_got10:
+ case Sparc::fixup_sparc_tls_gd_lo10:
+ case Sparc::fixup_sparc_tls_ldm_lo10:
+ case Sparc::fixup_sparc_tls_ie_lo10:
+ case Sparc::fixup_sparc_lo10:
+ return Value & 0x3ff;
+
+ case Sparc::fixup_sparc_tls_ldo_hix22:
+ case Sparc::fixup_sparc_tls_le_hix22:
+ return (~Value >> 10) & 0x3fffff;
+
+ case Sparc::fixup_sparc_tls_ldo_lox10:
+ case Sparc::fixup_sparc_tls_le_lox10:
+ return (~(~Value & 0x3ff)) & 0x1fff;
+
+ case Sparc::fixup_sparc_h44:
+ return (Value >> 22) & 0x3fffff;
+
+ case Sparc::fixup_sparc_m44:
+ return (Value >> 12) & 0x3ff;
+
+ case Sparc::fixup_sparc_l44:
+ return Value & 0xfff;
+
+ case Sparc::fixup_sparc_hh:
+ return (Value >> 42) & 0x3fffff;
+
+ case Sparc::fixup_sparc_hm:
+ return (Value >> 32) & 0x3ff;
+
+ case Sparc::fixup_sparc_tls_gd_add:
+ case Sparc::fixup_sparc_tls_gd_call:
+ case Sparc::fixup_sparc_tls_ldm_add:
+ case Sparc::fixup_sparc_tls_ldm_call:
+ case Sparc::fixup_sparc_tls_ldo_add:
+ case Sparc::fixup_sparc_tls_ie_ld:
+ case Sparc::fixup_sparc_tls_ie_ldx:
+ case Sparc::fixup_sparc_tls_ie_add:
+ return 0;
+ }
+}
+
+namespace {
+ class SparcAsmBackend : public MCAsmBackend {
+ const Target &TheTarget;
+ public:
+ SparcAsmBackend(const Target &T) : MCAsmBackend(), TheTarget(T) {}
+
+ unsigned getNumFixupKinds() const {
+ return Sparc::NumTargetFixupKinds;
+ }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+ const static MCFixupKindInfo Infos[Sparc::NumTargetFixupKinds] = {
+ // name offset bits flags
+ { "fixup_sparc_call30", 2, 30, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_br22", 10, 22, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_br19", 13, 19, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_br16_2", 10, 2, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_br16_14", 18, 14, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_hi22", 10, 22, 0 },
+ { "fixup_sparc_lo10", 22, 10, 0 },
+ { "fixup_sparc_h44", 10, 22, 0 },
+ { "fixup_sparc_m44", 22, 10, 0 },
+ { "fixup_sparc_l44", 20, 12, 0 },
+ { "fixup_sparc_hh", 10, 22, 0 },
+ { "fixup_sparc_hm", 22, 10, 0 },
+ { "fixup_sparc_pc22", 10, 22, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_pc10", 22, 10, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_got22", 10, 22, 0 },
+ { "fixup_sparc_got10", 22, 10, 0 },
+ { "fixup_sparc_wplt30", 2, 30, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_sparc_tls_gd_hi22", 10, 22, 0 },
+ { "fixup_sparc_tls_gd_lo10", 22, 10, 0 },
+ { "fixup_sparc_tls_gd_add", 0, 0, 0 },
+ { "fixup_sparc_tls_gd_call", 0, 0, 0 },
+ { "fixup_sparc_tls_ldm_hi22", 10, 22, 0 },
+ { "fixup_sparc_tls_ldm_lo10", 22, 10, 0 },
+ { "fixup_sparc_tls_ldm_add", 0, 0, 0 },
+ { "fixup_sparc_tls_ldm_call", 0, 0, 0 },
+ { "fixup_sparc_tls_ldo_hix22", 10, 22, 0 },
+ { "fixup_sparc_tls_ldo_lox10", 22, 10, 0 },
+ { "fixup_sparc_tls_ldo_add", 0, 0, 0 },
+ { "fixup_sparc_tls_ie_hi22", 10, 22, 0 },
+ { "fixup_sparc_tls_ie_lo10", 22, 10, 0 },
+ { "fixup_sparc_tls_ie_ld", 0, 0, 0 },
+ { "fixup_sparc_tls_ie_ldx", 0, 0, 0 },
+ { "fixup_sparc_tls_ie_add", 0, 0, 0 },
+ { "fixup_sparc_tls_le_hix22", 0, 0, 0 },
+ { "fixup_sparc_tls_le_lox10", 0, 0, 0 }
+ };
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+ }
+
+ void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &IsResolved) override {
+ switch ((Sparc::Fixups)Fixup.getKind()) {
+ default: break;
+ case Sparc::fixup_sparc_wplt30:
+ if (Target.getSymA()->getSymbol().isTemporary())
+ return;
+ case Sparc::fixup_sparc_tls_gd_hi22:
+ case Sparc::fixup_sparc_tls_gd_lo10:
+ case Sparc::fixup_sparc_tls_gd_add:
+ case Sparc::fixup_sparc_tls_gd_call:
+ case Sparc::fixup_sparc_tls_ldm_hi22:
+ case Sparc::fixup_sparc_tls_ldm_lo10:
+ case Sparc::fixup_sparc_tls_ldm_add:
+ case Sparc::fixup_sparc_tls_ldm_call:
+ case Sparc::fixup_sparc_tls_ldo_hix22:
+ case Sparc::fixup_sparc_tls_ldo_lox10:
+ case Sparc::fixup_sparc_tls_ldo_add:
+ case Sparc::fixup_sparc_tls_ie_hi22:
+ case Sparc::fixup_sparc_tls_ie_lo10:
+ case Sparc::fixup_sparc_tls_ie_ld:
+ case Sparc::fixup_sparc_tls_ie_ldx:
+ case Sparc::fixup_sparc_tls_ie_add:
+ case Sparc::fixup_sparc_tls_le_hix22:
+ case Sparc::fixup_sparc_tls_le_lox10: IsResolved = false; break;
+ }
+ }
+
+ bool mayNeedRelaxation(const MCInst &Inst) const {
+ // FIXME.
+ return false;
+ }
+
+ /// fixupNeedsRelaxation - Target specific predicate for whether a given
+ /// fixup requires the associated instruction to be relaxed.
+ bool fixupNeedsRelaxation(const MCFixup &Fixup,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const {
+ // FIXME.
+ assert(0 && "fixupNeedsRelaxation() unimplemented");
+ return false;
+ }
+ void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+ // FIXME.
+ assert(0 && "relaxInstruction() unimplemented");
+ }
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+      // Cannot emit NOP padding whose size is not a multiple of 32 bits (4 bytes).
+ if (Count % 4 != 0)
+ return false;
+
+ uint64_t NumNops = Count / 4;
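+      // Each padding word written below is 0x01000000, the canonical SPARC
+      // NOP encoding (sethi 0, %g0); explanatory comment, not part of the
+      // original patch.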
+ for (uint64_t i = 0; i != NumNops; ++i)
+ OW->Write32(0x01000000);
+
+ return true;
+ }
+
+ bool is64Bit() const {
+ StringRef name = TheTarget.getName();
+ return name == "sparcv9";
+ }
+ };
+
+ class ELFSparcAsmBackend : public SparcAsmBackend {
+ Triple::OSType OSType;
+ public:
+ ELFSparcAsmBackend(const Target &T, Triple::OSType OSType) :
+ SparcAsmBackend(T), OSType(OSType) { }
+
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const {
+
+ Value = adjustFixupValue(Fixup.getKind(), Value);
+ if (!Value) return; // Doesn't change encoding.
+
+ unsigned Offset = Fixup.getOffset();
+
+ // For each byte of the fragment that the fixup touches, mask in the bits
+ // from the fixup value. The Value has been "split up" into the
+ // appropriate bitfields above.
+ for (unsigned i = 0; i != 4; ++i)
+ Data[Offset + i] |= uint8_t((Value >> ((4 - i - 1)*8)) & 0xff);
+
+ }
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(OSType);
+ return createSparcELFObjectWriter(OS, is64Bit(), OSABI);
+ }
+ };
+
+} // end anonymous namespace
+
+
+MCAsmBackend *llvm::createSparcAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT,
+ StringRef CPU) {
+ return new ELFSparcAsmBackend(T, Triple(TT).getOS());
+}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h
deleted file mode 100644
index f3caeaa..0000000
--- a/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//===-- SparcBaseInfo.h - Top level definitions for Sparc ---- --*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains small standalone helper functions and enum definitions
-// for the Sparc target useful for the compiler back-end and the MC libraries.
-// As such, it deliberately does not include references to LLVM core code gen
-// types, passes, etc..
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef SPARCBASEINFO_H
-#define SPARCBASEINFO_H
-
-namespace llvm {
-
-/// SPII - This namespace holds target specific flags for instruction info.
-namespace SPII {
-
-/// Target Operand Flags. Sparc specific TargetFlags for MachineOperands and
-/// SDNodes.
-enum TOF {
- MO_NO_FLAG,
-
- // Extract the low 10 bits of an address.
- // Assembler: %lo(addr)
- MO_LO,
-
- // Extract bits 31-10 of an address. Only for sethi.
- // Assembler: %hi(addr) or %lm(addr)
- MO_HI,
-
- // Extract bits 43-22 of an adress. Only for sethi.
- // Assembler: %h44(addr)
- MO_H44,
-
- // Extract bits 21-12 of an address.
- // Assembler: %m44(addr)
- MO_M44,
-
- // Extract bits 11-0 of an address.
- // Assembler: %l44(addr)
- MO_L44,
-
- // Extract bits 63-42 of an address. Only for sethi.
- // Assembler: %hh(addr)
- MO_HH,
-
- // Extract bits 41-32 of an address.
- // Assembler: %hm(addr)
- MO_HM,
-
- // TargetFlags for Thread Local Storage.
- MO_TLS_GD_HI22,
- MO_TLS_GD_LO10,
- MO_TLS_GD_ADD,
- MO_TLS_GD_CALL,
- MO_TLS_LDM_HI22,
- MO_TLS_LDM_LO10,
- MO_TLS_LDM_ADD,
- MO_TLS_LDM_CALL,
- MO_TLS_LDO_HIX22,
- MO_TLS_LDO_LOX10,
- MO_TLS_LDO_ADD,
- MO_TLS_IE_HI22,
- MO_TLS_IE_LO10,
- MO_TLS_IE_LD,
- MO_TLS_IE_LDX,
- MO_TLS_IE_ADD,
- MO_TLS_LE_HIX22,
- MO_TLS_LE_LOX10
-};
-
-} // end namespace SPII
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
new file mode 100644
index 0000000..5ba82f1
--- /dev/null
+++ b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -0,0 +1,112 @@
+//===-- SparcELFObjectWriter.cpp - Sparc ELF Writer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SparcFixupKinds.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+ class SparcELFObjectWriter : public MCELFObjectTargetWriter {
+ public:
+ SparcELFObjectWriter(bool Is64Bit, uint8_t OSABI)
+ : MCELFObjectTargetWriter(Is64Bit, OSABI,
+ Is64Bit ? ELF::EM_SPARCV9 : ELF::EM_SPARC,
+ /*HasRelocationAddend*/ true) {}
+
+ virtual ~SparcELFObjectWriter() {}
+ protected:
+ unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel) const override;
+ };
+}
+
+unsigned SparcELFObjectWriter::GetRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+
+ if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Fixup.getValue())) {
+ if (SExpr->getKind() == SparcMCExpr::VK_Sparc_R_DISP32)
+ return ELF::R_SPARC_DISP32;
+ }
+
+ if (IsPCRel) {
+ switch((unsigned)Fixup.getKind()) {
+ default:
+ llvm_unreachable("Unimplemented fixup -> relocation");
+ case FK_Data_1: return ELF::R_SPARC_DISP8;
+ case FK_Data_2: return ELF::R_SPARC_DISP16;
+ case FK_Data_4: return ELF::R_SPARC_DISP32;
+ case FK_Data_8: return ELF::R_SPARC_DISP64;
+ case Sparc::fixup_sparc_call30: return ELF::R_SPARC_WDISP30;
+ case Sparc::fixup_sparc_br22: return ELF::R_SPARC_WDISP22;
+ case Sparc::fixup_sparc_br19: return ELF::R_SPARC_WDISP19;
+ case Sparc::fixup_sparc_pc22: return ELF::R_SPARC_PC22;
+ case Sparc::fixup_sparc_pc10: return ELF::R_SPARC_PC10;
+ case Sparc::fixup_sparc_wplt30: return ELF::R_SPARC_WPLT30;
+ }
+ }
+
+ switch((unsigned)Fixup.getKind()) {
+ default:
+ llvm_unreachable("Unimplemented fixup -> relocation");
+ case FK_Data_1: return ELF::R_SPARC_8;
+ case FK_Data_2: return ((Fixup.getOffset() % 2)
+ ? ELF::R_SPARC_UA16
+ : ELF::R_SPARC_16);
+ case FK_Data_4: return ((Fixup.getOffset() % 4)
+ ? ELF::R_SPARC_UA32
+ : ELF::R_SPARC_32);
+ case FK_Data_8: return ((Fixup.getOffset() % 8)
+ ? ELF::R_SPARC_UA64
+ : ELF::R_SPARC_64);
+ case Sparc::fixup_sparc_hi22: return ELF::R_SPARC_HI22;
+ case Sparc::fixup_sparc_lo10: return ELF::R_SPARC_LO10;
+ case Sparc::fixup_sparc_h44: return ELF::R_SPARC_H44;
+ case Sparc::fixup_sparc_m44: return ELF::R_SPARC_M44;
+ case Sparc::fixup_sparc_l44: return ELF::R_SPARC_L44;
+ case Sparc::fixup_sparc_hh: return ELF::R_SPARC_HH22;
+ case Sparc::fixup_sparc_hm: return ELF::R_SPARC_HM10;
+ case Sparc::fixup_sparc_got22: return ELF::R_SPARC_GOT22;
+ case Sparc::fixup_sparc_got10: return ELF::R_SPARC_GOT10;
+ case Sparc::fixup_sparc_tls_gd_hi22: return ELF::R_SPARC_TLS_GD_HI22;
+ case Sparc::fixup_sparc_tls_gd_lo10: return ELF::R_SPARC_TLS_GD_LO10;
+ case Sparc::fixup_sparc_tls_gd_add: return ELF::R_SPARC_TLS_GD_ADD;
+ case Sparc::fixup_sparc_tls_gd_call: return ELF::R_SPARC_TLS_GD_CALL;
+ case Sparc::fixup_sparc_tls_ldm_hi22: return ELF::R_SPARC_TLS_LDM_HI22;
+ case Sparc::fixup_sparc_tls_ldm_lo10: return ELF::R_SPARC_TLS_LDM_LO10;
+ case Sparc::fixup_sparc_tls_ldm_add: return ELF::R_SPARC_TLS_LDM_ADD;
+ case Sparc::fixup_sparc_tls_ldm_call: return ELF::R_SPARC_TLS_LDM_CALL;
+ case Sparc::fixup_sparc_tls_ldo_hix22: return ELF::R_SPARC_TLS_LDO_HIX22;
+ case Sparc::fixup_sparc_tls_ldo_lox10: return ELF::R_SPARC_TLS_LDO_LOX10;
+ case Sparc::fixup_sparc_tls_ldo_add: return ELF::R_SPARC_TLS_LDO_ADD;
+ case Sparc::fixup_sparc_tls_ie_hi22: return ELF::R_SPARC_TLS_IE_HI22;
+ case Sparc::fixup_sparc_tls_ie_lo10: return ELF::R_SPARC_TLS_IE_LO10;
+ case Sparc::fixup_sparc_tls_ie_ld: return ELF::R_SPARC_TLS_IE_LD;
+ case Sparc::fixup_sparc_tls_ie_ldx: return ELF::R_SPARC_TLS_IE_LDX;
+ case Sparc::fixup_sparc_tls_ie_add: return ELF::R_SPARC_TLS_IE_ADD;
+ case Sparc::fixup_sparc_tls_le_hix22: return ELF::R_SPARC_TLS_LE_HIX22;
+ case Sparc::fixup_sparc_tls_le_lox10: return ELF::R_SPARC_TLS_LE_LOX10;
+ }
+
+ return ELF::R_SPARC_NONE;
+}
+
+MCObjectWriter *llvm::createSparcELFObjectWriter(raw_ostream &OS,
+ bool Is64Bit,
+ uint8_t OSABI) {
+ MCELFObjectTargetWriter *MOTW = new SparcELFObjectWriter(Is64Bit, OSABI);
+ return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/false);
+}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h b/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
new file mode 100644
index 0000000..d42bcee
--- /dev/null
+++ b/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
@@ -0,0 +1,97 @@
+//===-- SparcFixupKinds.h - Sparc Specific Fixup Entries --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SPARC_FIXUPKINDS_H
+#define LLVM_SPARC_FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+ namespace Sparc {
+ enum Fixups {
+ // fixup_sparc_call30 - 30-bit PC relative relocation for call
+ fixup_sparc_call30 = FirstTargetFixupKind,
+
+ /// fixup_sparc_br22 - 22-bit PC relative relocation for
+ /// branches
+ fixup_sparc_br22,
+
+ /// fixup_sparc_br19 - 19-bit PC relative relocation for
+ /// branches on icc/xcc
+ fixup_sparc_br19,
+
+      /// fixup_sparc_br16_2, fixup_sparc_br16_14 - 16-bit PC relative
+      /// relocation for branches on register contents (bpr), split into a
+      /// 2-bit and a 14-bit part
+ fixup_sparc_br16_2,
+ fixup_sparc_br16_14,
+
+ /// fixup_sparc_hi22 - 22-bit fixup corresponding to %hi(foo)
+ /// for sethi
+ fixup_sparc_hi22,
+
+ /// fixup_sparc_lo10 - 10-bit fixup corresponding to %lo(foo)
+ fixup_sparc_lo10,
+
+ /// fixup_sparc_h44 - 22-bit fixup corresponding to %h44(foo)
+ fixup_sparc_h44,
+
+ /// fixup_sparc_m44 - 10-bit fixup corresponding to %m44(foo)
+ fixup_sparc_m44,
+
+ /// fixup_sparc_l44 - 12-bit fixup corresponding to %l44(foo)
+ fixup_sparc_l44,
+
+ /// fixup_sparc_hh - 22-bit fixup corresponding to %hh(foo)
+ fixup_sparc_hh,
+
+ /// fixup_sparc_hm - 10-bit fixup corresponding to %hm(foo)
+ fixup_sparc_hm,
+
+ /// fixup_sparc_pc22 - 22-bit fixup corresponding to %pc22(foo)
+ fixup_sparc_pc22,
+
+ /// fixup_sparc_pc10 - 10-bit fixup corresponding to %pc10(foo)
+ fixup_sparc_pc10,
+
+ /// fixup_sparc_got22 - 22-bit fixup corresponding to %got22(foo)
+ fixup_sparc_got22,
+
+ /// fixup_sparc_got10 - 10-bit fixup corresponding to %got10(foo)
+ fixup_sparc_got10,
+
+ /// fixup_sparc_wplt30
+ fixup_sparc_wplt30,
+
+ /// fixups for Thread Local Storage
+ fixup_sparc_tls_gd_hi22,
+ fixup_sparc_tls_gd_lo10,
+ fixup_sparc_tls_gd_add,
+ fixup_sparc_tls_gd_call,
+ fixup_sparc_tls_ldm_hi22,
+ fixup_sparc_tls_ldm_lo10,
+ fixup_sparc_tls_ldm_add,
+ fixup_sparc_tls_ldm_call,
+ fixup_sparc_tls_ldo_hix22,
+ fixup_sparc_tls_ldo_lox10,
+ fixup_sparc_tls_ldo_add,
+ fixup_sparc_tls_ie_hi22,
+ fixup_sparc_tls_ie_lo10,
+ fixup_sparc_tls_ie_ld,
+ fixup_sparc_tls_ie_ldx,
+ fixup_sparc_tls_ie_add,
+ fixup_sparc_tls_le_hix22,
+ fixup_sparc_tls_le_lox10,
+
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+ };
+ }
+}
+
+#endif
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index baac36b..ef5f8ce 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -12,7 +12,9 @@
//===----------------------------------------------------------------------===//
#include "SparcMCAsmInfo.h"
+#include "SparcMCExpr.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCStreamer.h"
using namespace llvm;
@@ -41,7 +43,31 @@ SparcELFMCAsmInfo::SparcELFMCAsmInfo(StringRef TT) {
SunStyleELFSectionSwitchSyntax = true;
UsesELFSectionDirectiveForBSS = true;
- PrivateGlobalPrefix = ".L";
+ if (TheTriple.getOS() == llvm::Triple::Solaris)
+ UseIntegratedAssembler = true;
}
+const MCExpr*
+SparcELFMCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym,
+ unsigned Encoding,
+ MCStreamer &Streamer) const {
+ if (Encoding & dwarf::DW_EH_PE_pcrel) {
+ MCContext &Ctx = Streamer.getContext();
+ return SparcMCExpr::Create(SparcMCExpr::VK_Sparc_R_DISP32,
+ MCSymbolRefExpr::Create(Sym, Ctx), Ctx);
+ }
+
+ return MCAsmInfo::getExprForPersonalitySymbol(Sym, Encoding, Streamer);
+}
+const MCExpr*
+SparcELFMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
+ unsigned Encoding,
+ MCStreamer &Streamer) const {
+ if (Encoding & dwarf::DW_EH_PE_pcrel) {
+ MCContext &Ctx = Streamer.getContext();
+ return SparcMCExpr::Create(SparcMCExpr::VK_Sparc_R_DISP32,
+ MCSymbolRefExpr::Create(Sym, Ctx), Ctx);
+ }
+ return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer);
+}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index 1e58e37..d53d09d 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -17,13 +17,20 @@
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
- class StringRef;
+class StringRef;
- class SparcELFMCAsmInfo : public MCAsmInfoELF {
- virtual void anchor();
- public:
- explicit SparcELFMCAsmInfo(StringRef TT);
- };
+class SparcELFMCAsmInfo : public MCAsmInfoELF {
+ virtual void anchor();
+public:
+ explicit SparcELFMCAsmInfo(StringRef TT);
+ virtual const MCExpr* getExprForPersonalitySymbol(const MCSymbol *Sym,
+ unsigned Encoding,
+ MCStreamer &Streamer) const;
+ virtual const MCExpr* getExprForFDESymbol(const MCSymbol *Sym,
+ unsigned Encoding,
+ MCStreamer &Streamer) const;
+
+};
} // namespace llvm
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
new file mode 100644
index 0000000..310fbd9
--- /dev/null
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -0,0 +1,217 @@
+//===-- SparcMCCodeEmitter.cpp - Convert Sparc code to machine code -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SparcMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "SparcMCExpr.h"
+#include "MCTargetDesc/SparcFixupKinds.h"
+#include "SparcMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class SparcMCCodeEmitter : public MCCodeEmitter {
+ SparcMCCodeEmitter(const SparcMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const SparcMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ MCContext &Ctx;
+
+public:
+ SparcMCCodeEmitter(MCContext &ctx): Ctx(ctx) {}
+
+ ~SparcMCCodeEmitter() {}
+
+ void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ // getBinaryCodeForInstr - TableGen'erated function for getting the
+ // binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMachineOpValue - Return binary encoding of operand. If the machine
+ /// operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getCallTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getBranchPredTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ unsigned getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+};
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new SparcMCCodeEmitter(Ctx);
+}
+
+void SparcMCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+
+ // Output the constant in big endian byte order.
+ for (unsigned i = 0; i != 4; ++i) {
+ OS << (char)(Bits >> 24);
+ Bits <<= 8;
+ }
+ unsigned tlsOpNo = 0;
+ switch (MI.getOpcode()) {
+ default: break;
+ case SP::TLS_CALL: tlsOpNo = 1; break;
+ case SP::TLS_ADDrr:
+ case SP::TLS_ADDXrr:
+ case SP::TLS_LDrr:
+ case SP::TLS_LDXrr: tlsOpNo = 3; break;
+ }
+ if (tlsOpNo != 0) {
+ const MCOperand &MO = MI.getOperand(tlsOpNo);
+ uint64_t op = getMachineOpValue(MI, MO, Fixups, STI);
+ assert(op == 0 && "Unexpected operand value!");
+ (void)op; // suppress warning.
+ }
+
+ ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+
+unsigned SparcMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+
+ if (MO.isReg())
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+ if (MO.isImm())
+ return MO.getImm();
+
+ assert(MO.isExpr());
+ const MCExpr *Expr = MO.getExpr();
+ if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
+ MCFixupKind Kind = (MCFixupKind)SExpr->getFixupKind();
+ Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+ return 0;
+ }
+
+ int64_t Res;
+ if (Expr->EvaluateAsAbsolute(Res))
+ return Res;
+
+ assert(0 && "Unhandled expression!");
+ return 0;
+}
+
+unsigned SparcMCCodeEmitter::
+getCallTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm())
+ return getMachineOpValue(MI, MO, Fixups, STI);
+
+ if (MI.getOpcode() == SP::TLS_CALL) {
+    // No fixups for __tls_get_addr. Fixups for the TLS symbol are emitted in
+    // EncodeInstruction.
+#ifndef NDEBUG
+ // Verify that the callee is actually __tls_get_addr.
+ const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(MO.getExpr());
+ assert(SExpr && SExpr->getSubExpr()->getKind() == MCExpr::SymbolRef &&
+ "Unexpected expression in TLS_CALL");
+ const MCSymbolRefExpr *SymExpr = cast<MCSymbolRefExpr>(SExpr->getSubExpr());
+ assert(SymExpr->getSymbol().getName() == "__tls_get_addr" &&
+ "Unexpected function for TLS_CALL");
+#endif
+ return 0;
+ }
+
+ MCFixupKind fixupKind = (MCFixupKind)Sparc::fixup_sparc_call30;
+
+ if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(MO.getExpr())) {
+ if (SExpr->getKind() == SparcMCExpr::VK_Sparc_WPLT30)
+ fixupKind = (MCFixupKind)Sparc::fixup_sparc_wplt30;
+ }
+
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(), fixupKind));
+
+ return 0;
+}
+
+unsigned SparcMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm())
+ return getMachineOpValue(MI, MO, Fixups, STI);
+
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+ (MCFixupKind)Sparc::fixup_sparc_br22));
+ return 0;
+}
+
+unsigned SparcMCCodeEmitter::
+getBranchPredTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm())
+ return getMachineOpValue(MI, MO, Fixups, STI);
+
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+ (MCFixupKind)Sparc::fixup_sparc_br19));
+ return 0;
+}
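+// Explanatory note (not part of the original patch): BPr branch-on-register
+// instructions split their 16-bit displacement across two fields of the
+// instruction word, so two fixups (a 2-bit high part and a 14-bit low part)
+// are recorded for a single target operand.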
+unsigned SparcMCCodeEmitter::
+getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm())
+ return getMachineOpValue(MI, MO, Fixups, STI);
+
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+ (MCFixupKind)Sparc::fixup_sparc_br16_2));
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+ (MCFixupKind)Sparc::fixup_sparc_br16_14));
+
+ return 0;
+}
+
+
+
+#include "SparcGenMCCodeEmitter.inc"
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
new file mode 100644
index 0000000..e6b2aca
--- /dev/null
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -0,0 +1,252 @@
+//===-- SparcMCExpr.cpp - Sparc specific MC expression classes --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the assembly expression modifiers
+// accepted by the Sparc architecture (e.g. "%hi", "%lo", ...).
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sparcmcexpr"
+#include "SparcMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Object/ELF.h"
+
+
+using namespace llvm;
+
+const SparcMCExpr*
+SparcMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+ MCContext &Ctx) {
+ return new (Ctx) SparcMCExpr(Kind, Expr);
+}
+
+
+
+void SparcMCExpr::PrintImpl(raw_ostream &OS) const
+{
+
+ bool closeParen = printVariantKind(OS, Kind);
+
+ const MCExpr *Expr = getSubExpr();
+ Expr->print(OS);
+
+ if (closeParen)
+ OS << ')';
+}
+
+bool SparcMCExpr::printVariantKind(raw_ostream &OS, VariantKind Kind)
+{
+ bool closeParen = true;
+ switch (Kind) {
+ case VK_Sparc_None: closeParen = false; break;
+ case VK_Sparc_LO: OS << "%lo("; break;
+ case VK_Sparc_HI: OS << "%hi("; break;
+ case VK_Sparc_H44: OS << "%h44("; break;
+ case VK_Sparc_M44: OS << "%m44("; break;
+ case VK_Sparc_L44: OS << "%l44("; break;
+ case VK_Sparc_HH: OS << "%hh("; break;
+ case VK_Sparc_HM: OS << "%hm("; break;
+ // FIXME: use %pc22/%pc10, if system assembler supports them.
+ case VK_Sparc_PC22: OS << "%hi("; break;
+ case VK_Sparc_PC10: OS << "%lo("; break;
+ // FIXME: use %got22/%got10, if system assembler supports them.
+ case VK_Sparc_GOT22: OS << "%hi("; break;
+ case VK_Sparc_GOT10: OS << "%lo("; break;
+ case VK_Sparc_WPLT30: closeParen = false; break;
+ case VK_Sparc_R_DISP32: OS << "%r_disp32("; break;
+ case VK_Sparc_TLS_GD_HI22: OS << "%tgd_hi22("; break;
+ case VK_Sparc_TLS_GD_LO10: OS << "%tgd_lo10("; break;
+ case VK_Sparc_TLS_GD_ADD: OS << "%tgd_add("; break;
+ case VK_Sparc_TLS_GD_CALL: OS << "%tgd_call("; break;
+ case VK_Sparc_TLS_LDM_HI22: OS << "%tldm_hi22("; break;
+ case VK_Sparc_TLS_LDM_LO10: OS << "%tldm_lo10("; break;
+ case VK_Sparc_TLS_LDM_ADD: OS << "%tldm_add("; break;
+ case VK_Sparc_TLS_LDM_CALL: OS << "%tldm_call("; break;
+ case VK_Sparc_TLS_LDO_HIX22: OS << "%tldo_hix22("; break;
+ case VK_Sparc_TLS_LDO_LOX10: OS << "%tldo_lox10("; break;
+ case VK_Sparc_TLS_LDO_ADD: OS << "%tldo_add("; break;
+ case VK_Sparc_TLS_IE_HI22: OS << "%tie_hi22("; break;
+ case VK_Sparc_TLS_IE_LO10: OS << "%tie_lo10("; break;
+ case VK_Sparc_TLS_IE_LD: OS << "%tie_ld("; break;
+ case VK_Sparc_TLS_IE_LDX: OS << "%tie_ldx("; break;
+ case VK_Sparc_TLS_IE_ADD: OS << "%tie_add("; break;
+ case VK_Sparc_TLS_LE_HIX22: OS << "%tle_hix22("; break;
+ case VK_Sparc_TLS_LE_LOX10: OS << "%tle_lox10("; break;
+ }
+ return closeParen;
+}
+
+SparcMCExpr::VariantKind SparcMCExpr::parseVariantKind(StringRef name)
+{
+ return StringSwitch<SparcMCExpr::VariantKind>(name)
+ .Case("lo", VK_Sparc_LO)
+ .Case("hi", VK_Sparc_HI)
+ .Case("h44", VK_Sparc_H44)
+ .Case("m44", VK_Sparc_M44)
+ .Case("l44", VK_Sparc_L44)
+ .Case("hh", VK_Sparc_HH)
+ .Case("hm", VK_Sparc_HM)
+ .Case("pc22", VK_Sparc_PC22)
+ .Case("pc10", VK_Sparc_PC10)
+ .Case("got22", VK_Sparc_GOT22)
+ .Case("got10", VK_Sparc_GOT10)
+ .Case("r_disp32", VK_Sparc_R_DISP32)
+ .Case("tgd_hi22", VK_Sparc_TLS_GD_HI22)
+ .Case("tgd_lo10", VK_Sparc_TLS_GD_LO10)
+ .Case("tgd_add", VK_Sparc_TLS_GD_ADD)
+ .Case("tgd_call", VK_Sparc_TLS_GD_CALL)
+ .Case("tldm_hi22", VK_Sparc_TLS_LDM_HI22)
+ .Case("tldm_lo10", VK_Sparc_TLS_LDM_LO10)
+ .Case("tldm_add", VK_Sparc_TLS_LDM_ADD)
+ .Case("tldm_call", VK_Sparc_TLS_LDM_CALL)
+ .Case("tldo_hix22", VK_Sparc_TLS_LDO_HIX22)
+ .Case("tldo_lox10", VK_Sparc_TLS_LDO_LOX10)
+ .Case("tldo_add", VK_Sparc_TLS_LDO_ADD)
+ .Case("tie_hi22", VK_Sparc_TLS_IE_HI22)
+ .Case("tie_lo10", VK_Sparc_TLS_IE_LO10)
+ .Case("tie_ld", VK_Sparc_TLS_IE_LD)
+ .Case("tie_ldx", VK_Sparc_TLS_IE_LDX)
+ .Case("tie_add", VK_Sparc_TLS_IE_ADD)
+ .Case("tle_hix22", VK_Sparc_TLS_LE_HIX22)
+ .Case("tle_lox10", VK_Sparc_TLS_LE_LOX10)
+ .Default(VK_Sparc_None);
+}
+
+Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) {
+ switch (Kind) {
+ default: assert(0 && "Unhandled SparcMCExpr::VariantKind");
+ case VK_Sparc_LO: return Sparc::fixup_sparc_lo10;
+ case VK_Sparc_HI: return Sparc::fixup_sparc_hi22;
+ case VK_Sparc_H44: return Sparc::fixup_sparc_h44;
+ case VK_Sparc_M44: return Sparc::fixup_sparc_m44;
+ case VK_Sparc_L44: return Sparc::fixup_sparc_l44;
+ case VK_Sparc_HH: return Sparc::fixup_sparc_hh;
+ case VK_Sparc_HM: return Sparc::fixup_sparc_hm;
+ case VK_Sparc_PC22: return Sparc::fixup_sparc_pc22;
+ case VK_Sparc_PC10: return Sparc::fixup_sparc_pc10;
+ case VK_Sparc_GOT22: return Sparc::fixup_sparc_got22;
+ case VK_Sparc_GOT10: return Sparc::fixup_sparc_got10;
+ case VK_Sparc_WPLT30: return Sparc::fixup_sparc_wplt30;
+ case VK_Sparc_TLS_GD_HI22: return Sparc::fixup_sparc_tls_gd_hi22;
+ case VK_Sparc_TLS_GD_LO10: return Sparc::fixup_sparc_tls_gd_lo10;
+ case VK_Sparc_TLS_GD_ADD: return Sparc::fixup_sparc_tls_gd_add;
+ case VK_Sparc_TLS_GD_CALL: return Sparc::fixup_sparc_tls_gd_call;
+ case VK_Sparc_TLS_LDM_HI22: return Sparc::fixup_sparc_tls_ldm_hi22;
+ case VK_Sparc_TLS_LDM_LO10: return Sparc::fixup_sparc_tls_ldm_lo10;
+ case VK_Sparc_TLS_LDM_ADD: return Sparc::fixup_sparc_tls_ldm_add;
+ case VK_Sparc_TLS_LDM_CALL: return Sparc::fixup_sparc_tls_ldm_call;
+ case VK_Sparc_TLS_LDO_HIX22: return Sparc::fixup_sparc_tls_ldo_hix22;
+ case VK_Sparc_TLS_LDO_LOX10: return Sparc::fixup_sparc_tls_ldo_lox10;
+ case VK_Sparc_TLS_LDO_ADD: return Sparc::fixup_sparc_tls_ldo_add;
+ case VK_Sparc_TLS_IE_HI22: return Sparc::fixup_sparc_tls_ie_hi22;
+ case VK_Sparc_TLS_IE_LO10: return Sparc::fixup_sparc_tls_ie_lo10;
+ case VK_Sparc_TLS_IE_LD: return Sparc::fixup_sparc_tls_ie_ld;
+ case VK_Sparc_TLS_IE_LDX: return Sparc::fixup_sparc_tls_ie_ldx;
+ case VK_Sparc_TLS_IE_ADD: return Sparc::fixup_sparc_tls_ie_add;
+ case VK_Sparc_TLS_LE_HIX22: return Sparc::fixup_sparc_tls_le_hix22;
+ case VK_Sparc_TLS_LE_LOX10: return Sparc::fixup_sparc_tls_le_lox10;
+ }
+}
+
+bool
+SparcMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout) const {
+ return getSubExpr()->EvaluateAsRelocatable(Res, Layout);
+}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+ switch (Expr->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Can't handle nested target expr!");
+ break;
+
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+ fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+ fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+ break;
+ }
+
+ case MCExpr::SymbolRef: {
+ const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+ MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol());
+ MCELF::SetType(SD, ELF::STT_TLS);
+ break;
+ }
+
+ case MCExpr::Unary:
+ fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+ break;
+ }
+
+}
+
+void SparcMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+ switch(getKind()) {
+ default: return;
+ case VK_Sparc_TLS_GD_HI22:
+ case VK_Sparc_TLS_GD_LO10:
+ case VK_Sparc_TLS_GD_ADD:
+ case VK_Sparc_TLS_GD_CALL:
+ case VK_Sparc_TLS_LDM_HI22:
+ case VK_Sparc_TLS_LDM_LO10:
+ case VK_Sparc_TLS_LDM_ADD:
+ case VK_Sparc_TLS_LDM_CALL:
+ case VK_Sparc_TLS_LDO_HIX22:
+ case VK_Sparc_TLS_LDO_LOX10:
+ case VK_Sparc_TLS_LDO_ADD:
+ case VK_Sparc_TLS_IE_HI22:
+ case VK_Sparc_TLS_IE_LO10:
+ case VK_Sparc_TLS_IE_LD:
+ case VK_Sparc_TLS_IE_LDX:
+ case VK_Sparc_TLS_IE_ADD:
+ case VK_Sparc_TLS_LE_HIX22:
+ case VK_Sparc_TLS_LE_LOX10: break;
+ }
+ fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
+
+// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
+// that method should be made public?
+// FIXME: Really do the above, now that at least three other backends are using it.
+static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
+ switch (Value->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Can't handle nested target expr!");
+ break;
+
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+ AddValueSymbolsImpl(BE->getLHS(), Asm);
+ AddValueSymbolsImpl(BE->getRHS(), Asm);
+ break;
+ }
+
+ case MCExpr::SymbolRef:
+ Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
+ break;
+
+ case MCExpr::Unary:
+ AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void SparcMCExpr::AddValueSymbols(MCAssembler *Asm) const {
+ AddValueSymbolsImpl(getSubExpr(), Asm);
+}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
new file mode 100644
index 0000000..be6526e
--- /dev/null
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -0,0 +1,111 @@
+//====- SparcMCExpr.h - Sparc specific MC expression classes --*- C++ -*-=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Sparc-specific MCExprs, used for modifiers like
+// "%hi" or "%lo", etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SPARCMCEXPR_H
+#define LLVM_SPARCMCEXPR_H
+
+#include "SparcFixupKinds.h"
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class StringRef;
+class SparcMCExpr : public MCTargetExpr {
+public:
+ enum VariantKind {
+ VK_Sparc_None,
+ VK_Sparc_LO,
+ VK_Sparc_HI,
+ VK_Sparc_H44,
+ VK_Sparc_M44,
+ VK_Sparc_L44,
+ VK_Sparc_HH,
+ VK_Sparc_HM,
+ VK_Sparc_PC22,
+ VK_Sparc_PC10,
+ VK_Sparc_GOT22,
+ VK_Sparc_GOT10,
+ VK_Sparc_WPLT30,
+ VK_Sparc_R_DISP32,
+ VK_Sparc_TLS_GD_HI22,
+ VK_Sparc_TLS_GD_LO10,
+ VK_Sparc_TLS_GD_ADD,
+ VK_Sparc_TLS_GD_CALL,
+ VK_Sparc_TLS_LDM_HI22,
+ VK_Sparc_TLS_LDM_LO10,
+ VK_Sparc_TLS_LDM_ADD,
+ VK_Sparc_TLS_LDM_CALL,
+ VK_Sparc_TLS_LDO_HIX22,
+ VK_Sparc_TLS_LDO_LOX10,
+ VK_Sparc_TLS_LDO_ADD,
+ VK_Sparc_TLS_IE_HI22,
+ VK_Sparc_TLS_IE_LO10,
+ VK_Sparc_TLS_IE_LD,
+ VK_Sparc_TLS_IE_LDX,
+ VK_Sparc_TLS_IE_ADD,
+ VK_Sparc_TLS_LE_HIX22,
+ VK_Sparc_TLS_LE_LOX10
+ };
+
+private:
+ const VariantKind Kind;
+ const MCExpr *Expr;
+
+ explicit SparcMCExpr(VariantKind _Kind, const MCExpr *_Expr)
+ : Kind(_Kind), Expr(_Expr) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const SparcMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+ MCContext &Ctx);
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// getKind - Get the kind of this expression.
+ VariantKind getKind() const { return Kind; }
+
+ /// getSubExpr - Get the child of this expression.
+ const MCExpr *getSubExpr() const { return Expr; }
+
+ /// getFixupKind - Get the fixup kind of this expression.
+ Sparc::Fixups getFixupKind() const { return getFixupKind(Kind); }
+
+ /// @}
+ void PrintImpl(raw_ostream &OS) const;
+ bool EvaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout) const;
+ void AddValueSymbols(MCAssembler *) const;
+ const MCSection *FindAssociatedSection() const {
+ return getSubExpr()->FindAssociatedSection();
+ }
+
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const;
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+
+ static bool classof(const SparcMCExpr *) { return true; }
+
+ static VariantKind parseVariantKind(StringRef name);
+ static bool printVariantKind(raw_ostream &OS, VariantKind Kind);
+ static Sparc::Fixups getFixupKind(VariantKind Kind);
+};
+
+} // end namespace llvm.
+
+#endif
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index 1c64e1b..c69af56 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -12,7 +12,9 @@
//===----------------------------------------------------------------------===//
#include "SparcMCTargetDesc.h"
+#include "InstPrinter/SparcInstPrinter.h"
#include "SparcMCAsmInfo.h"
+#include "SparcTargetStreamer.h"
#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -31,6 +33,25 @@
using namespace llvm;
+
+static MCAsmInfo *createSparcMCAsmInfo(const MCRegisterInfo &MRI,
+ StringRef TT) {
+ MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT);
+ unsigned Reg = MRI.getDwarfRegNum(SP::O6, true);
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0);
+ MAI->addInitialFrameState(Inst);
+ return MAI;
+}
+
+static MCAsmInfo *createSparcV9MCAsmInfo(const MCRegisterInfo &MRI,
+ StringRef TT) {
+ MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT);
+ unsigned Reg = MRI.getDwarfRegNum(SP::O6, true);
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 2047);
+ MAI->addInitialFrameState(Inst);
+ return MAI;
+}
+
static MCInstrInfo *createSparcMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitSparcMCInstrInfo(X);
@@ -39,13 +60,16 @@ static MCInstrInfo *createSparcMCInstrInfo() {
static MCRegisterInfo *createSparcMCRegisterInfo(StringRef TT) {
MCRegisterInfo *X = new MCRegisterInfo();
- InitSparcMCRegisterInfo(X, SP::I7);
+ InitSparcMCRegisterInfo(X, SP::O7);
return X;
}
static MCSubtargetInfo *createSparcMCSubtargetInfo(StringRef TT, StringRef CPU,
StringRef FS) {
MCSubtargetInfo *X = new MCSubtargetInfo();
+ Triple TheTriple(TT);
+ if (CPU.empty())
+ CPU = (TheTriple.getArch() == Triple::sparcv9) ? "v9" : "v8";
InitSparcMCSubtargetInfo(X, TT, CPU, FS);
return X;
}
@@ -66,9 +90,13 @@ static MCCodeGenInfo *createSparcMCCodeGenInfo(StringRef TT, Reloc::Model RM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
- // The default 32-bit code model is abs32/pic32.
- if (CM == CodeModel::Default)
- CM = RM == Reloc::PIC_ ? CodeModel::Medium : CodeModel::Small;
+ // The default 32-bit code model is abs32/pic32 and the default 32-bit
+ // code model for JIT is abs32.
+ switch (CM) {
+ default: break;
+ case CodeModel::Default:
+ case CodeModel::JITDefault: CM = CodeModel::Small; break;
+ }
X->InitMCCodeGenInfo(RM, CM, OL);
return X;
@@ -79,17 +107,59 @@ static MCCodeGenInfo *createSparcV9MCCodeGenInfo(StringRef TT, Reloc::Model RM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
- // The default 64-bit code model is abs44/pic32.
- if (CM == CodeModel::Default)
- CM = CodeModel::Medium;
+ // The default 64-bit code model is abs44/pic32 and the default 64-bit
+ // code model for JIT is abs64.
+ switch (CM) {
+ default: break;
+ case CodeModel::Default:
+ CM = RM == Reloc::PIC_ ? CodeModel::Small : CodeModel::Medium;
+ break;
+ case CodeModel::JITDefault:
+ CM = CodeModel::Large;
+ break;
+ }
X->InitMCCodeGenInfo(RM, CM, OL);
return X;
}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+ MCContext &Context, MCAsmBackend &MAB,
+ raw_ostream &OS, MCCodeEmitter *Emitter,
+ const MCSubtargetInfo &STI, bool RelaxAll,
+ bool NoExecStack) {
+ MCStreamer *S =
+ createELFStreamer(Context, MAB, OS, Emitter, RelaxAll, NoExecStack);
+ new SparcTargetELFStreamer(*S);
+ return S;
+}
+
+static MCStreamer *
+createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+ bool isVerboseAsm, bool useCFI, bool useDwarfDirectory,
+ MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+ MCAsmBackend *TAB, bool ShowInst) {
+
+ MCStreamer *S =
+ llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory,
+ InstPrint, CE, TAB, ShowInst);
+ new SparcTargetAsmStreamer(*S, OS);
+ return S;
+}
+
+static MCInstPrinter *createSparcMCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI) {
+ return new SparcInstPrinter(MAI, MII, MRI, STI);
+}
+
extern "C" void LLVMInitializeSparcTargetMC() {
// Register the MC asm info.
- RegisterMCAsmInfo<SparcELFMCAsmInfo> X(TheSparcTarget);
- RegisterMCAsmInfo<SparcELFMCAsmInfo> Y(TheSparcV9Target);
+ RegisterMCAsmInfoFn X(TheSparcTarget, createSparcMCAsmInfo);
+ RegisterMCAsmInfoFn Y(TheSparcV9Target, createSparcV9MCAsmInfo);
// Register the MC codegen info.
TargetRegistry::RegisterMCCodeGenInfo(TheSparcTarget,
@@ -99,11 +169,46 @@ extern "C" void LLVMInitializeSparcTargetMC() {
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(TheSparcTarget, createSparcMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheSparcV9Target, createSparcMCInstrInfo);
// Register the MC register info.
TargetRegistry::RegisterMCRegInfo(TheSparcTarget, createSparcMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheSparcV9Target,
+ createSparcMCRegisterInfo);
// Register the MC subtarget info.
TargetRegistry::RegisterMCSubtargetInfo(TheSparcTarget,
createSparcMCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheSparcV9Target,
+ createSparcMCSubtargetInfo);
+
+ // Register the MC Code Emitter.
+ TargetRegistry::RegisterMCCodeEmitter(TheSparcTarget,
+ createSparcMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheSparcV9Target,
+ createSparcMCCodeEmitter);
+
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(TheSparcTarget,
+ createSparcAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheSparcV9Target,
+ createSparcAsmBackend);
+
+ // Register the object streamer.
+ TargetRegistry::RegisterMCObjectStreamer(TheSparcTarget,
+ createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheSparcV9Target,
+ createMCStreamer);
+
+ // Register the asm streamer.
+ TargetRegistry::RegisterAsmStreamer(TheSparcTarget,
+ createMCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheSparcV9Target,
+ createMCAsmStreamer);
+
+ // Register the MCInstPrinter
+ TargetRegistry::RegisterMCInstPrinter(TheSparcTarget,
+ createSparcMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheSparcV9Target,
+ createSparcMCInstPrinter);
}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
index cba775a..c8029a8 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -14,12 +14,34 @@
#ifndef SPARCMCTARGETDESC_H
#define SPARCMCTARGETDESC_H
+#include "llvm/Support/DataTypes.h"
+
namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
class Target;
+class StringRef;
+class raw_ostream;
extern Target TheSparcTarget;
extern Target TheSparcV9Target;
+MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx);
+MCAsmBackend *createSparcAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT,
+ StringRef CPU);
+MCObjectWriter *createSparcELFObjectWriter(raw_ostream &OS,
+ bool Is64Bit,
+ uint8_t OSABI);
} // End llvm namespace
// Defines symbolic names for Sparc registers. This defines a mapping from
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp b/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
new file mode 100644
index 0000000..94af791
--- /dev/null
+++ b/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
@@ -0,0 +1,46 @@
+//===-- SparcTargetStreamer.cpp - Sparc Target Streamer Methods -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Sparc specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetStreamer.h"
+#include "InstPrinter/SparcInstPrinter.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+SparcTargetStreamer::SparcTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+void SparcTargetStreamer::anchor() {}
+
+SparcTargetAsmStreamer::SparcTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS)
+ : SparcTargetStreamer(S), OS(OS) {}
+
+void SparcTargetAsmStreamer::emitSparcRegisterIgnore(unsigned reg) {
+ OS << "\t.register "
+ << "%" << StringRef(SparcInstPrinter::getRegisterName(reg)).lower()
+ << ", #ignore\n";
+}
+
+void SparcTargetAsmStreamer::emitSparcRegisterScratch(unsigned reg) {
+ OS << "\t.register "
+ << "%" << StringRef(SparcInstPrinter::getRegisterName(reg)).lower()
+ << ", #scratch\n";
+}
+
+SparcTargetELFStreamer::SparcTargetELFStreamer(MCStreamer &S)
+ : SparcTargetStreamer(S) {}
+
+MCELFStreamer &SparcTargetELFStreamer::getStreamer() {
+ return static_cast<MCELFStreamer &>(Streamer);
+}
diff --git a/lib/Target/Sparc/Makefile b/lib/Target/Sparc/Makefile
index c171db7..bcc0291 100644
--- a/lib/Target/Sparc/Makefile
+++ b/lib/Target/Sparc/Makefile
@@ -13,11 +13,12 @@ TARGET = Sparc
# Make sure that tblgen is run, first thing.
BUILT_SOURCES = SparcGenRegisterInfo.inc SparcGenInstrInfo.inc \
- SparcGenAsmWriter.inc SparcGenDAGISel.inc \
+ SparcGenAsmWriter.inc SparcGenAsmMatcher.inc \
+ SparcGenDAGISel.inc SparcGenDisassemblerTables.inc \
SparcGenSubtargetInfo.inc SparcGenCallingConv.inc \
- SparcGenCodeEmitter.inc
+ SparcGenCodeEmitter.inc SparcGenMCCodeEmitter.inc
-DIRS = TargetInfo MCTargetDesc
+DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
index f44b604..de20aaa 100644
--- a/lib/Target/Sparc/Sparc.h
+++ b/lib/Target/Sparc/Sparc.h
@@ -23,12 +23,18 @@ namespace llvm {
class FunctionPass;
class SparcTargetMachine;
class formatted_raw_ostream;
+ class AsmPrinter;
+ class MCInst;
+ class MachineInstr;
FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
FunctionPass *createSparcJITCodeEmitterPass(SparcTargetMachine &TM,
JITCodeEmitter &JCE);
+ void LowerSparcMachineInstrToMCInst(const MachineInstr *MI,
+ MCInst &OutMI,
+ AsmPrinter &AP);
} // end namespace llvm;
namespace llvm {
@@ -36,8 +42,8 @@ namespace llvm {
// values must be kept in sync with the ones in the .td file.
namespace SPCC {
enum CondCodes {
- //ICC_A = 8 , // Always
- //ICC_N = 0 , // Never
+ ICC_A = 8 , // Always
+ ICC_N = 0 , // Never
ICC_NE = 9 , // Not Equal
ICC_E = 1 , // Equal
ICC_G = 10 , // Greater
@@ -53,8 +59,8 @@ namespace llvm {
ICC_VC = 15 , // Overflow Clear
ICC_VS = 7 , // Overflow Set
- //FCC_A = 8+16, // Always
- //FCC_N = 0+16, // Never
+ FCC_A = 8+16, // Always
+ FCC_N = 0+16, // Never
FCC_U = 7+16, // Unordered
FCC_G = 6+16, // Greater
FCC_UG = 5+16, // Unordered or Greater
@@ -74,6 +80,8 @@ namespace llvm {
inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) {
switch (CC) {
+ case SPCC::ICC_A: return "a";
+ case SPCC::ICC_N: return "n";
case SPCC::ICC_NE: return "ne";
case SPCC::ICC_E: return "e";
case SPCC::ICC_G: return "g";
@@ -88,6 +96,8 @@ namespace llvm {
case SPCC::ICC_NEG: return "neg";
case SPCC::ICC_VC: return "vc";
case SPCC::ICC_VS: return "vs";
+ case SPCC::FCC_A: return "a";
+ case SPCC::FCC_N: return "n";
case SPCC::FCC_U: return "u";
case SPCC::FCC_G: return "g";
case SPCC::FCC_UG: return "ug";
diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td
index 0df48f6..3159a46 100644
--- a/lib/Target/Sparc/Sparc.td
+++ b/lib/Target/Sparc/Sparc.td
@@ -29,11 +29,20 @@ def FeatureV8Deprecated
def FeatureVIS
: SubtargetFeature<"vis", "IsVIS", "true",
"Enable UltraSPARC Visual Instruction Set extensions">;
+def FeatureVIS2
+ : SubtargetFeature<"vis2", "IsVIS2", "true",
+ "Enable Visual Instruction Set extensions II">;
+def FeatureVIS3
+ : SubtargetFeature<"vis3", "IsVIS3", "true",
+ "Enable Visual Instruction Set extensions III">;
def FeatureHardQuad
: SubtargetFeature<"hard-quad-float", "HasHardQuad", "true",
"Enable quad-word floating point instructions">;
+def UsePopc : SubtargetFeature<"popc", "UsePopc", "true",
+ "Use the popc (population count) instruction">;
+
//===----------------------------------------------------------------------===//
// Register File, Calling Conv, Instruction Descriptions
//===----------------------------------------------------------------------===//
@@ -44,6 +53,10 @@ include "SparcInstrInfo.td"
def SparcInstrInfo : InstrInfo;
+def SparcAsmParser : AsmParser {
+ bit ShouldEmitMatchRegisterName = 0;
+}
+
//===----------------------------------------------------------------------===//
// SPARC processors supported.
//===----------------------------------------------------------------------===//
@@ -52,6 +65,7 @@ class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
def : Proc<"generic", []>;
+def : Proc<"v7", []>;
def : Proc<"v8", []>;
def : Proc<"supersparc", []>;
def : Proc<"sparclite", []>;
@@ -61,9 +75,17 @@ def : Proc<"sparclite86x", []>;
def : Proc<"sparclet", []>;
def : Proc<"tsc701", []>;
def : Proc<"v9", [FeatureV9]>;
-def : Proc<"ultrasparc", [FeatureV9, FeatureV8Deprecated]>;
-def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated]>;
-def : Proc<"ultrasparc3-vis", [FeatureV9, FeatureV8Deprecated, FeatureVIS]>;
+def : Proc<"ultrasparc", [FeatureV9, FeatureV8Deprecated, FeatureVIS]>;
+def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated, FeatureVIS,
+ FeatureVIS2]>;
+def : Proc<"niagara", [FeatureV9, FeatureV8Deprecated, FeatureVIS,
+ FeatureVIS2]>;
+def : Proc<"niagara2", [FeatureV9, FeatureV8Deprecated, UsePopc,
+ FeatureVIS, FeatureVIS2]>;
+def : Proc<"niagara3", [FeatureV9, FeatureV8Deprecated, UsePopc,
+ FeatureVIS, FeatureVIS2]>;
+def : Proc<"niagara4", [FeatureV9, FeatureV8Deprecated, UsePopc,
+ FeatureVIS, FeatureVIS2, FeatureVIS3]>;
//===----------------------------------------------------------------------===//
@@ -73,4 +95,5 @@ def : Proc<"ultrasparc3-vis", [FeatureV9, FeatureV8Deprecated, FeatureVIS]>;
def Sparc : Target {
// Pull in Instruction Info:
let InstructionSet = SparcInstrInfo;
+ let AssemblyParsers = [SparcAsmParser];
}
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index d06c894..50506a6 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -14,23 +14,33 @@
#define DEBUG_TYPE "asm-printer"
#include "Sparc.h"
+#include "InstPrinter/SparcInstPrinter.h"
+#include "MCTargetDesc/SparcMCExpr.h"
#include "SparcInstrInfo.h"
#include "SparcTargetMachine.h"
-#include "MCTargetDesc/SparcBaseInfo.h"
+#include "SparcTargetStreamer.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/Mangler.h"
using namespace llvm;
namespace {
class SparcAsmPrinter : public AsmPrinter {
+ SparcTargetStreamer &getTargetStreamer() {
+ return static_cast<SparcTargetStreamer &>(
+ *OutStreamer.getTargetStreamer());
+ }
public:
explicit SparcAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
: AsmPrinter(TM, Streamer) {}
@@ -45,14 +55,12 @@ namespace {
void printCCOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
virtual void EmitFunctionBodyStart();
- virtual void EmitInstruction(const MachineInstr *MI) {
- SmallString<128> Str;
- raw_svector_ostream OS(Str);
- printInstruction(MI, OS);
- OutStreamer.EmitRawText(OS.str());
+ virtual void EmitInstruction(const MachineInstr *MI);
+ virtual void EmitEndOfAsmFile(Module &M);
+
+ static const char *getRegisterName(unsigned RegNo) {
+ return SparcInstPrinter::getRegisterName(RegNo);
}
- void printInstruction(const MachineInstr *MI, raw_ostream &OS);// autogen'd.
- static const char *getRegisterName(unsigned RegNo);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
@@ -61,24 +69,211 @@ namespace {
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &O);
- bool printGetPCX(const MachineInstr *MI, unsigned OpNo, raw_ostream &OS);
-
- virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB)
- const;
- void EmitGlobalRegisterDecl(unsigned reg) {
- SmallString<128> Str;
- raw_svector_ostream OS(Str);
- OS << "\t.register "
- << "%" << StringRef(getRegisterName(reg)).lower()
- << ", "
- << ((reg == SP::G6 || reg == SP::G7)? "#ignore" : "#scratch");
- OutStreamer.EmitRawText(OS.str());
- }
+ void LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
+ const MCSubtargetInfo &STI);
};
} // end of anonymous namespace
-#include "SparcGenAsmWriter.inc"
+static MCOperand createSparcMCOperand(SparcMCExpr::VariantKind Kind,
+ MCSymbol *Sym, MCContext &OutContext) {
+ const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Sym,
+ OutContext);
+ const SparcMCExpr *expr = SparcMCExpr::Create(Kind, MCSym, OutContext);
+ return MCOperand::CreateExpr(expr);
+
+}
+static MCOperand createPCXCallOP(MCSymbol *Label,
+ MCContext &OutContext) {
+ return createSparcMCOperand(SparcMCExpr::VK_Sparc_None, Label, OutContext);
+}
+
+static MCOperand createPCXRelExprOp(SparcMCExpr::VariantKind Kind,
+ MCSymbol *GOTLabel, MCSymbol *StartLabel,
+ MCSymbol *CurLabel,
+ MCContext &OutContext)
+{
+ const MCSymbolRefExpr *GOT = MCSymbolRefExpr::Create(GOTLabel, OutContext);
+ const MCSymbolRefExpr *Start = MCSymbolRefExpr::Create(StartLabel,
+ OutContext);
+ const MCSymbolRefExpr *Cur = MCSymbolRefExpr::Create(CurLabel,
+ OutContext);
+
+ const MCBinaryExpr *Sub = MCBinaryExpr::CreateSub(Cur, Start, OutContext);
+ const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(GOT, Sub, OutContext);
+ const SparcMCExpr *expr = SparcMCExpr::Create(Kind,
+ Add, OutContext);
+ return MCOperand::CreateExpr(expr);
+}
+
+static void EmitCall(MCStreamer &OutStreamer,
+ MCOperand &Callee,
+ const MCSubtargetInfo &STI)
+{
+ MCInst CallInst;
+ CallInst.setOpcode(SP::CALL);
+ CallInst.addOperand(Callee);
+ OutStreamer.EmitInstruction(CallInst, STI);
+}
+
+static void EmitSETHI(MCStreamer &OutStreamer,
+ MCOperand &Imm, MCOperand &RD,
+ const MCSubtargetInfo &STI)
+{
+ MCInst SETHIInst;
+ SETHIInst.setOpcode(SP::SETHIi);
+ SETHIInst.addOperand(RD);
+ SETHIInst.addOperand(Imm);
+ OutStreamer.EmitInstruction(SETHIInst, STI);
+}
+
+static void EmitBinary(MCStreamer &OutStreamer, unsigned Opcode,
+ MCOperand &RS1, MCOperand &Src2, MCOperand &RD,
+ const MCSubtargetInfo &STI)
+{
+ MCInst Inst;
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(RD);
+ Inst.addOperand(RS1);
+ Inst.addOperand(Src2);
+ OutStreamer.EmitInstruction(Inst, STI);
+}
+
+static void EmitOR(MCStreamer &OutStreamer,
+ MCOperand &RS1, MCOperand &Imm, MCOperand &RD,
+ const MCSubtargetInfo &STI) {
+ EmitBinary(OutStreamer, SP::ORri, RS1, Imm, RD, STI);
+}
+
+static void EmitADD(MCStreamer &OutStreamer,
+ MCOperand &RS1, MCOperand &RS2, MCOperand &RD,
+ const MCSubtargetInfo &STI) {
+ EmitBinary(OutStreamer, SP::ADDrr, RS1, RS2, RD, STI);
+}
+
+static void EmitSHL(MCStreamer &OutStreamer,
+ MCOperand &RS1, MCOperand &Imm, MCOperand &RD,
+ const MCSubtargetInfo &STI) {
+ EmitBinary(OutStreamer, SP::SLLri, RS1, Imm, RD, STI);
+}
+
+
+static void EmitHiLo(MCStreamer &OutStreamer, MCSymbol *GOTSym,
+ SparcMCExpr::VariantKind HiKind,
+ SparcMCExpr::VariantKind LoKind,
+ MCOperand &RD,
+ MCContext &OutContext,
+ const MCSubtargetInfo &STI) {
+
+ MCOperand hi = createSparcMCOperand(HiKind, GOTSym, OutContext);
+ MCOperand lo = createSparcMCOperand(LoKind, GOTSym, OutContext);
+ EmitSETHI(OutStreamer, hi, RD, STI);
+ EmitOR(OutStreamer, RD, lo, RD, STI);
+}
+
+void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
+ const MCSubtargetInfo &STI)
+{
+ MCSymbol *GOTLabel =
+ OutContext.GetOrCreateSymbol(Twine("_GLOBAL_OFFSET_TABLE_"));
+
+ const MachineOperand &MO = MI->getOperand(0);
+ assert(MO.getReg() != SP::O7 &&
+ "%o7 is assigned as destination for getpcx!");
+
+ MCOperand MCRegOP = MCOperand::CreateReg(MO.getReg());
+
+
+ if (TM.getRelocationModel() != Reloc::PIC_) {
+ // Just load the address of GOT to MCRegOP.
+ switch(TM.getCodeModel()) {
+ default:
+ llvm_unreachable("Unsupported absolute code model");
+ case CodeModel::Small:
+ EmitHiLo(OutStreamer, GOTLabel,
+ SparcMCExpr::VK_Sparc_HI, SparcMCExpr::VK_Sparc_LO,
+ MCRegOP, OutContext, STI);
+ break;
+ case CodeModel::Medium: {
+ EmitHiLo(OutStreamer, GOTLabel,
+ SparcMCExpr::VK_Sparc_H44, SparcMCExpr::VK_Sparc_M44,
+ MCRegOP, OutContext, STI);
+ MCOperand imm = MCOperand::CreateExpr(MCConstantExpr::Create(12,
+ OutContext));
+ EmitSHL(OutStreamer, MCRegOP, imm, MCRegOP, STI);
+ MCOperand lo = createSparcMCOperand(SparcMCExpr::VK_Sparc_L44,
+ GOTLabel, OutContext);
+ EmitOR(OutStreamer, MCRegOP, lo, MCRegOP, STI);
+ break;
+ }
+ case CodeModel::Large: {
+ EmitHiLo(OutStreamer, GOTLabel,
+ SparcMCExpr::VK_Sparc_HH, SparcMCExpr::VK_Sparc_HM,
+ MCRegOP, OutContext, STI);
+ MCOperand imm = MCOperand::CreateExpr(MCConstantExpr::Create(32,
+ OutContext));
+ EmitSHL(OutStreamer, MCRegOP, imm, MCRegOP, STI);
+ // Use register %o7 to load the lower 32 bits.
+ MCOperand RegO7 = MCOperand::CreateReg(SP::O7);
+ EmitHiLo(OutStreamer, GOTLabel,
+ SparcMCExpr::VK_Sparc_HI, SparcMCExpr::VK_Sparc_LO,
+ RegO7, OutContext, STI);
+ EmitADD(OutStreamer, MCRegOP, RegO7, MCRegOP, STI);
+ }
+ }
+ return;
+ }
+
+ MCSymbol *StartLabel = OutContext.CreateTempSymbol();
+ MCSymbol *EndLabel = OutContext.CreateTempSymbol();
+ MCSymbol *SethiLabel = OutContext.CreateTempSymbol();
+
+ MCOperand RegO7 = MCOperand::CreateReg(SP::O7);
+
+ // <StartLabel>:
+ // call <EndLabel>
+ // <SethiLabel>:
+ // sethi %hi(_GLOBAL_OFFSET_TABLE_+(<SethiLabel>-<StartLabel>)), <MO>
+ // <EndLabel>:
+ // or <MO>, %lo(_GLOBAL_OFFSET_TABLE_+(<EndLabel>-<StartLabel>)), <MO>
+ // add <MO>, %o7, <MO>
+
+ OutStreamer.EmitLabel(StartLabel);
+ MCOperand Callee = createPCXCallOP(EndLabel, OutContext);
+ EmitCall(OutStreamer, Callee, STI);
+ OutStreamer.EmitLabel(SethiLabel);
+ MCOperand hiImm = createPCXRelExprOp(SparcMCExpr::VK_Sparc_PC22,
+ GOTLabel, StartLabel, SethiLabel,
+ OutContext);
+ EmitSETHI(OutStreamer, hiImm, MCRegOP, STI);
+ OutStreamer.EmitLabel(EndLabel);
+ MCOperand loImm = createPCXRelExprOp(SparcMCExpr::VK_Sparc_PC10,
+ GOTLabel, StartLabel, EndLabel,
+ OutContext);
+ EmitOR(OutStreamer, MCRegOP, loImm, MCRegOP, STI);
+ EmitADD(OutStreamer, MCRegOP, RegO7, MCRegOP, STI);
+}
+
+void SparcAsmPrinter::EmitInstruction(const MachineInstr *MI)
+{
+
+ switch (MI->getOpcode()) {
+ default: break;
+ case TargetOpcode::DBG_VALUE:
+ // FIXME: Debug Value.
+ return;
+ case SP::GETPCX:
+ LowerGETPCXAndEmitMCInsts(MI, getSubtargetInfo());
+ return;
+ }
+ MachineBasicBlock::const_instr_iterator I = MI;
+ MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+ do {
+ MCInst TmpInst;
+ LowerSparcMachineInstrToMCInst(I, TmpInst, *this);
+ EmitToStreamer(OutStreamer, TmpInst);
+ } while ((++I != E) && I->isInsideBundle()); // Delay slot check.
+}
void SparcAsmPrinter::EmitFunctionBodyStart() {
if (!TM.getSubtarget<SparcSubtarget>().is64Bit())
@@ -90,89 +285,71 @@ void SparcAsmPrinter::EmitFunctionBodyStart() {
unsigned reg = globalRegs[i];
if (MRI.use_empty(reg))
continue;
- EmitGlobalRegisterDecl(reg);
+
+ if (reg == SP::G6 || reg == SP::G7)
+ getTargetStreamer().emitSparcRegisterIgnore(reg);
+ else
+ getTargetStreamer().emitSparcRegisterScratch(reg);
}
}
void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
raw_ostream &O) {
+ const DataLayout *DL = TM.getDataLayout();
const MachineOperand &MO = MI->getOperand (opNum);
- unsigned TF = MO.getTargetFlags();
+ SparcMCExpr::VariantKind TF = (SparcMCExpr::VariantKind) MO.getTargetFlags();
+
#ifndef NDEBUG
// Verify the target flags.
if (MO.isGlobal() || MO.isSymbol() || MO.isCPI()) {
if (MI->getOpcode() == SP::CALL)
- assert(TF == SPII::MO_NO_FLAG &&
+ assert(TF == SparcMCExpr::VK_Sparc_None &&
"Cannot handle target flags on call address");
- else if (MI->getOpcode() == SP::SETHIi)
- assert((TF == SPII::MO_HI || TF == SPII::MO_H44 || TF == SPII::MO_HH
- || TF == SPII::MO_TLS_GD_HI22
- || TF == SPII::MO_TLS_LDM_HI22
- || TF == SPII::MO_TLS_LDO_HIX22
- || TF == SPII::MO_TLS_IE_HI22
- || TF == SPII::MO_TLS_LE_HIX22) &&
+ else if (MI->getOpcode() == SP::SETHIi || MI->getOpcode() == SP::SETHIXi)
+ assert((TF == SparcMCExpr::VK_Sparc_HI
+ || TF == SparcMCExpr::VK_Sparc_H44
+ || TF == SparcMCExpr::VK_Sparc_HH
+ || TF == SparcMCExpr::VK_Sparc_TLS_GD_HI22
+ || TF == SparcMCExpr::VK_Sparc_TLS_LDM_HI22
+ || TF == SparcMCExpr::VK_Sparc_TLS_LDO_HIX22
+ || TF == SparcMCExpr::VK_Sparc_TLS_IE_HI22
+ || TF == SparcMCExpr::VK_Sparc_TLS_LE_HIX22) &&
"Invalid target flags for address operand on sethi");
else if (MI->getOpcode() == SP::TLS_CALL)
- assert((TF == SPII::MO_NO_FLAG
- || TF == SPII::MO_TLS_GD_CALL
- || TF == SPII::MO_TLS_LDM_CALL) &&
+ assert((TF == SparcMCExpr::VK_Sparc_None
+ || TF == SparcMCExpr::VK_Sparc_TLS_GD_CALL
+ || TF == SparcMCExpr::VK_Sparc_TLS_LDM_CALL) &&
"Cannot handle target flags on tls call address");
else if (MI->getOpcode() == SP::TLS_ADDrr)
- assert((TF == SPII::MO_TLS_GD_ADD || TF == SPII::MO_TLS_LDM_ADD
- || TF == SPII::MO_TLS_LDO_ADD || TF == SPII::MO_TLS_IE_ADD) &&
+ assert((TF == SparcMCExpr::VK_Sparc_TLS_GD_ADD
+ || TF == SparcMCExpr::VK_Sparc_TLS_LDM_ADD
+ || TF == SparcMCExpr::VK_Sparc_TLS_LDO_ADD
+ || TF == SparcMCExpr::VK_Sparc_TLS_IE_ADD) &&
"Cannot handle target flags on add for TLS");
else if (MI->getOpcode() == SP::TLS_LDrr)
- assert(TF == SPII::MO_TLS_IE_LD &&
+ assert(TF == SparcMCExpr::VK_Sparc_TLS_IE_LD &&
"Cannot handle target flags on ld for TLS");
else if (MI->getOpcode() == SP::TLS_LDXrr)
- assert(TF == SPII::MO_TLS_IE_LDX &&
+ assert(TF == SparcMCExpr::VK_Sparc_TLS_IE_LDX &&
"Cannot handle target flags on ldx for TLS");
- else if (MI->getOpcode() == SP::XORri)
- assert((TF == SPII::MO_TLS_LDO_LOX10 || TF == SPII::MO_TLS_LE_LOX10) &&
+ else if (MI->getOpcode() == SP::XORri || MI->getOpcode() == SP::XORXri)
+ assert((TF == SparcMCExpr::VK_Sparc_TLS_LDO_LOX10
+ || TF == SparcMCExpr::VK_Sparc_TLS_LE_LOX10) &&
"Cannot handle target flags on xor for TLS");
else
- assert((TF == SPII::MO_LO || TF == SPII::MO_M44 || TF == SPII::MO_L44
- || TF == SPII::MO_HM
- || TF == SPII::MO_TLS_GD_LO10
- || TF == SPII::MO_TLS_LDM_LO10
- || TF == SPII::MO_TLS_IE_LO10 ) &&
+ assert((TF == SparcMCExpr::VK_Sparc_LO
+ || TF == SparcMCExpr::VK_Sparc_M44
+ || TF == SparcMCExpr::VK_Sparc_L44
+ || TF == SparcMCExpr::VK_Sparc_HM
+ || TF == SparcMCExpr::VK_Sparc_TLS_GD_LO10
+ || TF == SparcMCExpr::VK_Sparc_TLS_LDM_LO10
+ || TF == SparcMCExpr::VK_Sparc_TLS_IE_LO10 ) &&
"Invalid target flags for small address operand");
}
#endif
- bool CloseParen = true;
- switch (TF) {
- default:
- llvm_unreachable("Unknown target flags on operand");
- case SPII::MO_NO_FLAG:
- CloseParen = false;
- break;
- case SPII::MO_LO: O << "%lo("; break;
- case SPII::MO_HI: O << "%hi("; break;
- case SPII::MO_H44: O << "%h44("; break;
- case SPII::MO_M44: O << "%m44("; break;
- case SPII::MO_L44: O << "%l44("; break;
- case SPII::MO_HH: O << "%hh("; break;
- case SPII::MO_HM: O << "%hm("; break;
- case SPII::MO_TLS_GD_HI22: O << "%tgd_hi22("; break;
- case SPII::MO_TLS_GD_LO10: O << "%tgd_lo10("; break;
- case SPII::MO_TLS_GD_ADD: O << "%tgd_add("; break;
- case SPII::MO_TLS_GD_CALL: O << "%tgd_call("; break;
- case SPII::MO_TLS_LDM_HI22: O << "%tldm_hi22("; break;
- case SPII::MO_TLS_LDM_LO10: O << "%tldm_lo10("; break;
- case SPII::MO_TLS_LDM_ADD: O << "%tldm_add("; break;
- case SPII::MO_TLS_LDM_CALL: O << "%tldm_call("; break;
- case SPII::MO_TLS_LDO_HIX22: O << "%tldo_hix22("; break;
- case SPII::MO_TLS_LDO_LOX10: O << "%tldo_lox10("; break;
- case SPII::MO_TLS_LDO_ADD: O << "%tldo_add("; break;
- case SPII::MO_TLS_IE_HI22: O << "%tie_hi22("; break;
- case SPII::MO_TLS_IE_LO10: O << "%tie_lo10("; break;
- case SPII::MO_TLS_IE_LD: O << "%tie_ld("; break;
- case SPII::MO_TLS_IE_LDX: O << "%tie_ldx("; break;
- case SPII::MO_TLS_IE_ADD: O << "%tie_add("; break;
- case SPII::MO_TLS_LE_HIX22: O << "%tle_hix22("; break;
- case SPII::MO_TLS_LE_LOX10: O << "%tle_lox10("; break;
- }
+
+ bool CloseParen = SparcMCExpr::printVariantKind(O, TF);
switch (MO.getType()) {
case MachineOperand::MO_Register:
@@ -195,7 +372,7 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
O << MO.getSymbolName();
break;
case MachineOperand::MO_ConstantPoolIndex:
- O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+ O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
<< MO.getIndex();
break;
default:
@@ -226,46 +403,6 @@ void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
printOperand(MI, opNum+1, O);
}
-bool SparcAsmPrinter::printGetPCX(const MachineInstr *MI, unsigned opNum,
- raw_ostream &O) {
- std::string operand = "";
- const MachineOperand &MO = MI->getOperand(opNum);
- switch (MO.getType()) {
- default: llvm_unreachable("Operand is not a register");
- case MachineOperand::MO_Register:
- assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
- "Operand is not a physical register ");
- assert(MO.getReg() != SP::O7 &&
- "%o7 is assigned as destination for getpcx!");
- operand = "%" + StringRef(getRegisterName(MO.getReg())).lower();
- break;
- }
-
- unsigned mfNum = MI->getParent()->getParent()->getFunctionNumber();
- unsigned bbNum = MI->getParent()->getNumber();
-
- O << '\n' << ".LLGETPCH" << mfNum << '_' << bbNum << ":\n";
- O << "\tcall\t.LLGETPC" << mfNum << '_' << bbNum << '\n' ;
-
- O << "\t sethi\t"
- << "%hi(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << mfNum << '_' << bbNum
- << ")), " << operand << '\n' ;
-
- O << ".LLGETPC" << mfNum << '_' << bbNum << ":\n" ;
- O << "\tor\t" << operand
- << ", %lo(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << mfNum << '_' << bbNum
- << ")), " << operand << '\n';
- O << "\tadd\t" << operand << ", %o7, " << operand << '\n';
-
- return true;
-}
-
-void SparcAsmPrinter::printCCOperand(const MachineInstr *MI, int opNum,
- raw_ostream &O) {
- int CC = (int)MI->getOperand(opNum).getImm();
- O << SPARCCondCodeToString((SPCC::CondCodes)CC);
-}
-
/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -303,35 +440,21 @@ bool SparcAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
-/// isBlockOnlyReachableByFallthough - Return true if the basic block has
-/// exactly one predecessor and the control transfer mechanism between
-/// the predecessor and this block is a fall-through.
-///
-/// This overrides AsmPrinter's implementation to handle delay slots.
-bool SparcAsmPrinter::
-isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
- // If this is a landing pad, it isn't a fall through. If it has no preds,
- // then nothing falls through to it.
- if (MBB->isLandingPad() || MBB->pred_empty())
- return false;
-
- // If there isn't exactly one predecessor, it can't be a fall through.
- MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI;
- ++PI2;
- if (PI2 != MBB->pred_end())
- return false;
-
- // The predecessor has to be immediately before this block.
- const MachineBasicBlock *Pred = *PI;
-
- if (!Pred->isLayoutSuccessor(MBB))
- return false;
-
- // Check if the last terminator is an unconditional branch.
- MachineBasicBlock::const_iterator I = Pred->end();
- while (I != Pred->begin() && !(--I)->isTerminator())
- ; // Noop
- return I == Pred->end() || !I->isBarrier();
+void SparcAsmPrinter::EmitEndOfAsmFile(Module &M) {
+ const TargetLoweringObjectFileELF &TLOFELF =
+ static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+ MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+ // Generate stubs for global variables.
+ MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+ if (!Stubs.empty()) {
+ OutStreamer.SwitchSection(TLOFELF.getDataSection());
+ unsigned PtrSize = TM.getDataLayout()->getPointerSize(0);
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ OutStreamer.EmitLabel(Stubs[i].first);
+ OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), PtrSize);
+ }
+ }
}
// Force static initialization.
diff --git a/lib/Target/Sparc/SparcCallingConv.td b/lib/Target/Sparc/SparcCallingConv.td
index acd4ec2..dfaaabf 100644
--- a/lib/Target/Sparc/SparcCallingConv.td
+++ b/lib/Target/Sparc/SparcCallingConv.td
@@ -103,7 +103,7 @@ def RetCC_Sparc32 : CallingConv<[
// Function return values are passed exactly like function arguments, except a
// struct up to 32 bytes in size can be returned in registers.
-// Function arguments AND return values.
+// Function arguments AND most return values.
def CC_Sparc64 : CallingConv<[
// The frontend uses the inreg flag to indicate i32 and float arguments from
// structs. These arguments are not promoted to 64 bits, but they can still
@@ -118,6 +118,15 @@ def CC_Sparc64 : CallingConv<[
CCCustom<"CC_Sparc64_Full">
]>;
+def RetCC_Sparc64 : CallingConv<[
+ // A single f32 return value always goes in %f0. The ABI doesn't specify what
+ // happens to multiple f32 return values outside a struct.
+ CCIfType<[f32], CCCustom<"CC_Sparc64_Half">>,
+
+ // Otherwise, return values are passed exactly like arguments.
+ CCDelegateTo<CC_Sparc64>
+]>;
+
// Callee-saved registers are handled by the register window mechanism.
def CSR : CalleeSavedRegs<(add)> {
let OtherPreserved = (add (sequence "I%u", 0, 7),
diff --git a/lib/Target/Sparc/SparcCodeEmitter.cpp b/lib/Target/Sparc/SparcCodeEmitter.cpp
index 9bfe31f..4f8d477 100644
--- a/lib/Target/Sparc/SparcCodeEmitter.cpp
+++ b/lib/Target/Sparc/SparcCodeEmitter.cpp
@@ -14,7 +14,7 @@
#define DEBUG_TYPE "jit"
#include "Sparc.h"
-#include "MCTargetDesc/SparcBaseInfo.h"
+#include "MCTargetDesc/SparcMCExpr.h"
#include "SparcRelocations.h"
#include "SparcTargetMachine.h"
#include "llvm/ADT/Statistic.h"
@@ -72,6 +72,15 @@ private:
unsigned getMachineOpValue(const MachineInstr &MI,
const MachineOperand &MO) const;
+ unsigned getCallTargetOpValue(const MachineInstr &MI,
+ unsigned) const;
+ unsigned getBranchTargetOpValue(const MachineInstr &MI,
+ unsigned) const;
+ unsigned getBranchPredTargetOpValue(const MachineInstr &MI,
+ unsigned) const;
+ unsigned getBranchOnRegTargetOpValue(const MachineInstr &MI,
+ unsigned) const;
+
void emitWord(unsigned Word);
unsigned getRelocation(const MachineInstr &MI,
@@ -136,7 +145,8 @@ void SparcCodeEmitter::emitInstruction(MachineBasicBlock::instr_iterator MI,
}
break;
}
- case TargetOpcode::PROLOG_LABEL:
+ case TargetOpcode::CFI_INSTRUCTION:
+ break;
case TargetOpcode::EH_LABEL: {
MCE.emitLabel(MI->getOperand(0).getMCSymbol());
break;
@@ -181,20 +191,44 @@ unsigned SparcCodeEmitter::getMachineOpValue(const MachineInstr &MI,
llvm_unreachable("Unable to encode MachineOperand!");
return 0;
}
+unsigned SparcCodeEmitter::getCallTargetOpValue(const MachineInstr &MI,
+ unsigned opIdx) const {
+ const MachineOperand MO = MI.getOperand(opIdx);
+ return getMachineOpValue(MI, MO);
+}
+
+unsigned SparcCodeEmitter::getBranchTargetOpValue(const MachineInstr &MI,
+ unsigned opIdx) const {
+ const MachineOperand MO = MI.getOperand(opIdx);
+ return getMachineOpValue(MI, MO);
+}
+
+unsigned SparcCodeEmitter::getBranchPredTargetOpValue(const MachineInstr &MI,
+ unsigned opIdx) const {
+ const MachineOperand MO = MI.getOperand(opIdx);
+ return getMachineOpValue(MI, MO);
+}
+
+unsigned SparcCodeEmitter::getBranchOnRegTargetOpValue(const MachineInstr &MI,
+ unsigned opIdx) const {
+ const MachineOperand MO = MI.getOperand(opIdx);
+ return getMachineOpValue(MI, MO);
+}
+
unsigned SparcCodeEmitter::getRelocation(const MachineInstr &MI,
const MachineOperand &MO) const {
unsigned TF = MO.getTargetFlags();
switch (TF) {
default:
- case SPII::MO_NO_FLAG: break;
- case SPII::MO_LO: return SP::reloc_sparc_lo;
- case SPII::MO_HI: return SP::reloc_sparc_hi;
- case SPII::MO_H44:
- case SPII::MO_M44:
- case SPII::MO_L44:
- case SPII::MO_HH:
- case SPII::MO_HM: assert(0 && "FIXME: Implement Medium/Large code model.");
+ case SparcMCExpr::VK_Sparc_None: break;
+ case SparcMCExpr::VK_Sparc_LO: return SP::reloc_sparc_lo;
+ case SparcMCExpr::VK_Sparc_HI: return SP::reloc_sparc_hi;
+ case SparcMCExpr::VK_Sparc_H44: return SP::reloc_sparc_h44;
+ case SparcMCExpr::VK_Sparc_M44: return SP::reloc_sparc_m44;
+ case SparcMCExpr::VK_Sparc_L44: return SP::reloc_sparc_l44;
+ case SparcMCExpr::VK_Sparc_HH: return SP::reloc_sparc_hh;
+ case SparcMCExpr::VK_Sparc_HM: return SP::reloc_sparc_hm;
}
unsigned Opc = MI.getOpcode();
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index c75998a..d96a4c0 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -104,23 +104,23 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
- MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl, TII.get(SP::PROLOG_LABEL)).addSym(FrameLabel);
-
unsigned regFP = MRI->getDwarfRegNum(SP::I6, true);
// Emit ".cfi_def_cfa_register 30".
- MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(FrameLabel,
- regFP));
+ unsigned CFIIndex =
+ MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, regFP));
+ BuildMI(MBB, MBBI, dl, TII.get(SP::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+
// Emit ".cfi_window_save".
- MMI.addFrameInst(MCCFIInstruction::createWindowSave(FrameLabel));
+ CFIIndex = MMI.addFrameInst(MCCFIInstruction::createWindowSave(nullptr));
+ BuildMI(MBB, MBBI, dl, TII.get(SP::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
unsigned regInRA = MRI->getDwarfRegNum(SP::I7, true);
unsigned regOutRA = MRI->getDwarfRegNum(SP::O7, true);
// Emit ".cfi_register 15, 31".
- MMI.addFrameInst(MCCFIInstruction::createRegister(FrameLabel,
- regOutRA,
- regInRA));
+ CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA));
+ BuildMI(MBB, MBBI, dl, TII.get(SP::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
}
void SparcFrameLowering::
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 64625f7..8e720ee 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -13,10 +13,11 @@
//===----------------------------------------------------------------------===//
#include "SparcISelLowering.h"
+#include "MCTargetDesc/SparcMCExpr.h"
#include "SparcMachineFunctionInfo.h"
#include "SparcRegisterInfo.h"
#include "SparcTargetMachine.h"
-#include "MCTargetDesc/SparcBaseInfo.h"
+#include "SparcTargetObjectFile.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -80,11 +81,14 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT,
static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT,
MVT &LocVT, CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags, CCState &State) {
- assert((LocVT == MVT::f32 || LocVT.getSizeInBits() == 64) &&
+ assert((LocVT == MVT::f32 || LocVT == MVT::f128
+ || LocVT.getSizeInBits() == 64) &&
"Can't handle non-64 bits locations");
// Stack space is allocated for all arguments starting from [%fp+BIAS+128].
- unsigned Offset = State.AllocateStack(8, 8);
+ unsigned size = (LocVT == MVT::f128) ? 16 : 8;
+ unsigned alignment = (LocVT == MVT::f128) ? 16 : 8;
+ unsigned Offset = State.AllocateStack(size, alignment);
unsigned Reg = 0;
if (LocVT == MVT::i64 && Offset < 6*8)
@@ -96,6 +100,9 @@ static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT,
else if (LocVT == MVT::f32 && Offset < 16*8)
// Promote floats to %f1, %f3, ...
Reg = SP::F1 + Offset/4;
+ else if (LocVT == MVT::f128 && Offset < 16*8)
+ // Promote long doubles to %q0-%q28. (Which LLVM calls Q0-Q7).
+ Reg = SP::Q0 + Offset/16;
// Promote to register when possible, otherwise use the stack slot.
if (Reg) {
@@ -248,7 +255,7 @@ SparcTargetLowering::LowerReturn_64(SDValue Chain,
DAG.getTarget(), RVLocs, *DAG.getContext());
// Analyze return values.
- CCInfo.AnalyzeReturn(Outs, CC_Sparc64);
+ CCInfo.AnalyzeReturn(Outs, RetCC_Sparc64);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -265,6 +272,7 @@ SparcTargetLowering::LowerReturn_64(SDValue Chain,
// Integer return values must be sign or zero extended by the callee.
switch (VA.getLocInfo()) {
+ case CCValAssign::Full: break;
case CCValAssign::SExt:
OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
break;
@@ -273,8 +281,9 @@ SparcTargetLowering::LowerReturn_64(SDValue Chain,
break;
case CCValAssign::AExt:
OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
- default:
break;
+ default:
+ llvm_unreachable("Unknown loc info!");
}
// The custom bit on an i32 return value indicates that it should be passed
@@ -888,10 +897,12 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
// If the callee is a GlobalAddress node (quite common, every direct call is)
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
// Likewise ExternalSymbol -> TargetExternalSymbol.
+ unsigned TF = ((getTargetMachine().getRelocationModel() == Reloc::PIC_)
+ ? SparcMCExpr::VK_Sparc_WPLT30 : 0);
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
- Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32);
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32, 0, TF);
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
- Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32, TF);
// Returns a chain & a flag for retval copy to use
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
@@ -998,9 +1009,10 @@ static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs,
ArrayRef<ISD::OutputArg> Outs) {
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
const CCValAssign &VA = ArgLocs[i];
+ MVT ValTy = VA.getLocVT();
// FIXME: What about f32 arguments? C promotes them to f64 when calling
// varargs functions.
- if (!VA.isRegLoc() || VA.getLocVT() != MVT::f64)
+ if (!VA.isRegLoc() || (ValTy != MVT::f64 && ValTy != MVT::f128))
continue;
// The fixed arguments to a varargs function still go in FP registers.
if (Outs[VA.getValNo()].IsFixed)
@@ -1010,15 +1022,25 @@ static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs,
CCValAssign NewVA;
// Determine the offset into the argument array.
- unsigned Offset = 8 * (VA.getLocReg() - SP::D0);
+ unsigned firstReg = (ValTy == MVT::f64) ? SP::D0 : SP::Q0;
+ unsigned argSize = (ValTy == MVT::f64) ? 8 : 16;
+ unsigned Offset = argSize * (VA.getLocReg() - firstReg);
assert(Offset < 16*8 && "Offset out of range, bad register enum?");
if (Offset < 6*8) {
// This argument should go in %i0-%i5.
unsigned IReg = SP::I0 + Offset/8;
- // Full register, just bitconvert into i64.
- NewVA = CCValAssign::getReg(VA.getValNo(), VA.getValVT(),
- IReg, MVT::i64, CCValAssign::BCvt);
+ if (ValTy == MVT::f64)
+ // Full register, just bitconvert into i64.
+ NewVA = CCValAssign::getReg(VA.getValNo(), VA.getValVT(),
+ IReg, MVT::i64, CCValAssign::BCvt);
+ else {
+ assert(ValTy == MVT::f128 && "Unexpected type!");
+ // Full register, just bitconvert into i128 -- We will lower this into
+ // two i64s in LowerCall_64.
+ NewVA = CCValAssign::getCustomReg(VA.getValNo(), VA.getValVT(),
+ IReg, MVT::i128, CCValAssign::BCvt);
+ }
} else {
// This needs to go to memory, we're out of integer registers.
NewVA = CCValAssign::getMem(VA.getValNo(), VA.getValVT(),
@@ -1094,11 +1116,46 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::BCvt:
- Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ // fixupVariableFloatArgs() may create bitcasts from f128 to i128. But
+ // SPARC does not support i128 natively. Lower it into two i64, see below.
+ if (!VA.needsCustom() || VA.getValVT() != MVT::f128
+ || VA.getLocVT() != MVT::i128)
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
}
if (VA.isRegLoc()) {
+ if (VA.needsCustom() && VA.getValVT() == MVT::f128
+ && VA.getLocVT() == MVT::i128) {
+ // Store and reload into the integer registers reg and reg+1.
+ unsigned Offset = 8 * (VA.getLocReg() - SP::I0);
+ unsigned StackOffset = Offset + Subtarget->getStackPointerBias() + 128;
+ SDValue StackPtr = DAG.getRegister(SP::O6, getPointerTy());
+ SDValue HiPtrOff = DAG.getIntPtrConstant(StackOffset);
+ HiPtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr,
+ HiPtrOff);
+ SDValue LoPtrOff = DAG.getIntPtrConstant(StackOffset + 8);
+ LoPtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr,
+ LoPtrOff);
+
+ // Store to %sp+BIAS+128+Offset
+ SDValue Store = DAG.getStore(Chain, DL, Arg, HiPtrOff,
+ MachinePointerInfo(),
+ false, false, 0);
+ // Load into Reg and Reg+1
+ SDValue Hi64 = DAG.getLoad(MVT::i64, DL, Store, HiPtrOff,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ SDValue Lo64 = DAG.getLoad(MVT::i64, DL, Store, LoPtrOff,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()),
+ Hi64));
+ RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()+1),
+ Lo64));
+ continue;
+ }
+
// The custom bit on an i32 return value indicates that it should be
// passed in the high bits of the register.
if (VA.getValVT() == MVT::i32 && VA.needsCustom()) {
@@ -1156,10 +1213,13 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// Likewise ExternalSymbol -> TargetExternalSymbol.
SDValue Callee = CLI.Callee;
bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
+ unsigned TF = ((getTargetMachine().getRelocationModel() == Reloc::PIC_)
+ ? SparcMCExpr::VK_Sparc_WPLT30 : 0);
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
- Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy());
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0,
+ TF);
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
- Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy());
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy(), TF);
// Build the operands for the call instruction itself.
SmallVector<SDValue, 8> Ops;
@@ -1200,7 +1260,13 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
SmallVector<CCValAssign, 16> RVLocs;
CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
DAG.getTarget(), RVLocs, *DAG.getContext());
- RVInfo.AnalyzeCallResult(CLI.Ins, CC_Sparc64);
+
+ // Set inreg flag manually for codegen generated library calls that
+ // return float.
+ if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && CLI.CS == 0)
+ CLI.Ins[0].Flags.setInReg();
+
+ RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_Sparc64);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -1303,7 +1369,7 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
}
SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
- : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+ : TargetLowering(TM, new SparcELFTargetObjectFile()) {
Subtarget = &TM.getSubtarget<SparcSubtarget>();
// Set up the register classes.
@@ -1403,7 +1469,8 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
- setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+ setOperationAction(ISD::CTPOP, MVT::i64,
+ Subtarget->usePopc() ? Legal : Expand);
setOperationAction(ISD::CTTZ , MVT::i64, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
setOperationAction(ISD::CTLZ , MVT::i64, Expand);
@@ -1414,9 +1481,29 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
}
- // FIXME: There are instructions available for ATOMIC_FENCE
- // on SparcV8 and later.
- setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
+ // ATOMICs.
+  // FIXME: We insert fences around each atomic operation and generate
+  // sub-optimal code for PSO/TSO. Also, implement other atomicrmw operations.
+
+ setInsertFencesForAtomic(true);
+
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Legal);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32,
+ (Subtarget->isV9() ? Legal: Expand));
+
+
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Legal);
+
+ // Custom Lower Atomic LOAD/STORE
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Legal);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Legal);
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Custom);
+ }
if (!Subtarget->isV9()) {
// SparcV8 does not have FNEGD and FABSD.
@@ -1439,7 +1526,6 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FREM , MVT::f32, Expand);
setOperationAction(ISD::FMA , MVT::f32, Expand);
- setOperationAction(ISD::CTPOP, MVT::i32, Expand);
setOperationAction(ISD::CTTZ , MVT::i32, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
setOperationAction(ISD::CTLZ , MVT::i32, Expand);
@@ -1467,6 +1553,13 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::MULHU, MVT::i64, Expand);
setOperationAction(ISD::MULHS, MVT::i64, Expand);
+
+ setOperationAction(ISD::UMULO, MVT::i64, Custom);
+ setOperationAction(ISD::SMULO, MVT::i64, Custom);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
}
// VASTART needs to be custom lowered to use the VarArgsFrameIndex.
@@ -1474,6 +1567,8 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
// VAARG needs to be lowered to not do unaligned accesses for doubles.
setOperationAction(ISD::VAARG , MVT::Other, Custom);
+ setOperationAction(ISD::TRAP , MVT::Other, Legal);
+
// Use the default implementation.
setOperationAction(ISD::VACOPY , MVT::Other, Expand);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
@@ -1486,8 +1581,8 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setStackPointerRegisterToSaveRestore(SP::O6);
- if (Subtarget->isV9())
- setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+ setOperationAction(ISD::CTPOP, MVT::i32,
+ Subtarget->usePopc() ? Legal : Expand);
if (Subtarget->isV9() && Subtarget->hasHardQuad()) {
setOperationAction(ISD::LOAD, MVT::f128, Legal);
@@ -1714,7 +1809,8 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
// Handle PIC mode first.
if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
// This is the pic32 code model, the GOT is known to be smaller than 4GB.
- SDValue HiLo = makeHiLoPair(Op, SPII::MO_HI, SPII::MO_LO, DAG);
+ SDValue HiLo = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_GOT22,
+ SparcMCExpr::VK_Sparc_GOT10, DAG);
SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, VT);
SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, VT, GlobalBase, HiLo);
// GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
@@ -1729,23 +1825,26 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
switch(getTargetMachine().getCodeModel()) {
default:
llvm_unreachable("Unsupported absolute code model");
- case CodeModel::JITDefault:
case CodeModel::Small:
// abs32.
- return makeHiLoPair(Op, SPII::MO_HI, SPII::MO_LO, DAG);
+ return makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HI,
+ SparcMCExpr::VK_Sparc_LO, DAG);
case CodeModel::Medium: {
// abs44.
- SDValue H44 = makeHiLoPair(Op, SPII::MO_H44, SPII::MO_M44, DAG);
+ SDValue H44 = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_H44,
+ SparcMCExpr::VK_Sparc_M44, DAG);
H44 = DAG.getNode(ISD::SHL, DL, VT, H44, DAG.getConstant(12, MVT::i32));
- SDValue L44 = withTargetFlags(Op, SPII::MO_L44, DAG);
+ SDValue L44 = withTargetFlags(Op, SparcMCExpr::VK_Sparc_L44, DAG);
L44 = DAG.getNode(SPISD::Lo, DL, VT, L44);
return DAG.getNode(ISD::ADD, DL, VT, H44, L44);
}
case CodeModel::Large: {
// abs64.
- SDValue Hi = makeHiLoPair(Op, SPII::MO_HH, SPII::MO_HM, DAG);
+ SDValue Hi = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HH,
+ SparcMCExpr::VK_Sparc_HM, DAG);
Hi = DAG.getNode(ISD::SHL, DL, VT, Hi, DAG.getConstant(32, MVT::i32));
- SDValue Lo = makeHiLoPair(Op, SPII::MO_HI, SPII::MO_LO, DAG);
+ SDValue Lo = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HI,
+ SparcMCExpr::VK_Sparc_LO, DAG);
return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
}
}
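For reference, the medium (abs44) code model above composes the address from three relocations: %h44 (bits 43-22, via sethi), %m44 (bits 21-12) and %l44 (bits 11-0), pairing the first two with makeHiLoPair and shifting before adding the low part. A small C++ sketch of that composition, assuming a 44-bit address value (illustrative only, not LLVM code; the function name is made up for this sketch):

    #include <cstdint>

    // abs44: addr = (((%h44 << 10) | %m44) << 12) + %l44
    inline uint64_t composeAbs44(uint64_t Sym) {
      uint64_t H44 = (Sym >> 22) & 0x3FFFFF;  // bits [43:22], via sethi
      uint64_t M44 = (Sym >> 12) & 0x3FF;     // bits [21:12], or'd into the low bits
      uint64_t L44 = Sym & 0xFFF;             // bits [11:0], added after the shift
      return (((H44 << 10) | M44) << 12) + L44;
    }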
@@ -1777,14 +1876,18 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
TLSModel::Model model = getTargetMachine().getTLSModel(GV);
if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
- unsigned HiTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_HI22
- : SPII::MO_TLS_LDM_HI22);
- unsigned LoTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_LO10
- : SPII::MO_TLS_LDM_LO10);
- unsigned addTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_ADD
- : SPII::MO_TLS_LDM_ADD);
- unsigned callTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_CALL
- : SPII::MO_TLS_LDM_CALL);
+ unsigned HiTF = ((model == TLSModel::GeneralDynamic)
+ ? SparcMCExpr::VK_Sparc_TLS_GD_HI22
+ : SparcMCExpr::VK_Sparc_TLS_LDM_HI22);
+ unsigned LoTF = ((model == TLSModel::GeneralDynamic)
+ ? SparcMCExpr::VK_Sparc_TLS_GD_LO10
+ : SparcMCExpr::VK_Sparc_TLS_LDM_LO10);
+ unsigned addTF = ((model == TLSModel::GeneralDynamic)
+ ? SparcMCExpr::VK_Sparc_TLS_GD_ADD
+ : SparcMCExpr::VK_Sparc_TLS_LDM_ADD);
+ unsigned callTF = ((model == TLSModel::GeneralDynamic)
+ ? SparcMCExpr::VK_Sparc_TLS_GD_CALL
+ : SparcMCExpr::VK_Sparc_TLS_LDM_CALL);
SDValue HiLo = makeHiLoPair(Op, HiTF, LoTF, DAG);
SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
@@ -1822,17 +1925,17 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
return Ret;
SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
- withTargetFlags(Op, SPII::MO_TLS_LDO_HIX22, DAG));
+ withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_HIX22, DAG));
SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
- withTargetFlags(Op, SPII::MO_TLS_LDO_LOX10, DAG));
+ withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_LOX10, DAG));
HiLo = DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Ret, HiLo,
- withTargetFlags(Op, SPII::MO_TLS_LDO_ADD, DAG));
+ withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_ADD, DAG));
}
if (model == TLSModel::InitialExec) {
- unsigned ldTF = ((PtrVT == MVT::i64)? SPII::MO_TLS_IE_LDX
- : SPII::MO_TLS_IE_LD);
+ unsigned ldTF = ((PtrVT == MVT::i64)? SparcMCExpr::VK_Sparc_TLS_IE_LDX
+ : SparcMCExpr::VK_Sparc_TLS_IE_LD);
SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
@@ -1842,21 +1945,23 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
MFI->setHasCalls(true);
SDValue TGA = makeHiLoPair(Op,
- SPII::MO_TLS_IE_HI22, SPII::MO_TLS_IE_LO10, DAG);
+ SparcMCExpr::VK_Sparc_TLS_IE_HI22,
+ SparcMCExpr::VK_Sparc_TLS_IE_LO10, DAG);
SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, TGA);
SDValue Offset = DAG.getNode(SPISD::TLS_LD,
DL, PtrVT, Ptr,
withTargetFlags(Op, ldTF, DAG));
return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT,
DAG.getRegister(SP::G7, PtrVT), Offset,
- withTargetFlags(Op, SPII::MO_TLS_IE_ADD, DAG));
+ withTargetFlags(Op,
+ SparcMCExpr::VK_Sparc_TLS_IE_ADD, DAG));
}
assert(model == TLSModel::LocalExec);
SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
- withTargetFlags(Op, SPII::MO_TLS_LE_HIX22, DAG));
+ withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LE_HIX22, DAG));
SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
- withTargetFlags(Op, SPII::MO_TLS_LE_LOX10, DAG));
+ withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LE_LOX10, DAG));
SDValue Offset = DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
return DAG.getNode(ISD::ADD, DL, PtrVT,
@@ -2334,43 +2439,64 @@ static SDValue getFLUSHW(SDValue Op, SelectionDAG &DAG) {
return Chain;
}
-static SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+static SDValue getFRAMEADDR(uint64_t depth, SDValue Op, SelectionDAG &DAG,
+ const SparcSubtarget *Subtarget) {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
MFI->setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
SDLoc dl(Op);
unsigned FrameReg = SP::I6;
-
- uint64_t depth = Op.getConstantOperandVal(0);
+ unsigned stackBias = Subtarget->getStackPointerBias();
SDValue FrameAddr;
- if (depth == 0)
+
+ if (depth == 0) {
FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
- else {
- // flush first to make sure the windowed registers' values are in stack
- SDValue Chain = getFLUSHW(Op, DAG);
- FrameAddr = DAG.getCopyFromReg(Chain, dl, FrameReg, VT);
-
- for (uint64_t i = 0; i != depth; ++i) {
- SDValue Ptr = DAG.getNode(ISD::ADD,
- dl, MVT::i32,
- FrameAddr, DAG.getIntPtrConstant(56));
- FrameAddr = DAG.getLoad(MVT::i32, dl,
- Chain,
- Ptr,
- MachinePointerInfo(), false, false, false, 0);
- }
+ if (Subtarget->is64Bit())
+ FrameAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
+ DAG.getIntPtrConstant(stackBias));
+ return FrameAddr;
}
+
+  // Flush first to make sure the windowed registers' values are on the stack.
+ SDValue Chain = getFLUSHW(Op, DAG);
+ FrameAddr = DAG.getCopyFromReg(Chain, dl, FrameReg, VT);
+
+ unsigned Offset = (Subtarget->is64Bit()) ? (stackBias + 112) : 56;
+
+ while (depth--) {
+ SDValue Ptr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
+ DAG.getIntPtrConstant(Offset));
+ FrameAddr = DAG.getLoad(VT, dl, Chain, Ptr, MachinePointerInfo(),
+ false, false, false, 0);
+ }
+ if (Subtarget->is64Bit())
+ FrameAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
+ DAG.getIntPtrConstant(stackBias));
return FrameAddr;
}
+
+static SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
+ const SparcSubtarget *Subtarget) {
+
+ uint64_t depth = Op.getConstantOperandVal(0);
+
+ return getFRAMEADDR(depth, Op, DAG, Subtarget);
+
+}
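The depth loop above chases the saved frame pointer stored at a fixed offset inside each register-window save area: 56 bytes on 32-bit SPARC, stack-bias + 112 on 64-bit. A minimal sketch of the same walk in C++ (illustrative only; the helper name and the raw-pointer cast are assumptions of this sketch, not LLVM code):

    #include <cstdint>

    // Follow the chain of saved frame pointers Depth times. Offset is the
    // position of the saved %fp inside a frame, as chosen in the lowering above.
    inline uintptr_t walkFrames(uintptr_t FrameAddr, uint64_t Depth,
                                unsigned Offset) {
      while (Depth--)
        FrameAddr = *reinterpret_cast<const uintptr_t *>(FrameAddr + Offset);
      return FrameAddr;
    }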
+
static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
- const SparcTargetLowering &TLI) {
+ const SparcTargetLowering &TLI,
+ const SparcSubtarget *Subtarget) {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MFI->setReturnAddressIsTaken(true);
+ if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
EVT VT = Op.getValueType();
SDLoc dl(Op);
uint64_t depth = Op.getConstantOperandVal(0);
@@ -2380,25 +2506,20 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
unsigned RetReg = MF.addLiveIn(SP::I7,
TLI.getRegClassFor(TLI.getPointerTy()));
RetAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, RetReg, VT);
- } else {
- // Need frame address to find return address of the caller.
- MFI->setFrameAddressIsTaken(true);
-
- // flush first to make sure the windowed registers' values are in stack
- SDValue Chain = getFLUSHW(Op, DAG);
- RetAddr = DAG.getCopyFromReg(Chain, dl, SP::I6, VT);
-
- for (uint64_t i = 0; i != depth; ++i) {
- SDValue Ptr = DAG.getNode(ISD::ADD,
- dl, MVT::i32,
- RetAddr,
- DAG.getIntPtrConstant((i == depth-1)?60:56));
- RetAddr = DAG.getLoad(MVT::i32, dl,
- Chain,
- Ptr,
- MachinePointerInfo(), false, false, false, 0);
- }
+ return RetAddr;
}
+
+ // Need frame address to find return address of the caller.
+ SDValue FrameAddr = getFRAMEADDR(depth - 1, Op, DAG, Subtarget);
+
+ unsigned Offset = (Subtarget->is64Bit()) ? 120 : 60;
+ SDValue Ptr = DAG.getNode(ISD::ADD,
+ dl, VT,
+ FrameAddr,
+ DAG.getIntPtrConstant(Offset));
+ RetAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), Ptr,
+ MachinePointerInfo(), false, false, false, 0);
+
return RetAddr;
}
@@ -2527,24 +2648,17 @@ static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
&OutChains[0], 2);
}
-static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG,
- const SparcTargetLowering &TLI,
- bool is64Bit) {
- if (Op.getValueType() == MVT::f64)
- return LowerF64Op(Op, DAG, ISD::FNEG);
- if (Op.getValueType() == MVT::f128)
- return TLI.LowerF128Op(Op, DAG, ((is64Bit) ? "_Qp_neg" : "_Q_neg"), 1);
- return Op;
-}
+static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
+ assert((Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS)
+ && "invalid opcode");
-static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
if (Op.getValueType() == MVT::f64)
- return LowerF64Op(Op, DAG, ISD::FABS);
+ return LowerF64Op(Op, DAG, Op.getOpcode());
if (Op.getValueType() != MVT::f128)
return Op;
- // Lower fabs on f128 to fabs on f64
- // fabs f128 => fabs f64:sub_even64, fmov f64:sub_odd64
+ // Lower fabs/fneg on f128 to fabs/fneg on f64
+ // fabs/fneg f128 => fabs/fneg f64:sub_even64, fmov f64:sub_odd64
SDLoc dl(Op);
SDValue SrcReg128 = Op.getOperand(0);
@@ -2555,7 +2669,7 @@ static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
if (isV9)
Hi64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Hi64);
else
- Hi64 = LowerF64Op(Hi64, DAG, ISD::FABS);
+ Hi64 = LowerF64Op(Hi64, DAG, Op.getOpcode());
SDValue DstReg128 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
dl, MVT::f128), 0);
@@ -2615,18 +2729,76 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
return DAG.getMergeValues(Ops, 2, dl);
}
+// Custom lower UMULO/SMULO for SPARC. This code is similar to ExpandNode()
+// in LegalizeDAG.cpp, except for the order of arguments to the library function.
+static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI)
+{
+ unsigned opcode = Op.getOpcode();
+ assert((opcode == ISD::UMULO || opcode == ISD::SMULO) && "Invalid Opcode.");
+
+ bool isSigned = (opcode == ISD::SMULO);
+ EVT VT = MVT::i64;
+ EVT WideVT = MVT::i128;
+ SDLoc dl(Op);
+ SDValue LHS = Op.getOperand(0);
+
+ if (LHS.getValueType() != VT)
+ return Op;
+
+ SDValue ShiftAmt = DAG.getConstant(63, VT);
+
+ SDValue RHS = Op.getOperand(1);
+ SDValue HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, ShiftAmt);
+ SDValue HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt);
+ SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
+
+ SDValue MulResult = TLI.makeLibCall(DAG,
+ RTLIB::MUL_I128, WideVT,
+ Args, 4, isSigned, dl).first;
+ SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
+ MulResult, DAG.getIntPtrConstant(0));
+ SDValue TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
+ MulResult, DAG.getIntPtrConstant(1));
+ if (isSigned) {
+ SDValue Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, ShiftAmt);
+ TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, Tmp1, ISD::SETNE);
+ } else {
+ TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, DAG.getConstant(0, VT),
+ ISD::SETNE);
+ }
+ // MulResult is a node with an illegal type. Because such things are not
+ // generally permitted during this phase of legalization, delete the
+ // node. The above EXTRACT_ELEMENT nodes should have been folded.
+ DAG.DeleteNode(MulResult.getNode());
+
+ SDValue Ops[2] = { BottomHalf, TopHalf } ;
+ return DAG.getMergeValues(Ops, 2, dl);
+}
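The lowering above detects 64-bit multiply overflow by forming the full 128-bit product through the MUL_I128 libcall and then inspecting the top half: for an unsigned multiply it must be zero, for a signed multiply it must equal the sign extension of the bottom half. A compact C++ equivalent, assuming a compiler that provides the __int128 extension (GCC/Clang); the function names are illustrative, not part of the patch:

    #include <cstdint>

    // Signed case: overflow iff the high 64 bits are not the sign extension
    // of the low 64 bits (TopHalf != BottomHalf >> 63 in the code above).
    inline bool smulOverflows(int64_t A, int64_t B, int64_t &Lo) {
      __int128 Wide = (__int128)A * (__int128)B;
      Lo = (int64_t)Wide;                      // BottomHalf
      int64_t Hi = (int64_t)(Wide >> 64);      // TopHalf
      return Hi != (Lo >> 63);
    }

    // Unsigned case: overflow iff the high 64 bits are non-zero.
    inline bool umulOverflows(uint64_t A, uint64_t B, uint64_t &Lo) {
      unsigned __int128 Wide = (unsigned __int128)A * (unsigned __int128)B;
      Lo = (uint64_t)Wide;
      return (uint64_t)(Wide >> 64) != 0;
    }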
+
+static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
+ // Monotonic load/stores are legal.
+ if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
+ return Op;
+
+ // Otherwise, expand with a fence.
+ return SDValue();
+}
+
+
SDValue SparcTargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) const {
bool hasHardQuad = Subtarget->hasHardQuad();
- bool is64Bit = Subtarget->is64Bit();
bool isV9 = Subtarget->isV9();
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
- case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG, *this);
- case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG, *this,
+ Subtarget);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG,
+ Subtarget);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
@@ -2660,42 +2832,91 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
getLibcallName(RTLIB::DIV_F128), 2);
case ISD::FSQRT: return LowerF128Op(Op, DAG,
getLibcallName(RTLIB::SQRT_F128),1);
- case ISD::FNEG: return LowerFNEG(Op, DAG, *this, is64Bit);
- case ISD::FABS: return LowerFABS(Op, DAG, isV9);
+ case ISD::FABS:
+ case ISD::FNEG: return LowerFNEGorFABS(Op, DAG, isV9);
case ISD::FP_EXTEND: return LowerF128_FPEXTEND(Op, DAG, *this);
case ISD::FP_ROUND: return LowerF128_FPROUND(Op, DAG, *this);
case ISD::ADDC:
case ISD::ADDE:
case ISD::SUBC:
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+ case ISD::UMULO:
+ case ISD::SMULO: return LowerUMULO_SMULO(Op, DAG, *this);
+ case ISD::ATOMIC_LOAD:
+ case ISD::ATOMIC_STORE: return LowerATOMIC_LOAD_STORE(Op, DAG);
}
}
MachineBasicBlock *
SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
- unsigned BROpcode;
- unsigned CC;
- DebugLoc dl = MI->getDebugLoc();
- // Figure out the conditional branch opcode to use for this select_cc.
switch (MI->getOpcode()) {
default: llvm_unreachable("Unknown SELECT_CC!");
case SP::SELECT_CC_Int_ICC:
case SP::SELECT_CC_FP_ICC:
case SP::SELECT_CC_DFP_ICC:
case SP::SELECT_CC_QFP_ICC:
- BROpcode = SP::BCOND;
- break;
+ return expandSelectCC(MI, BB, SP::BCOND);
case SP::SELECT_CC_Int_FCC:
case SP::SELECT_CC_FP_FCC:
case SP::SELECT_CC_DFP_FCC:
case SP::SELECT_CC_QFP_FCC:
- BROpcode = SP::FBCOND;
- break;
+ return expandSelectCC(MI, BB, SP::FBCOND);
+
+ case SP::ATOMIC_LOAD_ADD_32:
+ return expandAtomicRMW(MI, BB, SP::ADDrr);
+ case SP::ATOMIC_LOAD_ADD_64:
+ return expandAtomicRMW(MI, BB, SP::ADDXrr);
+ case SP::ATOMIC_LOAD_SUB_32:
+ return expandAtomicRMW(MI, BB, SP::SUBrr);
+ case SP::ATOMIC_LOAD_SUB_64:
+ return expandAtomicRMW(MI, BB, SP::SUBXrr);
+ case SP::ATOMIC_LOAD_AND_32:
+ return expandAtomicRMW(MI, BB, SP::ANDrr);
+ case SP::ATOMIC_LOAD_AND_64:
+ return expandAtomicRMW(MI, BB, SP::ANDXrr);
+ case SP::ATOMIC_LOAD_OR_32:
+ return expandAtomicRMW(MI, BB, SP::ORrr);
+ case SP::ATOMIC_LOAD_OR_64:
+ return expandAtomicRMW(MI, BB, SP::ORXrr);
+ case SP::ATOMIC_LOAD_XOR_32:
+ return expandAtomicRMW(MI, BB, SP::XORrr);
+ case SP::ATOMIC_LOAD_XOR_64:
+ return expandAtomicRMW(MI, BB, SP::XORXrr);
+ case SP::ATOMIC_LOAD_NAND_32:
+ return expandAtomicRMW(MI, BB, SP::ANDrr);
+ case SP::ATOMIC_LOAD_NAND_64:
+ return expandAtomicRMW(MI, BB, SP::ANDXrr);
+
+ case SP::ATOMIC_SWAP_64:
+ return expandAtomicRMW(MI, BB, 0);
+
+ case SP::ATOMIC_LOAD_MAX_32:
+ return expandAtomicRMW(MI, BB, SP::MOVICCrr, SPCC::ICC_G);
+ case SP::ATOMIC_LOAD_MAX_64:
+ return expandAtomicRMW(MI, BB, SP::MOVXCCrr, SPCC::ICC_G);
+ case SP::ATOMIC_LOAD_MIN_32:
+ return expandAtomicRMW(MI, BB, SP::MOVICCrr, SPCC::ICC_LE);
+ case SP::ATOMIC_LOAD_MIN_64:
+ return expandAtomicRMW(MI, BB, SP::MOVXCCrr, SPCC::ICC_LE);
+ case SP::ATOMIC_LOAD_UMAX_32:
+ return expandAtomicRMW(MI, BB, SP::MOVICCrr, SPCC::ICC_GU);
+ case SP::ATOMIC_LOAD_UMAX_64:
+ return expandAtomicRMW(MI, BB, SP::MOVXCCrr, SPCC::ICC_GU);
+ case SP::ATOMIC_LOAD_UMIN_32:
+ return expandAtomicRMW(MI, BB, SP::MOVICCrr, SPCC::ICC_LEU);
+ case SP::ATOMIC_LOAD_UMIN_64:
+ return expandAtomicRMW(MI, BB, SP::MOVXCCrr, SPCC::ICC_LEU);
}
+}
- CC = (SPCC::CondCodes)MI->getOperand(3).getImm();
+MachineBasicBlock*
+SparcTargetLowering::expandSelectCC(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned BROpcode) const {
+ const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+ DebugLoc dl = MI->getDebugLoc();
+ unsigned CC = (SPCC::CondCodes)MI->getOperand(3).getImm();
// To "insert" a SELECT_CC instruction, we actually have to insert the diamond
// control-flow pattern. The incoming instruction knows the destination vreg
@@ -2719,7 +2940,7 @@ SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
+ std::next(MachineBasicBlock::iterator(MI)),
BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
@@ -2749,6 +2970,101 @@ SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
return BB;
}
+MachineBasicBlock*
+SparcTargetLowering::expandAtomicRMW(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ unsigned Opcode,
+ unsigned CondCode) const {
+ const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ // MI is an atomic read-modify-write instruction of the form:
+ //
+ // rd = atomicrmw<op> addr, rs2
+ //
+ // All three operands are registers.
+ unsigned DestReg = MI->getOperand(0).getReg();
+ unsigned AddrReg = MI->getOperand(1).getReg();
+ unsigned Rs2Reg = MI->getOperand(2).getReg();
+
+ // SelectionDAG has already inserted memory barriers before and after MI, so
+  // we simply have to implement the operation in terms of compare-and-swap.
+ //
+ // %val0 = load %addr
+ // loop:
+ // %val = phi %val0, %dest
+ // %upd = op %val, %rs2
+ // %dest = cas %addr, %val, %upd
+ // cmp %val, %dest
+ // bne loop
+ // done:
+ //
+ bool is64Bit = SP::I64RegsRegClass.hasSubClassEq(MRI.getRegClass(DestReg));
+ const TargetRegisterClass *ValueRC =
+ is64Bit ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
+ unsigned Val0Reg = MRI.createVirtualRegister(ValueRC);
+
+ BuildMI(*MBB, MI, DL, TII.get(is64Bit ? SP::LDXri : SP::LDri), Val0Reg)
+ .addReg(AddrReg).addImm(0);
+
+ // Split the basic block MBB before MI and insert the loop block in the hole.
+ MachineFunction::iterator MFI = MBB;
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *DoneMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ ++MFI;
+ MF->insert(MFI, LoopMBB);
+ MF->insert(MFI, DoneMBB);
+
+ // Move MI and following instructions to DoneMBB.
+ DoneMBB->splice(DoneMBB->begin(), MBB, MI, MBB->end());
+ DoneMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // Connect the CFG again.
+ MBB->addSuccessor(LoopMBB);
+ LoopMBB->addSuccessor(LoopMBB);
+ LoopMBB->addSuccessor(DoneMBB);
+
+ // Build the loop block.
+ unsigned ValReg = MRI.createVirtualRegister(ValueRC);
+ // Opcode == 0 means try to write Rs2Reg directly (ATOMIC_SWAP).
+ unsigned UpdReg = (Opcode ? MRI.createVirtualRegister(ValueRC) : Rs2Reg);
+
+ BuildMI(LoopMBB, DL, TII.get(SP::PHI), ValReg)
+ .addReg(Val0Reg).addMBB(MBB)
+ .addReg(DestReg).addMBB(LoopMBB);
+
+ if (CondCode) {
+ // This is one of the min/max operations. We need a CMPrr followed by a
+ // MOVXCC/MOVICC.
+ BuildMI(LoopMBB, DL, TII.get(SP::CMPrr)).addReg(ValReg).addReg(Rs2Reg);
+ BuildMI(LoopMBB, DL, TII.get(Opcode), UpdReg)
+ .addReg(ValReg).addReg(Rs2Reg).addImm(CondCode);
+ } else if (Opcode) {
+ BuildMI(LoopMBB, DL, TII.get(Opcode), UpdReg)
+ .addReg(ValReg).addReg(Rs2Reg);
+ }
+
+ if (MI->getOpcode() == SP::ATOMIC_LOAD_NAND_32 ||
+ MI->getOpcode() == SP::ATOMIC_LOAD_NAND_64) {
+ unsigned TmpReg = UpdReg;
+ UpdReg = MRI.createVirtualRegister(ValueRC);
+ BuildMI(LoopMBB, DL, TII.get(SP::XORri), UpdReg).addReg(TmpReg).addImm(-1);
+ }
+
+ BuildMI(LoopMBB, DL, TII.get(is64Bit ? SP::CASXrr : SP::CASrr), DestReg)
+ .addReg(AddrReg).addReg(ValReg).addReg(UpdReg)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ BuildMI(LoopMBB, DL, TII.get(SP::CMPrr)).addReg(ValReg).addReg(DestReg);
+ BuildMI(LoopMBB, DL, TII.get(is64Bit ? SP::BPXCC : SP::BCOND))
+ .addMBB(LoopMBB).addImm(SPCC::ICC_NE);
+
+ MI->eraseFromParent();
+ return DoneMBB;
+}
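expandAtomicRMW above is the machine-level form of the classic compare-and-swap retry loop sketched in its comment. The same idea in portable C++, using std::atomic as a stand-in for the CASrr/CASXrr instruction (an illustrative sketch, not LLVM code; the function name is made up):

    #include <atomic>
    #include <cstdint>

    // Returns the old value, like atomicrmw. Op is the update operation
    // (add, and, min, ...), applied to the last value observed in memory.
    template <typename Op>
    uint64_t atomicRMWviaCAS(std::atomic<uint64_t> &Addr, uint64_t Rs2, Op op) {
      uint64_t Val = Addr.load();                   // %val0 = load %addr
      for (;;) {
        uint64_t Upd = op(Val, Rs2);                // %upd  = op %val, %rs2
        // On failure, compare_exchange_weak refreshes Val with the current
        // memory contents, playing the role of the PHI in the loop block.
        if (Addr.compare_exchange_weak(Val, Upd))   // %dest = cas %addr, %val, %upd
          return Val;
      }
    }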
+
//===----------------------------------------------------------------------===//
// Sparc Inline Assembly Support
//===----------------------------------------------------------------------===//
@@ -2761,12 +3077,72 @@ SparcTargetLowering::getConstraintType(const std::string &Constraint) const {
switch (Constraint[0]) {
default: break;
case 'r': return C_RegisterClass;
+ case 'I': // SIMM13
+ return C_Other;
}
}
return TargetLowering::getConstraintType(Constraint);
}
+TargetLowering::ConstraintWeight SparcTargetLowering::
+getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const {
+ ConstraintWeight weight = CW_Invalid;
+ Value *CallOperandVal = info.CallOperandVal;
+ // If we don't have a value, we can't do a match,
+ // but allow it at the lowest weight.
+ if (CallOperandVal == NULL)
+ return CW_Default;
+
+ // Look at the constraint type.
+ switch (*constraint) {
+ default:
+ weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+ break;
+ case 'I': // SIMM13
+ if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
+ if (isInt<13>(C->getSExtValue()))
+ weight = CW_Constant;
+ }
+ break;
+ }
+ return weight;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void SparcTargetLowering::
+LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result(0, 0);
+
+ // Only support length 1 constraints for now.
+ if (Constraint.length() > 1)
+ return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ default: break;
+ case 'I':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (isInt<13>(C->getSExtValue())) {
+ Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
+ break;
+ }
+ return;
+ }
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
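The 'I' constraint above accepts exactly the immediates that fit a SPARC simm13 field; the isInt<13>() test it relies on is equivalent to the following range check (a hedged sketch; the helper name is illustrative):

    #include <cstdint>

    // A simm13 operand is a signed 13-bit immediate: -4096 .. 4095.
    inline bool isSimm13(int64_t V) { return V >= -4096 && V <= 4095; }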
+
std::pair<unsigned, const TargetRegisterClass*>
SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
MVT VT) const {
@@ -2775,6 +3151,26 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case 'r':
return std::make_pair(0U, &SP::IntRegsRegClass);
}
+ } else if (!Constraint.empty() && Constraint.size() <= 5
+ && Constraint[0] == '{' && *(Constraint.end()-1) == '}') {
+ // constraint = '{r<d>}'
+ // Remove the braces from around the name.
+ StringRef name(Constraint.data()+1, Constraint.size()-2);
+ // Handle register aliases:
+ // r0-r7 -> g0-g7
+ // r8-r15 -> o0-o7
+ // r16-r23 -> l0-l7
+ // r24-r31 -> i0-i7
+ uint64_t intVal = 0;
+ if (name.substr(0, 1).equals("r")
+ && !name.substr(1).getAsInteger(10, intVal) && intVal <= 31) {
+ const char regTypes[] = { 'g', 'o', 'l', 'i' };
+ char regType = regTypes[intVal/8];
+ char regIdx = '0' + (intVal % 8);
+ char tmp[] = { '{', regType, regIdx, '}', 0 };
+ std::string newConstraint = std::string(tmp);
+ return TargetLowering::getRegForInlineAsmConstraint(newConstraint, VT);
+ }
}
return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
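The branch above rewrites numeric register constraints of the form {rN} onto SPARC's windowed register names before deferring to the generic handler. The mapping itself is simple; a standalone C++ sketch (illustrative only, not LLVM code; the function name is an assumption of this sketch):

    #include <string>

    // r0-r7 -> g0-g7, r8-r15 -> o0-o7, r16-r23 -> l0-l7, r24-r31 -> i0-i7.
    inline std::string canonicalSparcReg(unsigned N) {
      static const char Kinds[] = {'g', 'o', 'l', 'i'};
      if (N > 31)
        return std::string();            // not one of the rN aliases
      return std::string(1, Kinds[N / 8]) + std::to_string(N % 8);
    }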
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index 2659fc8..f7b45d0 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -73,6 +73,13 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
ConstraintType getConstraintType(const std::string &Constraint) const;
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const;
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const;
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
@@ -158,6 +165,13 @@ namespace llvm {
virtual void ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>& Results,
SelectionDAG &DAG) const;
+
+ MachineBasicBlock *expandSelectCC(MachineInstr *MI, MachineBasicBlock *BB,
+ unsigned BROpcode) const;
+ MachineBasicBlock *expandAtomicRMW(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned Opcode,
+ unsigned CondCode = 0) const;
};
} // end namespace llvm
diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td
index 8656de5..a34ce26 100644
--- a/lib/Target/Sparc/SparcInstr64Bit.td
+++ b/lib/Target/Sparc/SparcInstr64Bit.td
@@ -141,41 +141,43 @@ def : Pat<(i64 imm:$val),
let Predicates = [Is64Bit] in {
// Register-register instructions.
-
-def : Pat<(and i64:$a, i64:$b), (ANDrr $a, $b)>;
-def : Pat<(or i64:$a, i64:$b), (ORrr $a, $b)>;
-def : Pat<(xor i64:$a, i64:$b), (XORrr $a, $b)>;
-
-def : Pat<(and i64:$a, (not i64:$b)), (ANDNrr $a, $b)>;
-def : Pat<(or i64:$a, (not i64:$b)), (ORNrr $a, $b)>;
-def : Pat<(xor i64:$a, (not i64:$b)), (XNORrr $a, $b)>;
-
-def : Pat<(add i64:$a, i64:$b), (ADDrr $a, $b)>;
-def : Pat<(sub i64:$a, i64:$b), (SUBrr $a, $b)>;
-
-def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>;
-
-def : Pat<(tlsadd i64:$a, i64:$b, tglobaltlsaddr:$sym),
- (TLS_ADDrr $a, $b, $sym)>;
-
-// Register-immediate instructions.
-
-def : Pat<(and i64:$a, (i64 simm13:$b)), (ANDri $a, (as_i32imm $b))>;
-def : Pat<(or i64:$a, (i64 simm13:$b)), (ORri $a, (as_i32imm $b))>;
-def : Pat<(xor i64:$a, (i64 simm13:$b)), (XORri $a, (as_i32imm $b))>;
-
-def : Pat<(add i64:$a, (i64 simm13:$b)), (ADDri $a, (as_i32imm $b))>;
-def : Pat<(sub i64:$a, (i64 simm13:$b)), (SUBri $a, (as_i32imm $b))>;
-
-def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (CMPri $a, (as_i32imm $b))>;
-
-def : Pat<(ctpop i64:$src), (POPCrr $src)>;
+let isCodeGenOnly = 1 in {
+defm ANDX : F3_12<"and", 0b000001, and, I64Regs, i64, i64imm>;
+defm ORX : F3_12<"or", 0b000010, or, I64Regs, i64, i64imm>;
+defm XORX : F3_12<"xor", 0b000011, xor, I64Regs, i64, i64imm>;
+
+def ANDXNrr : F3_1<2, 0b000101,
+ (outs I64Regs:$dst), (ins I64Regs:$b, I64Regs:$c),
+ "andn $b, $c, $dst",
+ [(set i64:$dst, (and i64:$b, (not i64:$c)))]>;
+def ORXNrr : F3_1<2, 0b000110,
+ (outs I64Regs:$dst), (ins I64Regs:$b, I64Regs:$c),
+ "orn $b, $c, $dst",
+ [(set i64:$dst, (or i64:$b, (not i64:$c)))]>;
+def XNORXrr : F3_1<2, 0b000111,
+ (outs I64Regs:$dst), (ins I64Regs:$b, I64Regs:$c),
+ "xnor $b, $c, $dst",
+ [(set i64:$dst, (not (xor i64:$b, i64:$c)))]>;
+
+defm ADDX : F3_12<"add", 0b000000, add, I64Regs, i64, i64imm>;
+defm SUBX : F3_12<"sub", 0b000100, sub, I64Regs, i64, i64imm>;
+
+def TLS_ADDXrr : F3_1<2, 0b000000, (outs I64Regs:$rd),
+ (ins I64Regs:$rs1, I64Regs:$rs2, TLSSym:$sym),
+ "add $rs1, $rs2, $rd, $sym",
+ [(set i64:$rd,
+ (tlsadd i64:$rs1, i64:$rs2, tglobaltlsaddr:$sym))]>;
// "LEA" form of add
def LEAX_ADDri : F3_2<2, 0b000000,
(outs I64Regs:$dst), (ins MEMri:$addr),
"add ${addr:arith}, $dst",
[(set iPTR:$dst, ADDRri:$addr)]>;
+}
+
+def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>;
+def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (CMPri $a, (as_i32imm $b))>;
+def : Pat<(ctpop i64:$src), (POPCrr $src)>;
} // Predicates = [Is64Bit]
@@ -191,9 +193,9 @@ def MULXrr : F3_1<2, 0b001001,
"mulx $rs1, $rs2, $rd",
[(set i64:$rd, (mul i64:$rs1, i64:$rs2))]>;
def MULXri : F3_2<2, 0b001001,
- (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$i),
- "mulx $rs1, $i, $rd",
- [(set i64:$rd, (mul i64:$rs1, (i64 simm13:$i)))]>;
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$simm13),
+ "mulx $rs1, $simm13, $rd",
+ [(set i64:$rd, (mul i64:$rs1, (i64 simm13:$simm13)))]>;
// Division can trap.
let hasSideEffects = 1 in {
@@ -202,18 +204,18 @@ def SDIVXrr : F3_1<2, 0b101101,
"sdivx $rs1, $rs2, $rd",
[(set i64:$rd, (sdiv i64:$rs1, i64:$rs2))]>;
def SDIVXri : F3_2<2, 0b101101,
- (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$i),
- "sdivx $rs1, $i, $rd",
- [(set i64:$rd, (sdiv i64:$rs1, (i64 simm13:$i)))]>;
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$simm13),
+ "sdivx $rs1, $simm13, $rd",
+ [(set i64:$rd, (sdiv i64:$rs1, (i64 simm13:$simm13)))]>;
def UDIVXrr : F3_1<2, 0b001101,
(outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2),
"udivx $rs1, $rs2, $rd",
[(set i64:$rd, (udiv i64:$rs1, i64:$rs2))]>;
def UDIVXri : F3_2<2, 0b001101,
- (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$i),
- "udivx $rs1, $i, $rd",
- [(set i64:$rd, (udiv i64:$rs1, (i64 simm13:$i)))]>;
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$simm13),
+ "udivx $rs1, $simm13, $rd",
+ [(set i64:$rd, (udiv i64:$rs1, (i64 simm13:$simm13)))]>;
} // hasSideEffects = 1
} // Predicates = [Is64Bit]
@@ -233,15 +235,10 @@ def UDIVXri : F3_2<2, 0b001101,
let Predicates = [Is64Bit] in {
// 64-bit loads.
-def LDXrr : F3_1<3, 0b001011,
- (outs I64Regs:$dst), (ins MEMrr:$addr),
- "ldx [$addr], $dst",
- [(set i64:$dst, (load ADDRrr:$addr))]>;
-def LDXri : F3_2<3, 0b001011,
- (outs I64Regs:$dst), (ins MEMri:$addr),
- "ldx [$addr], $dst",
- [(set i64:$dst, (load ADDRri:$addr))]>;
-let mayLoad = 1 in
+let DecoderMethod = "DecodeLoadInt" in
+ defm LDX : Load<"ldx", 0b001011, load, I64Regs, i64>;
+
+let mayLoad = 1, isCodeGenOnly = 1, isAsmParserOnly = 1 in
def TLS_LDXrr : F3_1<3, 0b001011,
(outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
"ldx [$addr], $dst, $sym",
@@ -274,24 +271,12 @@ def : Pat<(i64 (extloadi32 ADDRrr:$addr)), (LDrr ADDRrr:$addr)>;
def : Pat<(i64 (extloadi32 ADDRri:$addr)), (LDri ADDRri:$addr)>;
// Sign-extending load of i32 into i64 is a new SPARC v9 instruction.
-def LDSWrr : F3_1<3, 0b001011,
- (outs I64Regs:$dst), (ins MEMrr:$addr),
- "ldsw [$addr], $dst",
- [(set i64:$dst, (sextloadi32 ADDRrr:$addr))]>;
-def LDSWri : F3_2<3, 0b001011,
- (outs I64Regs:$dst), (ins MEMri:$addr),
- "ldsw [$addr], $dst",
- [(set i64:$dst, (sextloadi32 ADDRri:$addr))]>;
+let DecoderMethod = "DecodeLoadInt" in
+ defm LDSW : Load<"ldsw", 0b001000, sextloadi32, I64Regs, i64>;
// 64-bit stores.
-def STXrr : F3_1<3, 0b001110,
- (outs), (ins MEMrr:$addr, I64Regs:$src),
- "stx $src, [$addr]",
- [(store i64:$src, ADDRrr:$addr)]>;
-def STXri : F3_2<3, 0b001110,
- (outs), (ins MEMri:$addr, I64Regs:$src),
- "stx $src, [$addr]",
- [(store i64:$src, ADDRri:$addr)]>;
+let DecoderMethod = "DecodeStoreInt" in
+ defm STX : Store<"stx", 0b001110, store, I64Regs, i64>;
// Truncating stores from i64 are identical to the i32 stores.
def : Pat<(truncstorei8 i64:$src, ADDRrr:$addr), (STBrr ADDRrr:$addr, $src)>;
@@ -311,6 +296,7 @@ def : Pat<(store (i64 0), ADDRri:$dst), (STXri ADDRri:$dst, (i64 G0))>;
//===----------------------------------------------------------------------===//
// 64-bit Conditionals.
//===----------------------------------------------------------------------===//
+
//
// Flag-setting instructions like subcc and addcc set both icc and xcc flags.
// The icc flags correspond to the 32-bit result, and the xcc are for the
@@ -321,35 +307,121 @@ def : Pat<(store (i64 0), ADDRri:$dst), (STXri ADDRri:$dst, (i64 G0))>;
let Predicates = [Is64Bit] in {
-let Uses = [ICC] in
-def BPXCC : BranchSP<(ins brtarget:$imm22, CCOp:$cond),
- "b$cond %xcc, $imm22",
- [(SPbrxcc bb:$imm22, imm:$cond)]>;
+let Uses = [ICC], cc = 0b10 in
+ defm BPX : IPredBranch<"%xcc", [(SPbrxcc bb:$imm19, imm:$cond)]>;
// Conditional moves on %xcc.
let Uses = [ICC], Constraints = "$f = $rd" in {
-def MOVXCCrr : Pseudo<(outs IntRegs:$rd),
+let intcc = 1, cc = 0b10 in {
+def MOVXCCrr : F4_1<0b101100, (outs IntRegs:$rd),
(ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
"mov$cond %xcc, $rs2, $rd",
[(set i32:$rd,
(SPselectxcc i32:$rs2, i32:$f, imm:$cond))]>;
-def MOVXCCri : Pseudo<(outs IntRegs:$rd),
- (ins i32imm:$i, IntRegs:$f, CCOp:$cond),
- "mov$cond %xcc, $i, $rd",
+def MOVXCCri : F4_2<0b101100, (outs IntRegs:$rd),
+ (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+ "mov$cond %xcc, $simm11, $rd",
[(set i32:$rd,
- (SPselectxcc simm11:$i, i32:$f, imm:$cond))]>;
-def FMOVS_XCC : Pseudo<(outs FPRegs:$rd),
+ (SPselectxcc simm11:$simm11, i32:$f, imm:$cond))]>;
+} // cc
+
+let intcc = 1, opf_cc = 0b10 in {
+def FMOVS_XCC : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
(ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
"fmovs$cond %xcc, $rs2, $rd",
[(set f32:$rd,
(SPselectxcc f32:$rs2, f32:$f, imm:$cond))]>;
-def FMOVD_XCC : Pseudo<(outs DFPRegs:$rd),
+def FMOVD_XCC : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
(ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
"fmovd$cond %xcc, $rs2, $rd",
[(set f64:$rd,
(SPselectxcc f64:$rs2, f64:$f, imm:$cond))]>;
+def FMOVQ_XCC : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+ (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+ "fmovq$cond %xcc, $rs2, $rd",
+ [(set f128:$rd,
+ (SPselectxcc f128:$rs2, f128:$f, imm:$cond))]>;
+} // opf_cc
} // Uses, Constraints
+// Branch On integer register with Prediction (BPr).
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in
+multiclass BranchOnReg<bits<3> cond, string OpcStr> {
+ def napt : F2_4<cond, 0, 1, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+ !strconcat(OpcStr, " $rs1, $imm16"), []>;
+ def apt : F2_4<cond, 1, 1, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+ !strconcat(OpcStr, ",a $rs1, $imm16"), []>;
+ def napn : F2_4<cond, 0, 0, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+ !strconcat(OpcStr, ",pn $rs1, $imm16"), []>;
+ def apn : F2_4<cond, 1, 0, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+ !strconcat(OpcStr, ",a,pn $rs1, $imm16"), []>;
+}
+
+multiclass bpr_alias<string OpcStr, Instruction NAPT, Instruction APT> {
+ def : InstAlias<!strconcat(OpcStr, ",pt $rs1, $imm16"),
+ (NAPT I64Regs:$rs1, bprtarget16:$imm16)>;
+ def : InstAlias<!strconcat(OpcStr, ",a,pt $rs1, $imm16"),
+ (APT I64Regs:$rs1, bprtarget16:$imm16)>;
+}
+
+defm BPZ : BranchOnReg<0b001, "brz">;
+defm BPLEZ : BranchOnReg<0b010, "brlez">;
+defm BPLZ : BranchOnReg<0b011, "brlz">;
+defm BPNZ : BranchOnReg<0b101, "brnz">;
+defm BPGZ : BranchOnReg<0b110, "brgz">;
+defm BPGEZ : BranchOnReg<0b111, "brgez">;
+
+defm : bpr_alias<"brz", BPZnapt, BPZapt >;
+defm : bpr_alias<"brlez", BPLEZnapt, BPLEZapt>;
+defm : bpr_alias<"brlz", BPLZnapt, BPLZapt >;
+defm : bpr_alias<"brnz", BPNZnapt, BPNZapt >;
+defm : bpr_alias<"brgz", BPGZnapt, BPGZapt >;
+defm : bpr_alias<"brgez", BPGEZnapt, BPGEZapt>;
+
+// Move integer register on register condition (MOVr).
+multiclass MOVR< bits<3> rcond, string OpcStr> {
+ def rr : F4_4r<0b101111, 0b00000, rcond, (outs I64Regs:$rd),
+ (ins I64Regs:$rs1, IntRegs:$rs2),
+ !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+
+ def ri : F4_4i<0b101111, rcond, (outs I64Regs:$rd),
+ (ins I64Regs:$rs1, i64imm:$simm10),
+ !strconcat(OpcStr, " $rs1, $simm10, $rd"), []>;
+}
+
+defm MOVRRZ : MOVR<0b001, "movrz">;
+defm MOVRLEZ : MOVR<0b010, "movrlez">;
+defm MOVRLZ : MOVR<0b011, "movrlz">;
+defm MOVRNZ : MOVR<0b101, "movrnz">;
+defm MOVRGZ : MOVR<0b110, "movrgz">;
+defm MOVRGEZ : MOVR<0b111, "movrgez">;
+
+// Move FP register on integer register condition (FMOVr).
+multiclass FMOVR<bits<3> rcond, string OpcStr> {
+
+ def S : F4_4r<0b110101, 0b00101, rcond,
+ (outs FPRegs:$rd), (ins I64Regs:$rs1, FPRegs:$rs2),
+ !strconcat(!strconcat("fmovrs", OpcStr)," $rs1, $rs2, $rd"),
+ []>;
+ def D : F4_4r<0b110101, 0b00110, rcond,
+ (outs FPRegs:$rd), (ins I64Regs:$rs1, FPRegs:$rs2),
+ !strconcat(!strconcat("fmovrd", OpcStr)," $rs1, $rs2, $rd"),
+ []>;
+ def Q : F4_4r<0b110101, 0b00111, rcond,
+ (outs FPRegs:$rd), (ins I64Regs:$rs1, FPRegs:$rs2),
+ !strconcat(!strconcat("fmovrq", OpcStr)," $rs1, $rs2, $rd"),
+ []>, Requires<[HasHardQuad]>;
+}
+
+let Predicates = [HasV9] in {
+ defm FMOVRZ : FMOVR<0b001, "z">;
+ defm FMOVRLEZ : FMOVR<0b010, "lez">;
+ defm FMOVRLZ : FMOVR<0b011, "lz">;
+ defm FMOVRNZ : FMOVR<0b101, "nz">;
+ defm FMOVRGZ : FMOVR<0b110, "gz">;
+ defm FMOVRGEZ : FMOVR<0b111, "gez">;
+}
+
//===----------------------------------------------------------------------===//
// 64-bit Floating Point Conversions.
//===----------------------------------------------------------------------===//
@@ -357,31 +429,31 @@ def FMOVD_XCC : Pseudo<(outs DFPRegs:$rd),
let Predicates = [Is64Bit] in {
def FXTOS : F3_3u<2, 0b110100, 0b010000100,
- (outs FPRegs:$dst), (ins DFPRegs:$src),
- "fxtos $src, $dst",
- [(set FPRegs:$dst, (SPxtof DFPRegs:$src))]>;
+ (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+ "fxtos $rs2, $rd",
+ [(set FPRegs:$rd, (SPxtof DFPRegs:$rs2))]>;
def FXTOD : F3_3u<2, 0b110100, 0b010001000,
- (outs DFPRegs:$dst), (ins DFPRegs:$src),
- "fxtod $src, $dst",
- [(set DFPRegs:$dst, (SPxtof DFPRegs:$src))]>;
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fxtod $rs2, $rd",
+ [(set DFPRegs:$rd, (SPxtof DFPRegs:$rs2))]>;
def FXTOQ : F3_3u<2, 0b110100, 0b010001100,
- (outs QFPRegs:$dst), (ins DFPRegs:$src),
- "fxtoq $src, $dst",
- [(set QFPRegs:$dst, (SPxtof DFPRegs:$src))]>,
+ (outs QFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fxtoq $rs2, $rd",
+ [(set QFPRegs:$rd, (SPxtof DFPRegs:$rs2))]>,
Requires<[HasHardQuad]>;
def FSTOX : F3_3u<2, 0b110100, 0b010000001,
- (outs DFPRegs:$dst), (ins FPRegs:$src),
- "fstox $src, $dst",
- [(set DFPRegs:$dst, (SPftox FPRegs:$src))]>;
+ (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+ "fstox $rs2, $rd",
+ [(set DFPRegs:$rd, (SPftox FPRegs:$rs2))]>;
def FDTOX : F3_3u<2, 0b110100, 0b010000010,
- (outs DFPRegs:$dst), (ins DFPRegs:$src),
- "fdtox $src, $dst",
- [(set DFPRegs:$dst, (SPftox DFPRegs:$src))]>;
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fdtox $rs2, $rd",
+ [(set DFPRegs:$rd, (SPftox DFPRegs:$rs2))]>;
def FQTOX : F3_3u<2, 0b110100, 0b010000011,
- (outs DFPRegs:$dst), (ins QFPRegs:$src),
- "fqtox $src, $dst",
- [(set DFPRegs:$dst, (SPftox QFPRegs:$src))]>,
+ (outs DFPRegs:$rd), (ins QFPRegs:$rs2),
+ "fqtox $rs2, $rd",
+ [(set DFPRegs:$rd, (SPftox QFPRegs:$rs2))]>,
Requires<[HasHardQuad]>;
} // Predicates = [Is64Bit]
@@ -402,3 +474,100 @@ def : Pat<(SPselectfcc (i64 simm11:$t), i64:$f, imm:$cond),
(MOVFCCri (as_i32imm $t), $f, imm:$cond)>;
} // Predicates = [Is64Bit]
+
+
+// 64 bit SETHI
+let Predicates = [Is64Bit], isCodeGenOnly = 1 in {
+def SETHIXi : F2_1<0b100,
+ (outs IntRegs:$rd), (ins i64imm:$imm22),
+ "sethi $imm22, $rd",
+ [(set i64:$rd, SETHIimm:$imm22)]>;
+}
+
+// ATOMICS.
+let Predicates = [Is64Bit], Constraints = "$swap = $rd" in {
+ def CASXrr: F3_1_asi<3, 0b111110, 0b10000000,
+ (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2,
+ I64Regs:$swap),
+ "casx [$rs1], $rs2, $rd",
+ [(set i64:$rd,
+ (atomic_cmp_swap i64:$rs1, i64:$rs2, i64:$swap))]>;
+
+} // Predicates = [Is64Bit], Constraints = ...
+
+let Predicates = [Is64Bit] in {
+
+def : Pat<(atomic_fence imm, imm), (MEMBARi 0xf)>;
+
+// atomic_load_64 addr -> load addr
+def : Pat<(i64 (atomic_load ADDRrr:$src)), (LDXrr ADDRrr:$src)>;
+def : Pat<(i64 (atomic_load ADDRri:$src)), (LDXri ADDRri:$src)>;
+
+// atomic_store_64 val, addr -> store val, addr
+def : Pat<(atomic_store ADDRrr:$dst, i64:$val), (STXrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store ADDRri:$dst, i64:$val), (STXri ADDRri:$dst, $val)>;
+
+} // Predicates = [Is64Bit]
+
+let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1,
+ Defs = [ICC] in
+multiclass AtomicRMW<SDPatternOperator op32, SDPatternOperator op64> {
+
+ def _32 : Pseudo<(outs IntRegs:$rd),
+ (ins ptr_rc:$addr, IntRegs:$rs2), "",
+ [(set i32:$rd, (op32 iPTR:$addr, i32:$rs2))]>;
+
+ let Predicates = [Is64Bit] in
+ def _64 : Pseudo<(outs I64Regs:$rd),
+ (ins ptr_rc:$addr, I64Regs:$rs2), "",
+ [(set i64:$rd, (op64 iPTR:$addr, i64:$rs2))]>;
+}
+
+defm ATOMIC_LOAD_ADD : AtomicRMW<atomic_load_add_32, atomic_load_add_64>;
+defm ATOMIC_LOAD_SUB : AtomicRMW<atomic_load_sub_32, atomic_load_sub_64>;
+defm ATOMIC_LOAD_AND : AtomicRMW<atomic_load_and_32, atomic_load_and_64>;
+defm ATOMIC_LOAD_OR : AtomicRMW<atomic_load_or_32, atomic_load_or_64>;
+defm ATOMIC_LOAD_XOR : AtomicRMW<atomic_load_xor_32, atomic_load_xor_64>;
+defm ATOMIC_LOAD_NAND : AtomicRMW<atomic_load_nand_32, atomic_load_nand_64>;
+defm ATOMIC_LOAD_MIN : AtomicRMW<atomic_load_min_32, atomic_load_min_64>;
+defm ATOMIC_LOAD_MAX : AtomicRMW<atomic_load_max_32, atomic_load_max_64>;
+defm ATOMIC_LOAD_UMIN : AtomicRMW<atomic_load_umin_32, atomic_load_umin_64>;
+defm ATOMIC_LOAD_UMAX : AtomicRMW<atomic_load_umax_32, atomic_load_umax_64>;
+
+// There is no 64-bit variant of SWAP, so use a pseudo.
+let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1,
+ Defs = [ICC], Predicates = [Is64Bit] in
+def ATOMIC_SWAP_64 : Pseudo<(outs I64Regs:$rd),
+ (ins ptr_rc:$addr, I64Regs:$rs2), "",
+ [(set i64:$rd,
+ (atomic_swap_64 iPTR:$addr, i64:$rs2))]>;
+
+let Predicates = [Is64Bit], hasSideEffects = 1, Uses = [ICC], cc = 0b10 in
+ defm TXCC : TRAP<"%xcc">;
+
+// Global addresses, constant pool entries
+let Predicates = [Is64Bit] in {
+
+def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
+def : Pat<(SPlo tglobaladdr:$in), (ORXri (i64 G0), tglobaladdr:$in)>;
+def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
+def : Pat<(SPlo tconstpool:$in), (ORXri (i64 G0), tconstpool:$in)>;
+
+// GlobalTLS addresses
+def : Pat<(SPhi tglobaltlsaddr:$in), (SETHIi tglobaltlsaddr:$in)>;
+def : Pat<(SPlo tglobaltlsaddr:$in), (ORXri (i64 G0), tglobaltlsaddr:$in)>;
+def : Pat<(add (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+ (ADDXri (SETHIXi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+def : Pat<(xor (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+ (XORXri (SETHIXi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+
+// Blockaddress
+def : Pat<(SPhi tblockaddress:$in), (SETHIi tblockaddress:$in)>;
+def : Pat<(SPlo tblockaddress:$in), (ORXri (i64 G0), tblockaddress:$in)>;
+
+// Add reg, lo. This is used when taking the addr of a global/constpool entry.
+def : Pat<(add iPTR:$r, (SPlo tglobaladdr:$in)), (ADDXri $r, tglobaladdr:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tconstpool:$in)), (ADDXri $r, tconstpool:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tblockaddress:$in)),
+ (ADDXri $r, tblockaddress:$in)>;
+}
diff --git a/lib/Target/Sparc/SparcInstrAliases.td b/lib/Target/Sparc/SparcInstrAliases.td
new file mode 100644
index 0000000..33c2aa1
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrAliases.td
@@ -0,0 +1,325 @@
+//===-- SparcInstrAliases.td - Instruction Aliases for Sparc Target -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction aliases for Sparc.
+//===----------------------------------------------------------------------===//
+
+// Instruction aliases for conditional moves.
+
+// mov<cond> <ccreg> rs2, rd
+multiclass intcond_mov_alias<string cond, int condVal, string ccreg,
+ Instruction movrr, Instruction movri,
+ Instruction fmovs, Instruction fmovd> {
+
+ // mov<cond> (%icc|%xcc), rs2, rd
+ def : InstAlias<!strconcat(!strconcat(!strconcat("mov", cond), ccreg),
+ ", $rs2, $rd"),
+ (movrr IntRegs:$rd, IntRegs:$rs2, condVal)>;
+
+ // mov<cond> (%icc|%xcc), simm11, rd
+ def : InstAlias<!strconcat(!strconcat(!strconcat("mov", cond), ccreg),
+ ", $simm11, $rd"),
+ (movri IntRegs:$rd, i32imm:$simm11, condVal)>;
+
+ // fmovs<cond> (%icc|%xcc), $rs2, $rd
+ def : InstAlias<!strconcat(!strconcat(!strconcat("fmovs", cond), ccreg),
+ ", $rs2, $rd"),
+ (fmovs FPRegs:$rd, FPRegs:$rs2, condVal)>;
+
+ // fmovd<cond> (%icc|%xcc), $rs2, $rd
+ def : InstAlias<!strconcat(!strconcat(!strconcat("fmovd", cond), ccreg),
+ ", $rs2, $rd"),
+ (fmovd DFPRegs:$rd, DFPRegs:$rs2, condVal)>;
+}
+
+// mov<cond> <ccreg> rs2, rd
+multiclass fpcond_mov_alias<string cond, int condVal,
+ Instruction movrr, Instruction movri,
+ Instruction fmovs, Instruction fmovd> {
+
+ // mov<cond> %fcc[0-3], rs2, rd
+ def : InstAlias<!strconcat(!strconcat("mov", cond), " $cc, $rs2, $rd"),
+ (movrr IntRegs:$rd, FCCRegs:$cc, IntRegs:$rs2, condVal)>;
+
+ // mov<cond> %fcc[0-3], simm11, rd
+ def : InstAlias<!strconcat(!strconcat("mov", cond), " $cc, $simm11, $rd"),
+ (movri IntRegs:$rd, FCCRegs:$cc, i32imm:$simm11, condVal)>;
+
+ // fmovs<cond> %fcc[0-3], $rs2, $rd
+ def : InstAlias<!strconcat(!strconcat("fmovs", cond), " $cc, $rs2, $rd"),
+ (fmovs FPRegs:$rd, FCCRegs:$cc, FPRegs:$rs2, condVal)>;
+
+ // fmovd<cond> %fcc[0-3], $rs2, $rd
+ def : InstAlias<!strconcat(!strconcat("fmovd", cond), " $cc, $rs2, $rd"),
+ (fmovd DFPRegs:$rd, FCCRegs:$cc, DFPRegs:$rs2, condVal)>;
+}
+
+// Instruction aliases for integer conditional branches and moves.
+multiclass int_cond_alias<string cond, int condVal> {
+
+ // b<cond> $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), " $imm"),
+ (BCOND brtarget:$imm, condVal)>;
+
+ // b<cond>,a $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a $imm"),
+ (BCONDA brtarget:$imm, condVal)>;
+
+ // b<cond> %icc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), " %icc, $imm"),
+ (BPICC brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+ // b<cond>,pt %icc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",pt %icc, $imm"),
+ (BPICC brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+ // b<cond>,a %icc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a %icc, $imm"),
+ (BPICCA brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+ // b<cond>,a,pt %icc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pt %icc, $imm"),
+ (BPICCA brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+ // b<cond>,pn %icc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",pn %icc, $imm"),
+ (BPICCNT brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+ // b<cond>,a,pn %icc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pn %icc, $imm"),
+ (BPICCANT brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+ // b<cond> %xcc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), " %xcc, $imm"),
+ (BPXCC brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+ // b<cond>,pt %xcc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",pt %xcc, $imm"),
+ (BPXCC brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+ // b<cond>,a %xcc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a %xcc, $imm"),
+ (BPXCCA brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+ // b<cond>,a,pt %xcc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pt %xcc, $imm"),
+ (BPXCCA brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+ // b<cond>,pn %xcc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",pn %xcc, $imm"),
+ (BPXCCNT brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+ // b<cond>,a,pn %xcc, $imm
+ def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pn %xcc, $imm"),
+ (BPXCCANT brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+
+ defm : intcond_mov_alias<cond, condVal, " %icc",
+ MOVICCrr, MOVICCri,
+ FMOVS_ICC, FMOVD_ICC>, Requires<[HasV9]>;
+
+ defm : intcond_mov_alias<cond, condVal, " %xcc",
+ MOVXCCrr, MOVXCCri,
+ FMOVS_XCC, FMOVD_XCC>, Requires<[Is64Bit]>;
+
+ // fmovq<cond> (%icc|%xcc), $rs2, $rd
+ def : InstAlias<!strconcat(!strconcat("fmovq", cond), " %icc, $rs2, $rd"),
+ (FMOVQ_ICC QFPRegs:$rd, QFPRegs:$rs2, condVal)>,
+ Requires<[HasV9, HasHardQuad]>;
+ def : InstAlias<!strconcat(!strconcat("fmovq", cond), " %xcc, $rs2, $rd"),
+ (FMOVQ_XCC QFPRegs:$rd, QFPRegs:$rs2, condVal)>,
+ Requires<[Is64Bit, HasHardQuad]>;
+
+ // t<cond> %icc, rs1 + rs2
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs1 + $rs2"),
+ (TICCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+ Requires<[HasV9]>;
+
+ // t<cond> %icc, rs => t<cond> %icc, G0 + rs
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs2"),
+ (TICCrr G0, IntRegs:$rs2, condVal)>,
+ Requires<[HasV9]>;
+
+ // t<cond> %xcc, rs1 + rs2
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs1 + $rs2"),
+ (TXCCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+ Requires<[HasV9]>;
+
+ // t<cond> %xcc, rs => t<cond> %xcc, G0 + rs
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs2"),
+ (TXCCrr G0, IntRegs:$rs2, condVal)>,
+ Requires<[HasV9]>;
+
+ // t<cond> rs1 + rs2 => t<cond> %icc, rs1 + rs2
+ def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $rs2"),
+ (TICCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>;
+
+  // t<cond> rs2 => t<cond> %icc, G0 + rs2
+ def : InstAlias<!strconcat(!strconcat("t", cond), " $rs2"),
+ (TICCrr G0, IntRegs:$rs2, condVal)>;
+
+ // t<cond> %icc, rs1 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs1 + $imm"),
+ (TICCri IntRegs:$rs1, i32imm:$imm, condVal)>,
+ Requires<[HasV9]>;
+ // t<cond> %icc, imm => t<cond> %icc, G0 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $imm"),
+ (TICCri G0, i32imm:$imm, condVal)>,
+ Requires<[HasV9]>;
+ // t<cond> %xcc, rs1 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs1 + $imm"),
+ (TXCCri IntRegs:$rs1, i32imm:$imm, condVal)>,
+ Requires<[HasV9]>;
+ // t<cond> %xcc, imm => t<cond> %xcc, G0 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $imm"),
+ (TXCCri G0, i32imm:$imm, condVal)>,
+ Requires<[HasV9]>;
+
+ // t<cond> rs1 + imm => t<cond> %icc, rs1 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $imm"),
+ (TICCri IntRegs:$rs1, i32imm:$imm, condVal)>;
+
+ // t<cond> imm => t<cond> %icc, G0 + imm
+ def : InstAlias<!strconcat(!strconcat("t", cond), " $imm"),
+ (TICCri G0, i32imm:$imm, condVal)>;
+
+}
+
+
+// Instruction aliases for floating point conditional branches and moves.
+multiclass fp_cond_alias<string cond, int condVal> {
+
+ // fb<cond> $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), " $imm"),
+ (FBCOND brtarget:$imm, condVal), 0>;
+
+ // fb<cond>,a $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), ",a $imm"),
+ (FBCONDA brtarget:$imm, condVal), 0>;
+
+ // fb<cond> %fcc0, $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), " $cc, $imm"),
+ (BPFCC brtarget:$imm, condVal, FCCRegs:$cc)>,
+ Requires<[HasV9]>;
+
+ // fb<cond>,pt %fcc0, $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), ",pt $cc, $imm"),
+ (BPFCC brtarget:$imm, condVal, FCCRegs:$cc)>,
+ Requires<[HasV9]>;
+
+ // fb<cond>,a %fcc0, $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), ",a $cc, $imm"),
+ (BPFCCA brtarget:$imm, condVal, FCCRegs:$cc)>,
+ Requires<[HasV9]>;
+
+ // fb<cond>,a,pt %fcc0, $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), ",a,pt $cc, $imm"),
+ (BPFCCA brtarget:$imm, condVal, FCCRegs:$cc)>,
+ Requires<[HasV9]>;
+
+ // fb<cond>,pn %fcc0, $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), ",pn $cc, $imm"),
+ (BPFCCNT brtarget:$imm, condVal, FCCRegs:$cc)>,
+ Requires<[HasV9]>;
+
+ // fb<cond>,a,pn %fcc0, $imm
+ def : InstAlias<!strconcat(!strconcat("fb", cond), ",a,pn $cc, $imm"),
+ (BPFCCANT brtarget:$imm, condVal, FCCRegs:$cc)>,
+ Requires<[HasV9]>;
+
+ defm : fpcond_mov_alias<cond, condVal,
+ V9MOVFCCrr, V9MOVFCCri,
+ V9FMOVS_FCC, V9FMOVD_FCC>, Requires<[HasV9]>;
+
+ // fmovq<cond> %fcc0, $rs2, $rd
+ def : InstAlias<!strconcat(!strconcat("fmovq", cond), " $cc, $rs2, $rd"),
+ (V9FMOVQ_FCC QFPRegs:$rd, FCCRegs:$cc, QFPRegs:$rs2,
+ condVal)>,
+ Requires<[HasV9, HasHardQuad]>;
+}
+
+defm : int_cond_alias<"a", 0b1000>;
+defm : int_cond_alias<"n", 0b0000>;
+defm : int_cond_alias<"ne", 0b1001>;
+defm : int_cond_alias<"e", 0b0001>;
+defm : int_cond_alias<"g", 0b1010>;
+defm : int_cond_alias<"le", 0b0010>;
+defm : int_cond_alias<"ge", 0b1011>;
+defm : int_cond_alias<"l", 0b0011>;
+defm : int_cond_alias<"gu", 0b1100>;
+defm : int_cond_alias<"leu", 0b0100>;
+defm : int_cond_alias<"cc", 0b1101>;
+defm : int_cond_alias<"cs", 0b0101>;
+defm : int_cond_alias<"pos", 0b1110>;
+defm : int_cond_alias<"neg", 0b0110>;
+defm : int_cond_alias<"vc", 0b1111>;
+defm : int_cond_alias<"vs", 0b0111>;
+
+defm : fp_cond_alias<"a", 0b0000>;
+defm : fp_cond_alias<"n", 0b1000>;
+defm : fp_cond_alias<"u", 0b0111>;
+defm : fp_cond_alias<"g", 0b0110>;
+defm : fp_cond_alias<"ug", 0b0101>;
+defm : fp_cond_alias<"l", 0b0100>;
+defm : fp_cond_alias<"ul", 0b0011>;
+defm : fp_cond_alias<"lg", 0b0010>;
+defm : fp_cond_alias<"ne", 0b0001>;
+defm : fp_cond_alias<"e", 0b1001>;
+defm : fp_cond_alias<"ue", 0b1010>;
+defm : fp_cond_alias<"ge", 0b1011>;
+defm : fp_cond_alias<"uge", 0b1100>;
+defm : fp_cond_alias<"le", 0b1101>;
+defm : fp_cond_alias<"ule", 0b1110>;
+defm : fp_cond_alias<"o", 0b1111>;
+
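Each of the defm lines above stamps out the full set of aliases defined by the int_cond_alias / fp_cond_alias multiclasses for one condition name and condition-code value. As a rough illustration of the expansion (using only operand classes already named in the multiclasses), the "ne" instantiation of int_cond_alias produces, among others, an alias equivalent to:

  def : InstAlias<"tne $rs1 + $imm",
                  (TICCri IntRegs:$rs1, i32imm:$imm, 0b1001)>;

so the assembler accepts the short trap form and canonicalizes it onto the %icc-based TICCri instruction.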
+// Instruction aliases for JMPL.
+
+// jmp addr -> jmpl addr, %g0
+def : InstAlias<"jmp $addr", (JMPLrr G0, MEMrr:$addr)>;
+def : InstAlias<"jmp $addr", (JMPLri G0, MEMri:$addr)>;
+
+// call addr -> jmpl addr, %o7
+def : InstAlias<"call $addr", (JMPLrr O7, MEMrr:$addr)>;
+def : InstAlias<"call $addr", (JMPLri O7, MEMri:$addr)>;
+
+// retl -> RETL 8
+def : InstAlias<"retl", (RETL 8)>;
+
+// ret -> RET 8
+def : InstAlias<"ret", (RET 8)>;
+
+// mov reg, rd -> or %g0, reg, rd
+def : InstAlias<"mov $rs2, $rd", (ORrr IntRegs:$rd, G0, IntRegs:$rs2)>;
+
+// mov simm13, rd -> or %g0, simm13, rd
+def : InstAlias<"mov $simm13, $rd", (ORri IntRegs:$rd, G0, i32imm:$simm13)>;
+
+// restore -> restore %g0, %g0, %g0
+def : InstAlias<"restore", (RESTORErr G0, G0, G0)>;
+
+def : MnemonicAlias<"return", "rett">, Requires<[HasV9]>;
+
+def : MnemonicAlias<"addc", "addx">, Requires<[HasV9]>;
+def : MnemonicAlias<"addccc", "addxcc">, Requires<[HasV9]>;
+
+def : MnemonicAlias<"subc", "subx">, Requires<[HasV9]>;
+def : MnemonicAlias<"subccc", "subxcc">, Requires<[HasV9]>;
+
+
+def : InstAlias<"fcmps $rs1, $rs2", (V9FCMPS FCC0, FPRegs:$rs1, FPRegs:$rs2)>;
+def : InstAlias<"fcmpd $rs1, $rs2", (V9FCMPD FCC0, DFPRegs:$rs1, DFPRegs:$rs2)>;
+def : InstAlias<"fcmpq $rs1, $rs2", (V9FCMPQ FCC0, QFPRegs:$rs1, QFPRegs:$rs2)>,
+ Requires<[HasHardQuad]>;
+
+def : InstAlias<"fcmpes $rs1, $rs2", (V9FCMPES FCC0, FPRegs:$rs1, FPRegs:$rs2)>;
+def : InstAlias<"fcmped $rs1, $rs2", (V9FCMPED FCC0, DFPRegs:$rs1,
+ DFPRegs:$rs2)>;
+def : InstAlias<"fcmpeq $rs1, $rs2", (V9FCMPEQ FCC0, QFPRegs:$rs1,
+ QFPRegs:$rs2)>,
+ Requires<[HasHardQuad]>;
diff --git a/lib/Target/Sparc/SparcInstrFormats.td b/lib/Target/Sparc/SparcInstrFormats.td
index afa2874..3b5e238 100644
--- a/lib/Target/Sparc/SparcInstrFormats.td
+++ b/lib/Target/Sparc/SparcInstrFormats.td
@@ -12,6 +12,7 @@ class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern>
field bits<32> Inst;
let Namespace = "SP";
+ let Size = 4;
bits<2> op;
let Inst{31-30} = op; // Top two bits are the 'op' field
@@ -20,6 +21,9 @@ class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern>
dag InOperandList = ins;
let AsmString = asmstr;
let Pattern = pattern;
+
+ let DecoderNamespace = "Sparc";
+ field bits<32> SoftFail = 0;
}
//===----------------------------------------------------------------------===//
@@ -47,17 +51,51 @@ class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{29-25} = rd;
}
-class F2_2<bits<3> op2Val, dag outs, dag ins, string asmstr,
+class F2_2<bits<3> op2Val, bit annul, dag outs, dag ins, string asmstr,
list<dag> pattern> : F2<outs, ins, asmstr, pattern> {
bits<4> cond;
- bit annul = 0; // currently unused
-
let op2 = op2Val;
let Inst{29} = annul;
let Inst{28-25} = cond;
}
+class F2_3<bits<3> op2Val, bit annul, bit pred,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSP<outs, ins, asmstr, pattern> {
+ bits<2> cc;
+ bits<4> cond;
+ bits<19> imm19;
+
+ let op = 0; // op = 0
+
+ let Inst{29} = annul;
+ let Inst{28-25} = cond;
+ let Inst{24-22} = op2Val;
+ let Inst{21-20} = cc;
+ let Inst{19} = pred;
+ let Inst{18-0} = imm19;
+}
+
+class F2_4<bits<3> cond, bit annul, bit pred,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSP<outs, ins, asmstr, pattern> {
+ bits<16> imm16;
+ bits<5> rs1;
+
+ let op = 0; // op = 0
+
+ let Inst{29} = annul;
+ let Inst{28} = 0;
+ let Inst{27-25} = cond;
+ let Inst{24-22} = 0b011;
+ let Inst{21-20} = imm16{15-14};
+ let Inst{19} = pred;
+ let Inst{18-14} = rs1;
+ let Inst{13-0} = imm16{13-0};
+}
+
+
//===----------------------------------------------------------------------===//
// Format #3 instruction classes in the Sparc
//===----------------------------------------------------------------------===//
@@ -75,9 +113,8 @@ class F3<dag outs, dag ins, string asmstr, list<dag> pattern>
// Specific F3 classes: SparcV8 manual, page 44
//
-class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
+class F3_1_asi<bits<2> opVal, bits<6> op3val, bits<8> asi, dag outs, dag ins,
string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
- bits<8> asi = 0; // asi not currently used
bits<5> rs2;
let op = opVal;
@@ -88,6 +125,10 @@ class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
let Inst{4-0} = rs2;
}
+class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : F3_1_asi<opVal, op3val, 0, outs, ins,
+ asmstr, pattern>;
+
class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
bits<13> simm13;
@@ -131,7 +172,6 @@ class F3_3c<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
let op = opVal;
let op3 = op3val;
- let rd = 0;
let Inst{13-5} = opfval; // fp opcode
let Inst{4-0} = rs2;
@@ -168,12 +208,12 @@ class F3_Si<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
// Define rr and ri shift instructions with patterns.
multiclass F3_S<string OpcStr, bits<6> Op3Val, bit XVal, SDNode OpNode,
ValueType VT, RegisterClass RC> {
- def rr : F3_Sr<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs, IntRegs:$rs2),
- !strconcat(OpcStr, " $rs, $rs2, $rd"),
- [(set VT:$rd, (OpNode VT:$rs, i32:$rs2))]>;
- def ri : F3_Si<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs, i32imm:$shcnt),
- !strconcat(OpcStr, " $rs, $shcnt, $rd"),
- [(set VT:$rd, (OpNode VT:$rs, (i32 imm:$shcnt)))]>;
+ def rr : F3_Sr<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, IntRegs:$rs2),
+ !strconcat(OpcStr, " $rs1, $rs2, $rd"),
+ [(set VT:$rd, (OpNode VT:$rs1, i32:$rs2))]>;
+ def ri : F3_Si<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, i32imm:$shcnt),
+ !strconcat(OpcStr, " $rs1, $shcnt, $rd"),
+ [(set VT:$rd, (OpNode VT:$rs1, (i32 imm:$shcnt)))]>;
}
class F4<bits<6> op3, dag outs, dag ins, string asmstr, list<dag> pattern>
@@ -190,44 +230,101 @@ class F4_1<bits<6> op3, dag outs, dag ins,
string asmstr, list<dag> pattern>
: F4<op3, outs, ins, asmstr, pattern> {
- bits<3> cc;
+ bit intcc;
+ bits<2> cc;
bits<4> cond;
bits<5> rs2;
let Inst{4-0} = rs2;
- let Inst{11} = cc{0};
- let Inst{12} = cc{1};
+ let Inst{12-11} = cc;
let Inst{13} = 0;
let Inst{17-14} = cond;
- let Inst{18} = cc{2};
+ let Inst{18} = intcc;
}
class F4_2<bits<6> op3, dag outs, dag ins,
string asmstr, list<dag> pattern>
: F4<op3, outs, ins, asmstr, pattern> {
- bits<3> cc;
+ bit intcc;
+ bits<2> cc;
bits<4> cond;
bits<11> simm11;
let Inst{10-0} = simm11;
- let Inst{11} = cc{0};
- let Inst{12} = cc{1};
+ let Inst{12-11} = cc;
let Inst{13} = 1;
let Inst{17-14} = cond;
- let Inst{18} = cc{2};
+ let Inst{18} = intcc;
}
class F4_3<bits<6> op3, bits<6> opf_low, dag outs, dag ins,
string asmstr, list<dag> pattern>
: F4<op3, outs, ins, asmstr, pattern> {
bits<4> cond;
- bits<3> opf_cc;
+ bit intcc;
+ bits<2> opf_cc;
bits<5> rs2;
let Inst{18} = 0;
let Inst{17-14} = cond;
- let Inst{13-11} = opf_cc;
+ let Inst{13} = intcc;
+ let Inst{12-11} = opf_cc;
let Inst{10-5} = opf_low;
let Inst{4-0} = rs2;
}
+
+class F4_4r<bits<6> op3, bits<5> opf_low, bits<3> rcond, dag outs, dag ins,
+ string asmstr, list<dag> pattern>
+ : F4<op3, outs, ins, asmstr, pattern> {
+ bits <5> rs1;
+ bits <5> rs2;
+ let Inst{18-14} = rs1;
+ let Inst{13} = 0; // IsImm
+ let Inst{12-10} = rcond;
+ let Inst{9-5} = opf_low;
+ let Inst{4-0} = rs2;
+}
+
+
+class F4_4i<bits<6> op3, bits<3> rcond, dag outs, dag ins,
+ string asmstr, list<dag> pattern>
+ : F4<op3, outs, ins, asmstr, pattern> {
+ bits<5> rs1;
+ bits<10> simm10;
+ let Inst{18-14} = rs1;
+ let Inst{13} = 1; // IsImm
+ let Inst{12-10} = rcond;
+ let Inst{9-0} = simm10;
+}
+
+
+class TRAPSP<bits<6> op3Val, bit isimm, dag outs, dag ins, string asmstr,
+ list<dag> pattern>: F3<outs, ins, asmstr, pattern> {
+
+ bits<4> cond;
+ bits<2> cc;
+
+ let op = 0b10;
+ let rd{4} = 0;
+ let rd{3-0} = cond;
+ let op3 = op3Val;
+ let Inst{13} = isimm;
+ let Inst{12-11} = cc;
+
+}
+
+class TRAPSPrr<bits<6> op3Val, dag outs, dag ins, string asmstr,
+ list<dag> pattern>: TRAPSP<op3Val, 0, outs, ins, asmstr, pattern> {
+ bits<5> rs2;
+
+ let Inst{10-5} = 0;
+ let Inst{4-0} = rs2;
+}
+class TRAPSPri<bits<6> op3Val, dag outs, dag ins, string asmstr,
+ list<dag> pattern>: TRAPSP<op3Val, 1, outs, ins, asmstr, pattern> {
+ bits<8> imm;
+
+ let Inst{10-8} = 0;
+ let Inst{7-0} = imm;
+}
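For a quick sanity check of the TRAPSP layout above, consider "ta 5" (trap always on %icc with %g0 + 5): op = 10, rd = 01000 (cond packed into the low four bits of rd), op3 = 111010, rs1 = 00000, i = 1, cc = 00, imm = 00000101. Those fields should pack to 0x91D02005, consistent with the well-known 0x91D02010 encoding of "ta 0x10".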
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index c10b5b3..abf6c17 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -89,6 +89,8 @@ static bool IsIntegerCC(unsigned CC)
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
{
switch(CC) {
+ case SPCC::ICC_A: return SPCC::ICC_N;
+ case SPCC::ICC_N: return SPCC::ICC_A;
case SPCC::ICC_NE: return SPCC::ICC_E;
case SPCC::ICC_E: return SPCC::ICC_NE;
case SPCC::ICC_G: return SPCC::ICC_LE;
@@ -104,6 +106,8 @@ static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
case SPCC::ICC_VC: return SPCC::ICC_VS;
case SPCC::ICC_VS: return SPCC::ICC_VC;
+ case SPCC::FCC_A: return SPCC::FCC_N;
+ case SPCC::FCC_N: return SPCC::FCC_A;
case SPCC::FCC_U: return SPCC::FCC_O;
case SPCC::FCC_O: return SPCC::FCC_U;
case SPCC::FCC_G: return SPCC::FCC_ULE;
@@ -154,8 +158,8 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
continue;
}
- while (llvm::next(I) != MBB.end())
- llvm::next(I)->eraseFromParent();
+ while (std::next(I) != MBB.end())
+ std::next(I)->eraseFromParent();
Cond.clear();
FBB = 0;
@@ -431,8 +435,9 @@ unsigned SparcInstrInfo::getGlobalBaseReg(MachineFunction *MF) const
MachineBasicBlock::iterator MBBI = FirstMBB.begin();
MachineRegisterInfo &RegInfo = MF->getRegInfo();
- GlobalBaseReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
-
+ const TargetRegisterClass *PtrRC =
+ Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
+ GlobalBaseReg = RegInfo.createVirtualRegister(PtrRC);
DebugLoc dl;
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index ef7a114..960261c 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -29,7 +29,8 @@ def Is64Bit : Predicate<"Subtarget.is64Bit()">;
// HasV9 - This predicate is true when the target processor supports V9
// instructions. Note that the machine may be running in 32-bit mode.
-def HasV9 : Predicate<"Subtarget.isV9()">;
+def HasV9 : Predicate<"Subtarget.isV9()">,
+ AssemblerPredicate<"FeatureV9">;
// HasNoV9 - This predicate is true when the target doesn't have V9
// instructions. Use of this is just a hack for the isel not having proper
@@ -37,7 +38,12 @@ def HasV9 : Predicate<"Subtarget.isV9()">;
def HasNoV9 : Predicate<"!Subtarget.isV9()">;
// HasVIS - This is true when the target processor has VIS extensions.
-def HasVIS : Predicate<"Subtarget.isVIS()">;
+def HasVIS : Predicate<"Subtarget.isVIS()">,
+ AssemblerPredicate<"FeatureVIS">;
+def HasVIS2 : Predicate<"Subtarget.isVIS2()">,
+ AssemblerPredicate<"FeatureVIS2">;
+def HasVIS3 : Predicate<"Subtarget.isVIS3()">,
+ AssemblerPredicate<"FeatureVIS3">;
// HasHardQuad - This is true when the target processor supports quad floating
// point instructions.
@@ -76,20 +82,50 @@ def ADDRrr : ComplexPattern<iPTR, 2, "SelectADDRrr", [], []>;
def ADDRri : ComplexPattern<iPTR, 2, "SelectADDRri", [frameindex], []>;
// Address operands
+def SparcMEMrrAsmOperand : AsmOperandClass {
+ let Name = "MEMrr";
+ let ParserMethod = "parseMEMOperand";
+}
+
+def SparcMEMriAsmOperand : AsmOperandClass {
+ let Name = "MEMri";
+ let ParserMethod = "parseMEMOperand";
+}
+
def MEMrr : Operand<iPTR> {
let PrintMethod = "printMemOperand";
let MIOperandInfo = (ops ptr_rc, ptr_rc);
+ let ParserMatchClass = SparcMEMrrAsmOperand;
}
def MEMri : Operand<iPTR> {
let PrintMethod = "printMemOperand";
let MIOperandInfo = (ops ptr_rc, i32imm);
+ let ParserMatchClass = SparcMEMriAsmOperand;
}
def TLSSym : Operand<iPTR>;
// Branch targets have OtherVT type.
-def brtarget : Operand<OtherVT>;
-def calltarget : Operand<i32>;
+def brtarget : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTargetOpValue";
+}
+
+def bprtarget : Operand<OtherVT> {
+ let EncoderMethod = "getBranchPredTargetOpValue";
+}
+
+def bprtarget16 : Operand<OtherVT> {
+ let EncoderMethod = "getBranchOnRegTargetOpValue";
+}
+
+def calltarget : Operand<i32> {
+ let EncoderMethod = "getCallTargetOpValue";
+ let DecoderMethod = "DecodeCall";
+}
+
+def simm13Op : Operand<i32> {
+ let DecoderMethod = "DecodeSIMM13";
+}
// Operand for printing out a condition code.
let PrintMethod = "printCCOperand" in
@@ -163,7 +199,7 @@ def tlscall : SDNode<"SPISD::TLS_CALL", SDT_SPCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
-def getPCX : Operand<i32> {
+def getPCX : Operand<iPTR> {
let PrintMethod = "printGetPCX";
}
@@ -210,26 +246,53 @@ def FCC_O : FCC_VAL<29>; // Ordered
//===----------------------------------------------------------------------===//
/// F3_12 multiclass - Define a normal F3_1/F3_2 pattern in one shot.
-multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode> {
+multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode,
+ RegisterClass RC, ValueType Ty, Operand immOp> {
def rr : F3_1<2, Op3Val,
- (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
- !strconcat(OpcStr, " $b, $c, $dst"),
- [(set i32:$dst, (OpNode i32:$b, i32:$c))]>;
+ (outs RC:$rd), (ins RC:$rs1, RC:$rs2),
+ !strconcat(OpcStr, " $rs1, $rs2, $rd"),
+ [(set Ty:$rd, (OpNode Ty:$rs1, Ty:$rs2))]>;
def ri : F3_2<2, Op3Val,
- (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
- !strconcat(OpcStr, " $b, $c, $dst"),
- [(set i32:$dst, (OpNode i32:$b, (i32 simm13:$c)))]>;
+ (outs RC:$rd), (ins RC:$rs1, immOp:$simm13),
+ !strconcat(OpcStr, " $rs1, $simm13, $rd"),
+ [(set Ty:$rd, (OpNode Ty:$rs1, (Ty simm13:$simm13)))]>;
}
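Concretely, an instantiation such as the ADD one later in this file is effectively the same as writing out the rr/ri pair by hand with the new $rs1/$rs2/$simm13 operand names:

  def ADDrr : F3_1<2, 0b000000,
                   (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
                   "add $rs1, $rs2, $rd",
                   [(set i32:$rd, (add i32:$rs1, i32:$rs2))]>;
  def ADDri : F3_2<2, 0b000000,
                   (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
                   "add $rs1, $simm13, $rd",
                   [(set i32:$rd, (add i32:$rs1, (i32 simm13:$simm13)))]>;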
/// F3_12np multiclass - Define a normal F3_1/F3_2 pattern in one shot, with no
/// pattern.
multiclass F3_12np<string OpcStr, bits<6> Op3Val> {
def rr : F3_1<2, Op3Val,
- (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
- !strconcat(OpcStr, " $b, $c, $dst"), []>;
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+ !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
def ri : F3_2<2, Op3Val,
- (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
- !strconcat(OpcStr, " $b, $c, $dst"), []>;
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+ !strconcat(OpcStr, " $rs1, $simm13, $rd"), []>;
+}
+
+// Load multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot.
+multiclass Load<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
+ RegisterClass RC, ValueType Ty> {
+ def rr : F3_1<3, Op3Val,
+ (outs RC:$dst), (ins MEMrr:$addr),
+ !strconcat(OpcStr, " [$addr], $dst"),
+ [(set Ty:$dst, (OpNode ADDRrr:$addr))]>;
+ def ri : F3_2<3, Op3Val,
+ (outs RC:$dst), (ins MEMri:$addr),
+ !strconcat(OpcStr, " [$addr], $dst"),
+ [(set Ty:$dst, (OpNode ADDRri:$addr))]>;
+}
+
+// Store multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot.
+multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
+ RegisterClass RC, ValueType Ty> {
+ def rr : F3_1<3, Op3Val,
+ (outs), (ins MEMrr:$addr, RC:$rd),
+ !strconcat(OpcStr, " $rd, [$addr]"),
+ [(OpNode Ty:$rd, ADDRrr:$addr)]>;
+ def ri : F3_2<3, Op3Val,
+ (outs), (ins MEMri:$addr, RC:$rd),
+ !strconcat(OpcStr, " $rd, [$addr]"),
+ [(OpNode Ty:$rd, ADDRri:$addr)]>;
}
//===----------------------------------------------------------------------===//
@@ -238,7 +301,10 @@ multiclass F3_12np<string OpcStr, bits<6> Op3Val> {
// Pseudo instructions.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
- : InstSP<outs, ins, asmstr, pattern>;
+ : InstSP<outs, ins, asmstr, pattern> {
+ let isCodeGenOnly = 1;
+ let isPseudo = 1;
+}
// GETPCX for PIC
let Defs = [O7] in {
@@ -265,9 +331,12 @@ let hasSideEffects = 1, mayStore = 1 in {
[(flushw)]>;
}
+let isBarrier = 1, isTerminator = 1, rd = 0b1000, rs1 = 0, simm13 = 5 in
+ def TA5 : F3_2<0b10, 0b111010, (outs), (ins), "ta 5", [(trap)]>;
+
let rd = 0 in
- def UNIMP : F2_1<0b000, (outs), (ins i32imm:$val),
- "unimp $val", []>;
+ def UNIMP : F2_1<0b000, (outs), (ins i32imm:$imm22),
+ "unimp $imm22", []>;
// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
// instruction selection into a branch sequence. This has to handle all
@@ -294,7 +363,7 @@ let Uses = [ICC], usesCustomInserter = 1 in {
[(set f128:$dst, (SPselecticc f128:$T, f128:$F, imm:$Cond))]>;
}
-let usesCustomInserter = 1, Uses = [FCC] in {
+let usesCustomInserter = 1, Uses = [FCC0] in {
def SELECT_CC_Int_FCC
: Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
@@ -315,10 +384,19 @@ let usesCustomInserter = 1, Uses = [FCC] in {
[(set f128:$dst, (SPselectfcc f128:$T, f128:$F, imm:$Cond))]>;
}
+// JMPL Instruction.
+let isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
+ DecoderMethod = "DecodeJMPL" in {
+ def JMPLrr: F3_1<2, 0b111000, (outs IntRegs:$dst), (ins MEMrr:$addr),
+ "jmpl $addr, $dst", []>;
+ def JMPLri: F3_2<2, 0b111000, (outs IntRegs:$dst), (ins MEMri:$addr),
+ "jmpl $addr, $dst", []>;
+}
// Section A.3 - Synthetic Instructions, p. 85
// special cases of JMPL:
-let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1 in {
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
+ isCodeGenOnly = 1 in {
let rd = 0, rs1 = 15 in
def RETL: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
"jmp %o7+$val", [(retflag simm13:$val)]>;
@@ -328,129 +406,47 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1 in {
"jmp %i7+$val", []>;
}
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1,
+ isBarrier = 1, rd = 0, DecoderMethod = "DecodeReturn" in {
+ def RETTrr : F3_1<2, 0b111001, (outs), (ins MEMrr:$addr),
+ "rett $addr", []>;
+ def RETTri : F3_2<2, 0b111001, (outs), (ins MEMri:$addr),
+ "rett $addr", []>;
+}
+
// Section B.1 - Load Integer Instructions, p. 90
-def LDSBrr : F3_1<3, 0b001001,
- (outs IntRegs:$dst), (ins MEMrr:$addr),
- "ldsb [$addr], $dst",
- [(set i32:$dst, (sextloadi8 ADDRrr:$addr))]>;
-def LDSBri : F3_2<3, 0b001001,
- (outs IntRegs:$dst), (ins MEMri:$addr),
- "ldsb [$addr], $dst",
- [(set i32:$dst, (sextloadi8 ADDRri:$addr))]>;
-def LDSHrr : F3_1<3, 0b001010,
- (outs IntRegs:$dst), (ins MEMrr:$addr),
- "ldsh [$addr], $dst",
- [(set i32:$dst, (sextloadi16 ADDRrr:$addr))]>;
-def LDSHri : F3_2<3, 0b001010,
- (outs IntRegs:$dst), (ins MEMri:$addr),
- "ldsh [$addr], $dst",
- [(set i32:$dst, (sextloadi16 ADDRri:$addr))]>;
-def LDUBrr : F3_1<3, 0b000001,
- (outs IntRegs:$dst), (ins MEMrr:$addr),
- "ldub [$addr], $dst",
- [(set i32:$dst, (zextloadi8 ADDRrr:$addr))]>;
-def LDUBri : F3_2<3, 0b000001,
- (outs IntRegs:$dst), (ins MEMri:$addr),
- "ldub [$addr], $dst",
- [(set i32:$dst, (zextloadi8 ADDRri:$addr))]>;
-def LDUHrr : F3_1<3, 0b000010,
- (outs IntRegs:$dst), (ins MEMrr:$addr),
- "lduh [$addr], $dst",
- [(set i32:$dst, (zextloadi16 ADDRrr:$addr))]>;
-def LDUHri : F3_2<3, 0b000010,
- (outs IntRegs:$dst), (ins MEMri:$addr),
- "lduh [$addr], $dst",
- [(set i32:$dst, (zextloadi16 ADDRri:$addr))]>;
-def LDrr : F3_1<3, 0b000000,
- (outs IntRegs:$dst), (ins MEMrr:$addr),
- "ld [$addr], $dst",
- [(set i32:$dst, (load ADDRrr:$addr))]>;
-def LDri : F3_2<3, 0b000000,
- (outs IntRegs:$dst), (ins MEMri:$addr),
- "ld [$addr], $dst",
- [(set i32:$dst, (load ADDRri:$addr))]>;
+let DecoderMethod = "DecodeLoadInt" in {
+ defm LDSB : Load<"ldsb", 0b001001, sextloadi8, IntRegs, i32>;
+ defm LDSH : Load<"ldsh", 0b001010, sextloadi16, IntRegs, i32>;
+ defm LDUB : Load<"ldub", 0b000001, zextloadi8, IntRegs, i32>;
+ defm LDUH : Load<"lduh", 0b000010, zextloadi16, IntRegs, i32>;
+ defm LD : Load<"ld", 0b000000, load, IntRegs, i32>;
+}
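Each defm here expands to the same rr/ri pair that the deleted long-hand definitions spelled out, now with the DecodeLoadInt decoder attached by the surrounding let; e.g. the LD instantiation is equivalent to:

  def LDrr : F3_1<3, 0b000000, (outs IntRegs:$dst), (ins MEMrr:$addr),
                  "ld [$addr], $dst",
                  [(set i32:$dst, (load ADDRrr:$addr))]>;
  def LDri : F3_2<3, 0b000000, (outs IntRegs:$dst), (ins MEMri:$addr),
                  "ld [$addr], $dst",
                  [(set i32:$dst, (load ADDRri:$addr))]>;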
// Section B.2 - Load Floating-point Instructions, p. 92
-def LDFrr : F3_1<3, 0b100000,
- (outs FPRegs:$dst), (ins MEMrr:$addr),
- "ld [$addr], $dst",
- [(set f32:$dst, (load ADDRrr:$addr))]>;
-def LDFri : F3_2<3, 0b100000,
- (outs FPRegs:$dst), (ins MEMri:$addr),
- "ld [$addr], $dst",
- [(set f32:$dst, (load ADDRri:$addr))]>;
-def LDDFrr : F3_1<3, 0b100011,
- (outs DFPRegs:$dst), (ins MEMrr:$addr),
- "ldd [$addr], $dst",
- [(set f64:$dst, (load ADDRrr:$addr))]>;
-def LDDFri : F3_2<3, 0b100011,
- (outs DFPRegs:$dst), (ins MEMri:$addr),
- "ldd [$addr], $dst",
- [(set f64:$dst, (load ADDRri:$addr))]>;
-def LDQFrr : F3_1<3, 0b100010,
- (outs QFPRegs:$dst), (ins MEMrr:$addr),
- "ldq [$addr], $dst",
- [(set f128:$dst, (load ADDRrr:$addr))]>,
- Requires<[HasV9, HasHardQuad]>;
-def LDQFri : F3_2<3, 0b100010,
- (outs QFPRegs:$dst), (ins MEMri:$addr),
- "ldq [$addr], $dst",
- [(set f128:$dst, (load ADDRri:$addr))]>,
- Requires<[HasV9, HasHardQuad]>;
+let DecoderMethod = "DecodeLoadFP" in
+ defm LDF : Load<"ld", 0b100000, load, FPRegs, f32>;
+let DecoderMethod = "DecodeLoadDFP" in
+ defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64>;
+let DecoderMethod = "DecodeLoadQFP" in
+ defm LDQF : Load<"ldq", 0b100010, load, QFPRegs, f128>,
+ Requires<[HasV9, HasHardQuad]>;
// Section B.4 - Store Integer Instructions, p. 95
-def STBrr : F3_1<3, 0b000101,
- (outs), (ins MEMrr:$addr, IntRegs:$rd),
- "stb $rd, [$addr]",
- [(truncstorei8 i32:$rd, ADDRrr:$addr)]>;
-def STBri : F3_2<3, 0b000101,
- (outs), (ins MEMri:$addr, IntRegs:$rd),
- "stb $rd, [$addr]",
- [(truncstorei8 i32:$rd, ADDRri:$addr)]>;
-def STHrr : F3_1<3, 0b000110,
- (outs), (ins MEMrr:$addr, IntRegs:$rd),
- "sth $rd, [$addr]",
- [(truncstorei16 i32:$rd, ADDRrr:$addr)]>;
-def STHri : F3_2<3, 0b000110,
- (outs), (ins MEMri:$addr, IntRegs:$rd),
- "sth $rd, [$addr]",
- [(truncstorei16 i32:$rd, ADDRri:$addr)]>;
-def STrr : F3_1<3, 0b000100,
- (outs), (ins MEMrr:$addr, IntRegs:$rd),
- "st $rd, [$addr]",
- [(store i32:$rd, ADDRrr:$addr)]>;
-def STri : F3_2<3, 0b000100,
- (outs), (ins MEMri:$addr, IntRegs:$rd),
- "st $rd, [$addr]",
- [(store i32:$rd, ADDRri:$addr)]>;
+let DecoderMethod = "DecodeStoreInt" in {
+ defm STB : Store<"stb", 0b000101, truncstorei8, IntRegs, i32>;
+ defm STH : Store<"sth", 0b000110, truncstorei16, IntRegs, i32>;
+ defm ST : Store<"st", 0b000100, store, IntRegs, i32>;
+}
// Section B.5 - Store Floating-point Instructions, p. 97
-def STFrr : F3_1<3, 0b100100,
- (outs), (ins MEMrr:$addr, FPRegs:$rd),
- "st $rd, [$addr]",
- [(store f32:$rd, ADDRrr:$addr)]>;
-def STFri : F3_2<3, 0b100100,
- (outs), (ins MEMri:$addr, FPRegs:$rd),
- "st $rd, [$addr]",
- [(store f32:$rd, ADDRri:$addr)]>;
-def STDFrr : F3_1<3, 0b100111,
- (outs), (ins MEMrr:$addr, DFPRegs:$rd),
- "std $rd, [$addr]",
- [(store f64:$rd, ADDRrr:$addr)]>;
-def STDFri : F3_2<3, 0b100111,
- (outs), (ins MEMri:$addr, DFPRegs:$rd),
- "std $rd, [$addr]",
- [(store f64:$rd, ADDRri:$addr)]>;
-def STQFrr : F3_1<3, 0b100110,
- (outs), (ins MEMrr:$addr, QFPRegs:$rd),
- "stq $rd, [$addr]",
- [(store f128:$rd, ADDRrr:$addr)]>,
- Requires<[HasV9, HasHardQuad]>;
-def STQFri : F3_2<3, 0b100110,
- (outs), (ins MEMri:$addr, QFPRegs:$rd),
- "stq $rd, [$addr]",
- [(store f128:$rd, ADDRri:$addr)]>,
- Requires<[HasV9, HasHardQuad]>;
+let DecoderMethod = "DecodeStoreFP" in
+ defm STF : Store<"st", 0b100100, store, FPRegs, f32>;
+let DecoderMethod = "DecodeStoreDFP" in
+ defm STDF : Store<"std", 0b100111, store, DFPRegs, f64>;
+let DecoderMethod = "DecodeStoreQFP" in
+ defm STQF : Store<"stq", 0b100110, store, QFPRegs, f128>,
+ Requires<[HasV9, HasHardQuad]>;
// Section B.9 - SETHI Instruction, p. 104
def SETHIi: F2_1<0b100,
@@ -464,85 +460,99 @@ let rd = 0, imm22 = 0 in
def NOP : F2_1<0b100, (outs), (ins), "nop", []>;
// Section B.11 - Logical Instructions, p. 106
-defm AND : F3_12<"and", 0b000001, and>;
+defm AND : F3_12<"and", 0b000001, and, IntRegs, i32, simm13Op>;
def ANDNrr : F3_1<2, 0b000101,
- (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
- "andn $b, $c, $dst",
- [(set i32:$dst, (and i32:$b, (not i32:$c)))]>;
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "andn $rs1, $rs2, $rd",
+ [(set i32:$rd, (and i32:$rs1, (not i32:$rs2)))]>;
def ANDNri : F3_2<2, 0b000101,
- (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
- "andn $b, $c, $dst", []>;
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "andn $rs1, $simm13, $rd", []>;
-defm OR : F3_12<"or", 0b000010, or>;
+defm OR : F3_12<"or", 0b000010, or, IntRegs, i32, simm13Op>;
def ORNrr : F3_1<2, 0b000110,
- (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
- "orn $b, $c, $dst",
- [(set i32:$dst, (or i32:$b, (not i32:$c)))]>;
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "orn $rs1, $rs2, $rd",
+ [(set i32:$rd, (or i32:$rs1, (not i32:$rs2)))]>;
def ORNri : F3_2<2, 0b000110,
- (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
- "orn $b, $c, $dst", []>;
-defm XOR : F3_12<"xor", 0b000011, xor>;
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "orn $rs1, $simm13, $rd", []>;
+defm XOR : F3_12<"xor", 0b000011, xor, IntRegs, i32, simm13Op>;
def XNORrr : F3_1<2, 0b000111,
- (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
- "xnor $b, $c, $dst",
- [(set i32:$dst, (not (xor i32:$b, i32:$c)))]>;
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "xnor $rs1, $rs2, $rd",
+ [(set i32:$rd, (not (xor i32:$rs1, i32:$rs2)))]>;
def XNORri : F3_2<2, 0b000111,
- (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
- "xnor $b, $c, $dst", []>;
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "xnor $rs1, $simm13, $rd", []>;
+
+let Defs = [ICC] in {
+ defm ANDCC : F3_12np<"andcc", 0b010001>;
+ defm ANDNCC : F3_12np<"andncc", 0b010101>;
+ defm ORCC : F3_12np<"orcc", 0b010010>;
+ defm ORNCC : F3_12np<"orncc", 0b010110>;
+ defm XORCC : F3_12np<"xorcc", 0b010011>;
+ defm XNORCC : F3_12np<"xnorcc", 0b010111>;
+}
// Section B.12 - Shift Instructions, p. 107
-defm SLL : F3_12<"sll", 0b100101, shl>;
-defm SRL : F3_12<"srl", 0b100110, srl>;
-defm SRA : F3_12<"sra", 0b100111, sra>;
+defm SLL : F3_12<"sll", 0b100101, shl, IntRegs, i32, simm13Op>;
+defm SRL : F3_12<"srl", 0b100110, srl, IntRegs, i32, simm13Op>;
+defm SRA : F3_12<"sra", 0b100111, sra, IntRegs, i32, simm13Op>;
// Section B.13 - Add Instructions, p. 108
-defm ADD : F3_12<"add", 0b000000, add>;
+defm ADD : F3_12<"add", 0b000000, add, IntRegs, i32, simm13Op>;
// "LEA" forms of add (patterns to make tblgen happy)
-let Predicates = [Is32Bit] in
+let Predicates = [Is32Bit], isCodeGenOnly = 1 in
def LEA_ADDri : F3_2<2, 0b000000,
(outs IntRegs:$dst), (ins MEMri:$addr),
"add ${addr:arith}, $dst",
[(set iPTR:$dst, ADDRri:$addr)]>;
let Defs = [ICC] in
- defm ADDCC : F3_12<"addcc", 0b010000, addc>;
+ defm ADDCC : F3_12<"addcc", 0b010000, addc, IntRegs, i32, simm13Op>;
+
+let Uses = [ICC] in
+ defm ADDC : F3_12np<"addx", 0b001000>;
let Uses = [ICC], Defs = [ICC] in
- defm ADDX : F3_12<"addxcc", 0b011000, adde>;
+ defm ADDE : F3_12<"addxcc", 0b011000, adde, IntRegs, i32, simm13Op>;
// Section B.15 - Subtract Instructions, p. 110
-defm SUB : F3_12 <"sub" , 0b000100, sub>;
+defm SUB : F3_12 <"sub" , 0b000100, sub, IntRegs, i32, simm13Op>;
let Uses = [ICC], Defs = [ICC] in
- defm SUBX : F3_12 <"subxcc" , 0b011100, sube>;
+ defm SUBE : F3_12 <"subxcc" , 0b011100, sube, IntRegs, i32, simm13Op>;
let Defs = [ICC] in
- defm SUBCC : F3_12 <"subcc", 0b010100, subc>;
+ defm SUBCC : F3_12 <"subcc", 0b010100, subc, IntRegs, i32, simm13Op>;
+
+let Uses = [ICC] in
+ defm SUBC : F3_12np <"subx", 0b001100>;
let Defs = [ICC], rd = 0 in {
def CMPrr : F3_1<2, 0b010100,
- (outs), (ins IntRegs:$b, IntRegs:$c),
- "cmp $b, $c",
- [(SPcmpicc i32:$b, i32:$c)]>;
+ (outs), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "cmp $rs1, $rs2",
+ [(SPcmpicc i32:$rs1, i32:$rs2)]>;
def CMPri : F3_2<2, 0b010100,
- (outs), (ins IntRegs:$b, i32imm:$c),
- "cmp $b, $c",
- [(SPcmpicc i32:$b, (i32 simm13:$c))]>;
+ (outs), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "cmp $rs1, $simm13",
+ [(SPcmpicc i32:$rs1, (i32 simm13:$simm13))]>;
}
-let Uses = [ICC], Defs = [ICC] in
- def SUBXCCrr: F3_1<2, 0b011100,
- (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
- "subxcc $b, $c, $dst", []>;
-
-
// Section B.18 - Multiply Instructions, p. 113
let Defs = [Y] in {
defm UMUL : F3_12np<"umul", 0b001010>;
- defm SMUL : F3_12 <"smul", 0b001011, mul>;
+ defm SMUL : F3_12 <"smul", 0b001011, mul, IntRegs, i32, simm13Op>;
+}
+
+let Defs = [Y, ICC] in {
+ defm UMULCC : F3_12np<"umulcc", 0b011010>;
+ defm SMULCC : F3_12np<"smulcc", 0b011011>;
}
// Section B.19 - Divide Instructions, p. 115
@@ -551,6 +561,11 @@ let Defs = [Y] in {
defm SDIV : F3_12np<"sdiv", 0b001111>;
}
+let Defs = [Y, ICC] in {
+ defm UDIVCC : F3_12np<"udivcc", 0b011110>;
+ defm SDIVCC : F3_12np<"sdivcc", 0b011111>;
+}
+
// Section B.20 - SAVE and RESTORE, p. 117
defm SAVE : F3_12np<"save" , 0b111100>;
defm RESTORE : F3_12np<"restore", 0b111101>;
@@ -559,7 +574,7 @@ defm RESTORE : F3_12np<"restore", 0b111101>;
// unconditional branch class.
class BranchAlways<dag ins, string asmstr, list<dag> pattern>
- : F2_2<0b010, (outs), ins, asmstr, pattern> {
+ : F2_2<0b010, 0, (outs), ins, asmstr, pattern> {
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 1;
@@ -569,18 +584,39 @@ class BranchAlways<dag ins, string asmstr, list<dag> pattern>
let cond = 8 in
def BA : BranchAlways<(ins brtarget:$imm22), "ba $imm22", [(br bb:$imm22)]>;
+
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
+
// conditional branch class:
class BranchSP<dag ins, string asmstr, list<dag> pattern>
- : F2_2<0b010, (outs), ins, asmstr, pattern> {
- let isBranch = 1;
- let isTerminator = 1;
- let hasDelaySlot = 1;
+ : F2_2<0b010, 0, (outs), ins, asmstr, pattern>;
+
+// conditional branch with annul class:
+class BranchSPA<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b010, 1, (outs), ins, asmstr, pattern>;
+
+// Conditional branch class on %icc|%xcc with predication:
+multiclass IPredBranch<string regstr, list<dag> CCPattern> {
+ def CC : F2_3<0b001, 0, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+ !strconcat("b$cond ", !strconcat(regstr, ", $imm19")),
+ CCPattern>;
+ def CCA : F2_3<0b001, 1, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+ !strconcat("b$cond,a ", !strconcat(regstr, ", $imm19")),
+ []>;
+ def CCNT : F2_3<0b001, 0, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+ !strconcat("b$cond,pn ", !strconcat(regstr, ", $imm19")),
+ []>;
+ def CCANT : F2_3<0b001, 1, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+ !strconcat("b$cond,a,pn ", !strconcat(regstr, ", $imm19")),
+ []>;
}
+} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
+
+
// Indirect branch instructions.
-let isTerminator = 1, isBarrier = 1,
- hasDelaySlot = 1, isBranch =1,
- isIndirectBranch = 1, rd = 0 in {
+let isTerminator = 1, isBarrier = 1, hasDelaySlot = 1, isBranch =1,
+ isIndirectBranch = 1, rd = 0, isCodeGenOnly = 1 in {
def BINDrr : F3_1<2, 0b111000,
(outs), (ins MEMrr:$ptr),
"jmp $ptr",
@@ -591,47 +627,80 @@ let isTerminator = 1, isBarrier = 1,
[(brind ADDRri:$ptr)]>;
}
-let Uses = [ICC] in
+let Uses = [ICC] in {
def BCOND : BranchSP<(ins brtarget:$imm22, CCOp:$cond),
"b$cond $imm22",
[(SPbricc bb:$imm22, imm:$cond)]>;
+ def BCONDA : BranchSPA<(ins brtarget:$imm22, CCOp:$cond),
+ "b$cond,a $imm22", []>;
+
+ let Predicates = [HasV9], cc = 0b00 in
+ defm BPI : IPredBranch<"%icc", []>;
+}
// Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 121
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
+
// floating-point conditional branch class:
class FPBranchSP<dag ins, string asmstr, list<dag> pattern>
- : F2_2<0b110, (outs), ins, asmstr, pattern> {
- let isBranch = 1;
- let isTerminator = 1;
- let hasDelaySlot = 1;
+ : F2_2<0b110, 0, (outs), ins, asmstr, pattern>;
+
+// floating-point conditional branch with annul class:
+class FPBranchSPA<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b110, 1, (outs), ins, asmstr, pattern>;
+
+// Conditional branch class on %fcc0-%fcc3 with predication:
+multiclass FPredBranch {
+ def CC : F2_3<0b101, 0, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+ FCCRegs:$cc),
+ "fb$cond $cc, $imm19", []>;
+ def CCA : F2_3<0b101, 1, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+ FCCRegs:$cc),
+ "fb$cond,a $cc, $imm19", []>;
+ def CCNT : F2_3<0b101, 0, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+ FCCRegs:$cc),
+ "fb$cond,pn $cc, $imm19", []>;
+ def CCANT : F2_3<0b101, 1, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+ FCCRegs:$cc),
+ "fb$cond,a,pn $cc, $imm19", []>;
}
+} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
-let Uses = [FCC] in
+let Uses = [FCC0] in {
def FBCOND : FPBranchSP<(ins brtarget:$imm22, CCOp:$cond),
"fb$cond $imm22",
[(SPbrfcc bb:$imm22, imm:$cond)]>;
+ def FBCONDA : FPBranchSPA<(ins brtarget:$imm22, CCOp:$cond),
+ "fb$cond,a $imm22", []>;
+}
+
+let Predicates = [HasV9] in
+ defm BPF : FPredBranch;
// Section B.24 - Call and Link Instruction, p. 125
// This is the only Format 1 instruction
let Uses = [O6],
hasDelaySlot = 1, isCall = 1 in {
- def CALL : InstSP<(outs), (ins calltarget:$dst, variable_ops),
- "call $dst", []> {
+ def CALL : InstSP<(outs), (ins calltarget:$disp, variable_ops),
+ "call $disp", []> {
bits<30> disp;
let op = 1;
let Inst{29-0} = disp;
}
- // indirect calls
- def JMPLrr : F3_1<2, 0b111000,
- (outs), (ins MEMrr:$ptr, variable_ops),
- "call $ptr",
- [(call ADDRrr:$ptr)]> { let rd = 15; }
- def JMPLri : F3_2<2, 0b111000,
- (outs), (ins MEMri:$ptr, variable_ops),
- "call $ptr",
- [(call ADDRri:$ptr)]> { let rd = 15; }
+ // indirect calls: special cases of JMPL.
+ let isCodeGenOnly = 1, rd = 15 in {
+ def CALLrr : F3_1<2, 0b111000,
+ (outs), (ins MEMrr:$ptr, variable_ops),
+ "call $ptr",
+ [(call ADDRrr:$ptr)]>;
+ def CALLri : F3_2<2, 0b111000,
+ (outs), (ins MEMri:$ptr, variable_ops),
+ "call $ptr",
+ [(call ADDRri:$ptr)]>;
+ }
}
// Section B.28 - Read State Register Instructions
@@ -643,172 +712,172 @@ let Uses = [Y], rs1 = 0, rs2 = 0 in
// Section B.29 - Write State Register Instructions
let Defs = [Y], rd = 0 in {
def WRYrr : F3_1<2, 0b110000,
- (outs), (ins IntRegs:$b, IntRegs:$c),
- "wr $b, $c, %y", []>;
+ (outs), (ins IntRegs:$rs1, IntRegs:$rs2),
+ "wr $rs1, $rs2, %y", []>;
def WRYri : F3_2<2, 0b110000,
- (outs), (ins IntRegs:$b, i32imm:$c),
- "wr $b, $c, %y", []>;
+ (outs), (ins IntRegs:$rs1, simm13Op:$simm13),
+ "wr $rs1, $simm13, %y", []>;
}
// Convert Integer to Floating-point Instructions, p. 141
def FITOS : F3_3u<2, 0b110100, 0b011000100,
- (outs FPRegs:$dst), (ins FPRegs:$src),
- "fitos $src, $dst",
- [(set FPRegs:$dst, (SPitof FPRegs:$src))]>;
+ (outs FPRegs:$rd), (ins FPRegs:$rs2),
+ "fitos $rs2, $rd",
+ [(set FPRegs:$rd, (SPitof FPRegs:$rs2))]>;
def FITOD : F3_3u<2, 0b110100, 0b011001000,
- (outs DFPRegs:$dst), (ins FPRegs:$src),
- "fitod $src, $dst",
- [(set DFPRegs:$dst, (SPitof FPRegs:$src))]>;
+ (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+ "fitod $rs2, $rd",
+ [(set DFPRegs:$rd, (SPitof FPRegs:$rs2))]>;
def FITOQ : F3_3u<2, 0b110100, 0b011001100,
- (outs QFPRegs:$dst), (ins FPRegs:$src),
- "fitoq $src, $dst",
- [(set QFPRegs:$dst, (SPitof FPRegs:$src))]>,
+ (outs QFPRegs:$rd), (ins FPRegs:$rs2),
+ "fitoq $rs2, $rd",
+ [(set QFPRegs:$rd, (SPitof FPRegs:$rs2))]>,
Requires<[HasHardQuad]>;
// Convert Floating-point to Integer Instructions, p. 142
def FSTOI : F3_3u<2, 0b110100, 0b011010001,
- (outs FPRegs:$dst), (ins FPRegs:$src),
- "fstoi $src, $dst",
- [(set FPRegs:$dst, (SPftoi FPRegs:$src))]>;
+ (outs FPRegs:$rd), (ins FPRegs:$rs2),
+ "fstoi $rs2, $rd",
+ [(set FPRegs:$rd, (SPftoi FPRegs:$rs2))]>;
def FDTOI : F3_3u<2, 0b110100, 0b011010010,
- (outs FPRegs:$dst), (ins DFPRegs:$src),
- "fdtoi $src, $dst",
- [(set FPRegs:$dst, (SPftoi DFPRegs:$src))]>;
+ (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+ "fdtoi $rs2, $rd",
+ [(set FPRegs:$rd, (SPftoi DFPRegs:$rs2))]>;
def FQTOI : F3_3u<2, 0b110100, 0b011010011,
- (outs FPRegs:$dst), (ins QFPRegs:$src),
- "fqtoi $src, $dst",
- [(set FPRegs:$dst, (SPftoi QFPRegs:$src))]>,
+ (outs FPRegs:$rd), (ins QFPRegs:$rs2),
+ "fqtoi $rs2, $rd",
+ [(set FPRegs:$rd, (SPftoi QFPRegs:$rs2))]>,
Requires<[HasHardQuad]>;
// Convert between Floating-point Formats Instructions, p. 143
def FSTOD : F3_3u<2, 0b110100, 0b011001001,
- (outs DFPRegs:$dst), (ins FPRegs:$src),
- "fstod $src, $dst",
- [(set f64:$dst, (fextend f32:$src))]>;
+ (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+ "fstod $rs2, $rd",
+ [(set f64:$rd, (fextend f32:$rs2))]>;
def FSTOQ : F3_3u<2, 0b110100, 0b011001101,
- (outs QFPRegs:$dst), (ins FPRegs:$src),
- "fstoq $src, $dst",
- [(set f128:$dst, (fextend f32:$src))]>,
+ (outs QFPRegs:$rd), (ins FPRegs:$rs2),
+ "fstoq $rs2, $rd",
+ [(set f128:$rd, (fextend f32:$rs2))]>,
Requires<[HasHardQuad]>;
def FDTOS : F3_3u<2, 0b110100, 0b011000110,
- (outs FPRegs:$dst), (ins DFPRegs:$src),
- "fdtos $src, $dst",
- [(set f32:$dst, (fround f64:$src))]>;
-def FDTOQ : F3_3u<2, 0b110100, 0b01101110,
- (outs QFPRegs:$dst), (ins DFPRegs:$src),
- "fdtoq $src, $dst",
- [(set f128:$dst, (fextend f64:$src))]>,
+ (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+ "fdtos $rs2, $rd",
+ [(set f32:$rd, (fround f64:$rs2))]>;
+def FDTOQ : F3_3u<2, 0b110100, 0b011001110,
+ (outs QFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fdtoq $rs2, $rd",
+ [(set f128:$rd, (fextend f64:$rs2))]>,
Requires<[HasHardQuad]>;
def FQTOS : F3_3u<2, 0b110100, 0b011000111,
- (outs FPRegs:$dst), (ins QFPRegs:$src),
- "fqtos $src, $dst",
- [(set f32:$dst, (fround f128:$src))]>,
+ (outs FPRegs:$rd), (ins QFPRegs:$rs2),
+ "fqtos $rs2, $rd",
+ [(set f32:$rd, (fround f128:$rs2))]>,
Requires<[HasHardQuad]>;
def FQTOD : F3_3u<2, 0b110100, 0b011001011,
- (outs DFPRegs:$dst), (ins QFPRegs:$src),
- "fqtod $src, $dst",
- [(set f64:$dst, (fround f128:$src))]>,
+ (outs DFPRegs:$rd), (ins QFPRegs:$rs2),
+ "fqtod $rs2, $rd",
+ [(set f64:$rd, (fround f128:$rs2))]>,
Requires<[HasHardQuad]>;
// Floating-point Move Instructions, p. 144
def FMOVS : F3_3u<2, 0b110100, 0b000000001,
- (outs FPRegs:$dst), (ins FPRegs:$src),
- "fmovs $src, $dst", []>;
+ (outs FPRegs:$rd), (ins FPRegs:$rs2),
+ "fmovs $rs2, $rd", []>;
def FNEGS : F3_3u<2, 0b110100, 0b000000101,
- (outs FPRegs:$dst), (ins FPRegs:$src),
- "fnegs $src, $dst",
- [(set f32:$dst, (fneg f32:$src))]>;
+ (outs FPRegs:$rd), (ins FPRegs:$rs2),
+ "fnegs $rs2, $rd",
+ [(set f32:$rd, (fneg f32:$rs2))]>;
def FABSS : F3_3u<2, 0b110100, 0b000001001,
- (outs FPRegs:$dst), (ins FPRegs:$src),
- "fabss $src, $dst",
- [(set f32:$dst, (fabs f32:$src))]>;
+ (outs FPRegs:$rd), (ins FPRegs:$rs2),
+ "fabss $rs2, $rd",
+ [(set f32:$rd, (fabs f32:$rs2))]>;
// Floating-point Square Root Instructions, p.145
def FSQRTS : F3_3u<2, 0b110100, 0b000101001,
- (outs FPRegs:$dst), (ins FPRegs:$src),
- "fsqrts $src, $dst",
- [(set f32:$dst, (fsqrt f32:$src))]>;
+ (outs FPRegs:$rd), (ins FPRegs:$rs2),
+ "fsqrts $rs2, $rd",
+ [(set f32:$rd, (fsqrt f32:$rs2))]>;
def FSQRTD : F3_3u<2, 0b110100, 0b000101010,
- (outs DFPRegs:$dst), (ins DFPRegs:$src),
- "fsqrtd $src, $dst",
- [(set f64:$dst, (fsqrt f64:$src))]>;
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fsqrtd $rs2, $rd",
+ [(set f64:$rd, (fsqrt f64:$rs2))]>;
def FSQRTQ : F3_3u<2, 0b110100, 0b000101011,
- (outs QFPRegs:$dst), (ins QFPRegs:$src),
- "fsqrtq $src, $dst",
- [(set f128:$dst, (fsqrt f128:$src))]>,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+ "fsqrtq $rs2, $rd",
+ [(set f128:$rd, (fsqrt f128:$rs2))]>,
Requires<[HasHardQuad]>;
// Floating-point Add and Subtract Instructions, p. 146
def FADDS : F3_3<2, 0b110100, 0b001000001,
- (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
- "fadds $src1, $src2, $dst",
- [(set f32:$dst, (fadd f32:$src1, f32:$src2))]>;
+ (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fadds $rs1, $rs2, $rd",
+ [(set f32:$rd, (fadd f32:$rs1, f32:$rs2))]>;
def FADDD : F3_3<2, 0b110100, 0b001000010,
- (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
- "faddd $src1, $src2, $dst",
- [(set f64:$dst, (fadd f64:$src1, f64:$src2))]>;
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "faddd $rs1, $rs2, $rd",
+ [(set f64:$rd, (fadd f64:$rs1, f64:$rs2))]>;
def FADDQ : F3_3<2, 0b110100, 0b001000011,
- (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
- "faddq $src1, $src2, $dst",
- [(set f128:$dst, (fadd f128:$src1, f128:$src2))]>,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "faddq $rs1, $rs2, $rd",
+ [(set f128:$rd, (fadd f128:$rs1, f128:$rs2))]>,
Requires<[HasHardQuad]>;
def FSUBS : F3_3<2, 0b110100, 0b001000101,
- (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
- "fsubs $src1, $src2, $dst",
- [(set f32:$dst, (fsub f32:$src1, f32:$src2))]>;
+ (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fsubs $rs1, $rs2, $rd",
+ [(set f32:$rd, (fsub f32:$rs1, f32:$rs2))]>;
def FSUBD : F3_3<2, 0b110100, 0b001000110,
- (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
- "fsubd $src1, $src2, $dst",
- [(set f64:$dst, (fsub f64:$src1, f64:$src2))]>;
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fsubd $rs1, $rs2, $rd",
+ [(set f64:$rd, (fsub f64:$rs1, f64:$rs2))]>;
def FSUBQ : F3_3<2, 0b110100, 0b001000111,
- (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
- "fsubq $src1, $src2, $dst",
- [(set f128:$dst, (fsub f128:$src1, f128:$src2))]>,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "fsubq $rs1, $rs2, $rd",
+ [(set f128:$rd, (fsub f128:$rs1, f128:$rs2))]>,
Requires<[HasHardQuad]>;
// Floating-point Multiply and Divide Instructions, p. 147
def FMULS : F3_3<2, 0b110100, 0b001001001,
- (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
- "fmuls $src1, $src2, $dst",
- [(set f32:$dst, (fmul f32:$src1, f32:$src2))]>;
+ (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fmuls $rs1, $rs2, $rd",
+ [(set f32:$rd, (fmul f32:$rs1, f32:$rs2))]>;
def FMULD : F3_3<2, 0b110100, 0b001001010,
- (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
- "fmuld $src1, $src2, $dst",
- [(set f64:$dst, (fmul f64:$src1, f64:$src2))]>;
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fmuld $rs1, $rs2, $rd",
+ [(set f64:$rd, (fmul f64:$rs1, f64:$rs2))]>;
def FMULQ : F3_3<2, 0b110100, 0b001001011,
- (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
- "fmulq $src1, $src2, $dst",
- [(set f128:$dst, (fmul f128:$src1, f128:$src2))]>,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "fmulq $rs1, $rs2, $rd",
+ [(set f128:$rd, (fmul f128:$rs1, f128:$rs2))]>,
Requires<[HasHardQuad]>;
def FSMULD : F3_3<2, 0b110100, 0b001101001,
- (outs DFPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
- "fsmuld $src1, $src2, $dst",
- [(set f64:$dst, (fmul (fextend f32:$src1),
- (fextend f32:$src2)))]>;
+ (outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fsmuld $rs1, $rs2, $rd",
+ [(set f64:$rd, (fmul (fextend f32:$rs1),
+ (fextend f32:$rs2)))]>;
def FDMULQ : F3_3<2, 0b110100, 0b001101110,
- (outs QFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
- "fdmulq $src1, $src2, $dst",
- [(set f128:$dst, (fmul (fextend f64:$src1),
- (fextend f64:$src2)))]>,
+ (outs QFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fdmulq $rs1, $rs2, $rd",
+ [(set f128:$rd, (fmul (fextend f64:$rs1),
+ (fextend f64:$rs2)))]>,
Requires<[HasHardQuad]>;
def FDIVS : F3_3<2, 0b110100, 0b001001101,
- (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
- "fdivs $src1, $src2, $dst",
- [(set f32:$dst, (fdiv f32:$src1, f32:$src2))]>;
+ (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fdivs $rs1, $rs2, $rd",
+ [(set f32:$rd, (fdiv f32:$rs1, f32:$rs2))]>;
def FDIVD : F3_3<2, 0b110100, 0b001001110,
- (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
- "fdivd $src1, $src2, $dst",
- [(set f64:$dst, (fdiv f64:$src1, f64:$src2))]>;
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fdivd $rs1, $rs2, $rd",
+ [(set f64:$rd, (fdiv f64:$rs1, f64:$rs2))]>;
def FDIVQ : F3_3<2, 0b110100, 0b001001111,
- (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
- "fdivq $src1, $src2, $dst",
- [(set f128:$dst, (fdiv f128:$src1, f128:$src2))]>,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "fdivq $rs1, $rs2, $rd",
+ [(set f128:$rd, (fdiv f128:$rs1, f128:$rs2))]>,
Requires<[HasHardQuad]>;
// Floating-point Compare Instructions, p. 148
@@ -818,26 +887,26 @@ def FDIVQ : F3_3<2, 0b110100, 0b001001111,
// This behavior is modeled with a forced noop after the instruction in
// DelaySlotFiller.
-let Defs = [FCC] in {
+let Defs = [FCC0], rd = 0, isCodeGenOnly = 1 in {
def FCMPS : F3_3c<2, 0b110101, 0b001010001,
- (outs), (ins FPRegs:$src1, FPRegs:$src2),
- "fcmps $src1, $src2",
- [(SPcmpfcc f32:$src1, f32:$src2)]>;
+ (outs), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fcmps $rs1, $rs2",
+ [(SPcmpfcc f32:$rs1, f32:$rs2)]>;
def FCMPD : F3_3c<2, 0b110101, 0b001010010,
- (outs), (ins DFPRegs:$src1, DFPRegs:$src2),
- "fcmpd $src1, $src2",
- [(SPcmpfcc f64:$src1, f64:$src2)]>;
+ (outs), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fcmpd $rs1, $rs2",
+ [(SPcmpfcc f64:$rs1, f64:$rs2)]>;
def FCMPQ : F3_3c<2, 0b110101, 0b001010011,
- (outs), (ins QFPRegs:$src1, QFPRegs:$src2),
- "fcmpq $src1, $src2",
- [(SPcmpfcc f128:$src1, f128:$src2)]>,
+ (outs), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "fcmpq $rs1, $rs2",
+ [(SPcmpfcc f128:$rs1, f128:$rs2)]>,
Requires<[HasHardQuad]>;
}
//===----------------------------------------------------------------------===//
// Instructions for Thread Local Storage(TLS).
//===----------------------------------------------------------------------===//
-
+let isCodeGenOnly = 1, isAsmParserOnly = 1 in {
def TLS_ADDrr : F3_1<2, 0b000000,
(outs IntRegs:$rd),
(ins IntRegs:$rs1, IntRegs:$rs2, TLSSym:$sym),
@@ -861,6 +930,7 @@ let Uses = [O6], isCall = 1, hasDelaySlot = 1 in
let op = 1;
let Inst{29-0} = disp;
}
+}
//===----------------------------------------------------------------------===//
// V9 Instructions
@@ -869,7 +939,7 @@ let Uses = [O6], isCall = 1, hasDelaySlot = 1 in
// V9 Conditional Moves.
let Predicates = [HasV9], Constraints = "$f = $rd" in {
// Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual.
- let Uses = [ICC], cc = 0b100 in {
+ let Uses = [ICC], intcc = 1, cc = 0b00 in {
def MOVICCrr
: F4_1<0b101100, (outs IntRegs:$rd),
(ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
@@ -884,7 +954,7 @@ let Predicates = [HasV9], Constraints = "$f = $rd" in {
(SPselecticc simm11:$simm11, i32:$f, imm:$cond))]>;
}
- let Uses = [FCC], cc = 0b000 in {
+ let Uses = [FCC0], intcc = 0, cc = 0b00 in {
def MOVFCCrr
: F4_1<0b101100, (outs IntRegs:$rd),
(ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
@@ -898,7 +968,7 @@ let Predicates = [HasV9], Constraints = "$f = $rd" in {
(SPselectfcc simm11:$simm11, i32:$f, imm:$cond))]>;
}
- let Uses = [ICC], opf_cc = 0b100 in {
+ let Uses = [ICC], intcc = 1, opf_cc = 0b00 in {
def FMOVS_ICC
: F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
(ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
@@ -912,11 +982,12 @@ let Predicates = [HasV9], Constraints = "$f = $rd" in {
def FMOVQ_ICC
: F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
(ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
- "fmovd$cond %icc, $rs2, $rd",
- [(set f128:$rd, (SPselecticc f128:$rs2, f128:$f, imm:$cond))]>;
+ "fmovq$cond %icc, $rs2, $rd",
+ [(set f128:$rd, (SPselecticc f128:$rs2, f128:$f, imm:$cond))]>,
+ Requires<[HasHardQuad]>;
}
- let Uses = [FCC], opf_cc = 0b000 in {
+ let Uses = [FCC0], intcc = 0, opf_cc = 0b00 in {
def FMOVS_FCC
: F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
(ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
@@ -930,8 +1001,9 @@ let Predicates = [HasV9], Constraints = "$f = $rd" in {
def FMOVQ_FCC
: F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
(ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
- "fmovd$cond %fcc0, $rs2, $rd",
- [(set f128:$rd, (SPselectfcc f128:$rs2, f128:$f, imm:$cond))]>;
+ "fmovq$cond %fcc0, $rs2, $rd",
+ [(set f128:$rd, (SPselectfcc f128:$rs2, f128:$f, imm:$cond))]>,
+ Requires<[HasHardQuad]>;
}
}
@@ -939,40 +1011,142 @@ let Predicates = [HasV9], Constraints = "$f = $rd" in {
// Floating-Point Move Instructions, p. 164 of the V9 manual.
let Predicates = [HasV9] in {
def FMOVD : F3_3u<2, 0b110100, 0b000000010,
- (outs DFPRegs:$dst), (ins DFPRegs:$src),
- "fmovd $src, $dst", []>;
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fmovd $rs2, $rd", []>;
def FMOVQ : F3_3u<2, 0b110100, 0b000000011,
- (outs QFPRegs:$dst), (ins QFPRegs:$src),
- "fmovq $src, $dst", []>,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+ "fmovq $rs2, $rd", []>,
Requires<[HasHardQuad]>;
def FNEGD : F3_3u<2, 0b110100, 0b000000110,
- (outs DFPRegs:$dst), (ins DFPRegs:$src),
- "fnegd $src, $dst",
- [(set f64:$dst, (fneg f64:$src))]>;
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fnegd $rs2, $rd",
+ [(set f64:$rd, (fneg f64:$rs2))]>;
def FNEGQ : F3_3u<2, 0b110100, 0b000000111,
- (outs QFPRegs:$dst), (ins QFPRegs:$src),
- "fnegq $src, $dst",
- [(set f128:$dst, (fneg f128:$src))]>,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+ "fnegq $rs2, $rd",
+ [(set f128:$rd, (fneg f128:$rs2))]>,
Requires<[HasHardQuad]>;
def FABSD : F3_3u<2, 0b110100, 0b000001010,
- (outs DFPRegs:$dst), (ins DFPRegs:$src),
- "fabsd $src, $dst",
- [(set f64:$dst, (fabs f64:$src))]>;
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+ "fabsd $rs2, $rd",
+ [(set f64:$rd, (fabs f64:$rs2))]>;
def FABSQ : F3_3u<2, 0b110100, 0b000001011,
- (outs QFPRegs:$dst), (ins QFPRegs:$src),
- "fabsq $src, $dst",
- [(set f128:$dst, (fabs f128:$src))]>,
+ (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+ "fabsq $rs2, $rd",
+ [(set f128:$rd, (fabs f128:$rs2))]>,
+ Requires<[HasHardQuad]>;
+}
+
+// Floating-point compare instruction with %fcc0-%fcc3.
+def V9FCMPS : F3_3c<2, 0b110101, 0b001010001,
+ (outs FCCRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fcmps $rd, $rs1, $rs2", []>;
+def V9FCMPD : F3_3c<2, 0b110101, 0b001010010,
+ (outs FCCRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fcmpd $rd, $rs1, $rs2", []>;
+def V9FCMPQ : F3_3c<2, 0b110101, 0b001010011,
+ (outs FCCRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "fcmpq $rd, $rs1, $rs2", []>,
+ Requires<[HasHardQuad]>;
+
+let hasSideEffects = 1 in {
+ def V9FCMPES : F3_3c<2, 0b110101, 0b001010101,
+ (outs FCCRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+ "fcmpes $rd, $rs1, $rs2", []>;
+ def V9FCMPED : F3_3c<2, 0b110101, 0b001010110,
+ (outs FCCRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fcmped $rd, $rs1, $rs2", []>;
+ def V9FCMPEQ : F3_3c<2, 0b110101, 0b001010111,
+ (outs FCCRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+ "fcmpeq $rd, $rs1, $rs2", []>,
Requires<[HasHardQuad]>;
}
+// Floating-point conditional move instructions with %fcc0-%fcc3.
+let Predicates = [HasV9] in {
+ let Constraints = "$f = $rd", intcc = 0 in {
+ def V9MOVFCCrr
+ : F4_1<0b101100, (outs IntRegs:$rd),
+ (ins FCCRegs:$cc, IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+ "mov$cond $cc, $rs2, $rd", []>;
+ def V9MOVFCCri
+ : F4_2<0b101100, (outs IntRegs:$rd),
+ (ins FCCRegs:$cc, i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+ "mov$cond $cc, $simm11, $rd", []>;
+ def V9FMOVS_FCC
+ : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+ (ins FCCRegs:$opf_cc, FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+ "fmovs$cond $opf_cc, $rs2, $rd", []>;
+ def V9FMOVD_FCC
+ : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+ (ins FCCRegs:$opf_cc, DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+ "fmovd$cond $opf_cc, $rs2, $rd", []>;
+ def V9FMOVQ_FCC
+ : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+ (ins FCCRegs:$opf_cc, QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+ "fmovq$cond $opf_cc, $rs2, $rd", []>,
+ Requires<[HasHardQuad]>;
+ } // Constraints = "$f = $rd", ...
+} // let Predicates = [hasV9]
+
+
// POPCrr - This does a ctpop of a 64-bit register. As such, we have to clear
-// the top 32-bits before using it. To do this clearing, we use a SLLri X,0.
+// the top 32-bits before using it. To do this clearing, we use a SRLri X,0.
let rs1 = 0 in
def POPCrr : F3_1<2, 0b101110,
(outs IntRegs:$dst), (ins IntRegs:$src),
"popc $src, $dst", []>, Requires<[HasV9]>;
def : Pat<(ctpop i32:$src),
- (POPCrr (SLLri $src, 0))>;
+ (POPCrr (SRLri $src, 0))>;
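In other words, a 32-bit ctpop is selected as a shift-right-by-zero, which zero-extends the value into the full 64-bit register, feeding popc; the emitted sequence is along the lines of "srl %o0, 0, %o0" followed by "popc %o0, %o0" (the register choice here is purely illustrative).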
+
+// Store barrier.
+let hasSideEffects = 1, rd = 0, rs1 = 0b01111, rs2 = 0 in
+ def STBAR : F3_1<2, 0b101000, (outs), (ins), "stbar", []>;
+
+let Predicates = [HasV9], hasSideEffects = 1, rd = 0, rs1 = 0b01111 in
+ def MEMBARi : F3_2<2, 0b101000, (outs), (ins simm13Op:$simm13),
+ "membar $simm13", []>;
+
+// Atomic swap.
+let Constraints = "$val = $dst", DecoderMethod = "DecodeSWAP" in {
+ def SWAPrr : F3_1<3, 0b001111,
+ (outs IntRegs:$dst), (ins MEMrr:$addr, IntRegs:$val),
+ "swap [$addr], $dst",
+ [(set i32:$dst, (atomic_swap_32 ADDRrr:$addr, i32:$val))]>;
+ def SWAPri : F3_2<3, 0b001111,
+ (outs IntRegs:$dst), (ins MEMri:$addr, IntRegs:$val),
+ "swap [$addr], $dst",
+ [(set i32:$dst, (atomic_swap_32 ADDRri:$addr, i32:$val))]>;
+}
+
+let Predicates = [HasV9], Constraints = "$swap = $rd" in
+ def CASrr: F3_1_asi<3, 0b111100, 0b10000000,
+ (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2,
+ IntRegs:$swap),
+ "cas [$rs1], $rs2, $rd",
+ [(set i32:$rd,
+ (atomic_cmp_swap iPTR:$rs1, i32:$rs2, i32:$swap))]>;
+
+let Defs = [ICC] in {
+defm TADDCC : F3_12np<"taddcc", 0b100000>;
+defm TSUBCC : F3_12np<"tsubcc", 0b100001>;
+
+let hasSideEffects = 1 in {
+ defm TADDCCTV : F3_12np<"taddcctv", 0b100010>;
+ defm TSUBCCTV : F3_12np<"tsubcctv", 0b100011>;
+}
+}
+
+multiclass TRAP<string regStr> {
+ def rr : TRAPSPrr<0b111010, (outs), (ins IntRegs:$rs1, IntRegs:$rs2,
+ CCOp:$cond),
+ !strconcat(!strconcat("t$cond ", regStr), ", $rs1 + $rs2"), []>;
+ def ri : TRAPSPri<0b111010, (outs), (ins IntRegs:$rs1, i32imm:$imm,
+ CCOp:$cond),
+ !strconcat(!strconcat("t$cond ", regStr), ", $rs1 + $imm"), []>;
+}
+
+let hasSideEffects = 1, Uses = [ICC], cc = 0b00 in
+ defm TICC : TRAP<"%icc">;
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
@@ -987,6 +1161,8 @@ def : Pat<(i32 imm:$val),
// Global addresses, constant pool entries
+let Predicates = [Is32Bit] in {
+
def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
def : Pat<(SPlo tglobaladdr:$in), (ORri (i32 G0), tglobaladdr:$in)>;
def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
@@ -1009,6 +1185,7 @@ def : Pat<(add iPTR:$r, (SPlo tglobaladdr:$in)), (ADDri $r, tglobaladdr:$in)>;
def : Pat<(add iPTR:$r, (SPlo tconstpool:$in)), (ADDri $r, tconstpool:$in)>;
def : Pat<(add iPTR:$r, (SPlo tblockaddress:$in)),
(ADDri $r, tblockaddress:$in)>;
+}
// Calls:
def : Pat<(call tglobaladdr:$dst),
@@ -1032,4 +1209,19 @@ def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
def : Pat<(store (i32 0), ADDRrr:$dst), (STrr ADDRrr:$dst, (i32 G0))>;
def : Pat<(store (i32 0), ADDRri:$dst), (STri ADDRri:$dst, (i32 G0))>;
+// Use the store barrier (stbar) for all atomic_fence operations on V8.
+let Predicates = [HasNoV9] in
+ def : Pat<(atomic_fence imm, imm), (STBAR)>;
+
+// atomic_load_32 addr -> load addr
+def : Pat<(i32 (atomic_load ADDRrr:$src)), (LDrr ADDRrr:$src)>;
+def : Pat<(i32 (atomic_load ADDRri:$src)), (LDri ADDRri:$src)>;
+
+// atomic_store_32 val, addr -> store val, addr
+def : Pat<(atomic_store ADDRrr:$dst, i32:$val), (STrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store ADDRri:$dst, i32:$val), (STri ADDRri:$dst, $val)>;
+
+
include "SparcInstr64Bit.td"
+include "SparcInstrVIS.td"
+include "SparcInstrAliases.td"
diff --git a/lib/Target/Sparc/SparcInstrVIS.td b/lib/Target/Sparc/SparcInstrVIS.td
new file mode 100644
index 0000000..3e2b49d
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrVIS.td
@@ -0,0 +1,263 @@
+//===---- SparcInstrVIS.td - Visual Instruction Set extensions (VIS) -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction formats, definitions and patterns needed for
+// VIS, VIS II, and VIS III instructions on SPARC.
+//===----------------------------------------------------------------------===//
+
+// VIS Instruction Format.
+class VISInstFormat<bits<9> opfval, dag outs, dag ins, string asmstr,
+ list<dag> pattern>
+ : F3_3<0b10, 0b110110, opfval, outs, ins, asmstr, pattern>;
+
+class VISInst<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+ : VISInstFormat<opfval,
+ (outs RC:$rd), (ins RC:$rs1, RC:$rs2),
+ !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+
+// VIS Instruction with integer destination register.
+class VISInstID<bits<9> opfval, string OpcStr>
+ : VISInstFormat<opfval,
+ (outs I64Regs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+
+// For VIS Instructions with no operand.
+let rd = 0, rs1 = 0, rs2 = 0 in
+class VISInst0<bits<9> opfval, string asmstr>
+ : VISInstFormat<opfval, (outs), (ins), asmstr, []>;
+
+// For VIS Instructions with only rs1, rd operands.
+let rs2 = 0 in
+class VISInst1<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+ : VISInstFormat<opfval,
+ (outs RC:$rd), (ins RC:$rs1),
+ !strconcat(OpcStr, " $rs1, $rd"), []>;
+
+// For VIS Instructions with only rs2, rd operands.
+let rs1 = 0 in
+class VISInst2<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+ : VISInstFormat<opfval,
+ (outs RC:$rd), (ins RC:$rs2),
+ !strconcat(OpcStr, " $rs2, $rd"), []>;
+
+// For VIS Instructions with only rd operand.
+let Constraints = "$rd = $f", rs1 = 0, rs2 = 0 in
+class VISInstD<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+ : VISInstFormat<opfval,
+ (outs RC:$rd), (ins RC:$f),
+ !strconcat(OpcStr, " $rd"), []>;
+
+// VIS 1 Instructions
+let Predicates = [HasVIS] in {
+
+def FPADD16 : VISInst<0b001010000, "fpadd16">;
+def FPADD16S : VISInst<0b001010001, "fpadd16s">;
+def FPADD32 : VISInst<0b001010010, "fpadd32">;
+def FPADD32S : VISInst<0b001010011, "fpadd32s">;
+def FPSUB16 : VISInst<0b001010100, "fpsub16">;
+def FPSUB16S : VISInst<0b001010101, "fpsub16s">;
+def FPSUB32 : VISInst<0b001010110, "fpsub32">;
+def FPSUB32S : VISInst<0b001010111, "fpsub32s">;
+
+def FPACK16 : VISInst2<0b000111011, "fpack16">;
+def FPACK32 : VISInst <0b000111010, "fpack32">;
+def FPACKFIX : VISInst2<0b000111101, "fpackfix">;
+def FEXPAND : VISInst2<0b001001101, "fexpand">;
+def FPMERGE : VISInst <0b001001011, "fpmerge">;
+
+def FMUL8X16 : VISInst<0b00110001, "fmul8x16">;
+def FMUL8X16AU : VISInst<0b00110011, "fmul8x16au">;
+def FMUL8X16AL : VISInst<0b00110101, "fmul8x16al">;
+def FMUL8SUX16 : VISInst<0b00110110, "fmul8sux16">;
+def FMUL8ULX16 : VISInst<0b00110111, "fmul8ulx16">;
+def FMULD8SUX16 : VISInst<0b00111000, "fmuld8sux16">;
+def FMULD8ULX16 : VISInst<0b00111001, "fmuld8ulx16">;
+
+def ALIGNADDR : VISInst<0b000011000, "alignaddr", I64Regs>;
+def ALIGNADDRL : VISInst<0b000011010, "alignaddrl", I64Regs>;
+def FALIGNADATA : VISInst<0b001001000, "faligndata">;
+
+def FZERO : VISInstD<0b001100000, "fzero">;
+def FZEROS : VISInstD<0b001100001, "fzeros", FPRegs>;
+def FONE : VISInstD<0b001111110, "fone">;
+def FONES : VISInstD<0b001111111, "fones", FPRegs>;
+def FSRC1 : VISInst1<0b001110100, "fsrc1">;
+def FSRC1S : VISInst1<0b001110101, "fsrc1s", FPRegs>;
+def FSRC2 : VISInst2<0b001111000, "fsrc2">;
+def FSRC2S : VISInst2<0b001111001, "fsrc2s", FPRegs>;
+def FNOT1 : VISInst1<0b001101010, "fnot1">;
+def FNOT1S : VISInst1<0b001101011, "fnot1s", FPRegs>;
+def FNOT2 : VISInst2<0b001100110, "fnot2">;
+def FNOT2S : VISInst2<0b001100111, "fnot2s", FPRegs>;
+def FOR : VISInst<0b001111100, "for">;
+def FORS : VISInst<0b001111101, "fors", FPRegs>;
+def FNOR : VISInst<0b001100010, "fnor">;
+def FNORS : VISInst<0b001100011, "fnors", FPRegs>;
+def FAND : VISInst<0b001110000, "fand">;
+def FANDS : VISInst<0b001110001, "fands", FPRegs>;
+def FNAND : VISInst<0b001101110, "fnand">;
+def FNANDS : VISInst<0b001101111, "fnands", FPRegs>;
+def FXOR : VISInst<0b001101100, "fxor">;
+def FXORS : VISInst<0b001101101, "fxors", FPRegs>;
+def FXNOR : VISInst<0b001110010, "fxnor">;
+def FXNORS : VISInst<0b001110011, "fxnors", FPRegs>;
+
+def FORNOT1 : VISInst<0b001111010, "fornot1">;
+def FORNOT1S : VISInst<0b001111011, "fornot1s", FPRegs>;
+def FORNOT2 : VISInst<0b001110110, "fornot2">;
+def FORNOT2S : VISInst<0b001110111, "fornot2s", FPRegs>;
+def FANDNOT1 : VISInst<0b001101000, "fandnot1">;
+def FANDNOT1S : VISInst<0b001101001, "fandnot1s", FPRegs>;
+def FANDNOT2 : VISInst<0b001100100, "fandnot2">;
+def FANDNOT2S : VISInst<0b001100101, "fandnot2s", FPRegs>;
+
+def FCMPGT16 : VISInstID<0b000101000, "fcmpgt16">;
+def FCMPGT32 : VISInstID<0b000101100, "fcmpgt32">;
+def FCMPLE16 : VISInstID<0b000100000, "fcmple16">;
+def FCMPLE32 : VISInstID<0b000100100, "fcmple32">;
+def FCMPNE16 : VISInstID<0b000100010, "fcmpne16">;
+def FCMPNE32 : VISInstID<0b000100110, "fcmpne32">;
+def FCMPEQ16 : VISInstID<0b000101010, "fcmpeq16">;
+def FCMPEQ32 : VISInstID<0b000101110, "fcmpeq32">;
+
+
+def EDGE8 : VISInst<0b000000000, "edge8", I64Regs>;
+def EDGE8L : VISInst<0b000000010, "edge8l", I64Regs>;
+def EDGE16 : VISInst<0b000000100, "edge16", I64Regs>;
+def EDGE16L : VISInst<0b000000110, "edge16l", I64Regs>;
+def EDGE32 : VISInst<0b000001000, "edge32", I64Regs>;
+def EDGE32L : VISInst<0b000001010, "edge32l", I64Regs>;
+
+def PDIST : VISInst<0b00111110, "pdist">;
+
+def ARRAY8 : VISInst<0b000010000, "array8", I64Regs>;
+def ARRAY16 : VISInst<0b000010010, "array16", I64Regs>;
+def ARRAY32 : VISInst<0b000010100, "array32", I64Regs>;
+
+def SHUTDOWN : VISInst0<0b010000000, "shutdown">;
+
+} // Predicates = [HasVIS]
+
+
+// VIS 2 Instructions.
+let Predicates = [HasVIS2] in {
+
+def BMASK : VISInst<0b000011001, "bmask", I64Regs>;
+def BSHUFFLE : VISInst<0b000011100, "bshuffle">;
+
+def SIAM : VISInst0<0b010000001, "siam">;
+
+def EDGE8N : VISInst<0b000000001, "edge8n", I64Regs>;
+def EDGE8LN : VISInst<0b000000011, "edge8ln", I64Regs>;
+def EDGE16N : VISInst<0b000000101, "edge16n", I64Regs>;
+def EDGE16LN : VISInst<0b000000111, "edge16ln", I64Regs>;
+def EDGE32N : VISInst<0b000001001, "edge32n", I64Regs>;
+def EDGE32LN : VISInst<0b000001011, "edge32ln", I64Regs>;
+} // Predicates = [HasVIS2]
+
+
+// VIS 3 Instructions.
+let Predicates = [HasVIS3] in {
+
+let Uses = [ICC] in
+def ADDXC : VISInst<0b000010001, "addxc", I64Regs>;
+
+let Defs = [ICC], Uses = [ICC] in
+def ADDXCCC : VISInst<0b000010011, "addxccc", I64Regs>;
+
+let rd = 0, rs1 = 0 in {
+def CMASK8 : VISInstFormat<0b000011011, (outs), (ins I64Regs:$rs2),
+ "cmask8 $rs2", []>;
+def CMASK16 : VISInstFormat<0b000011101, (outs), (ins I64Regs:$rs2),
+ "cmask16 $rs2", []>;
+def CMASK32 : VISInstFormat<0b000011111, (outs), (ins I64Regs:$rs2),
+ "cmask32 $rs2", []>;
+
+}
+
+def FCHKSM16 : VISInst<0b01000100, "fchksm16">;
+
+def FHADDS : F3_3<0b10, 0b110100, 0b001100001,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fhadds $rs1, $rs2, $rd", []>;
+def FHADDD : F3_3<0b10, 0b110100, 0b001100010,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fhaddd $rs1, $rs2, $rd", []>;
+def FHSUBS : F3_3<0b10, 0b110100, 0b001100101,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fhsubs $rs1, $rs2, $rd", []>;
+def FHSUBD : F3_3<0b10, 0b110100, 0b001100110,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fhsubd $rs1, $rs2, $rd", []>;
+def FLCMPS : VISInstFormat<0b101010001, (outs FCCRegs:$rd),
+ (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "flcmps $rd, $rs1, $rs2", []>;
+def FLCMPD : VISInstFormat<0b101010010, (outs FCCRegs:$rd),
+ (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "flcmpd $rd, $rs1, $rs2", []>;
+
+def FMEAN16 : VISInst<0b001000000, "fmean16">;
+
+def FNADDS : F3_3<0b10, 0b110100, 0b001010001,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fnadds $rs1, $rs2, $rd", []>;
+def FNADDD : F3_3<0b10, 0b110100, 0b001010010,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fnaddd $rs1, $rs2, $rd", []>;
+def FNHADDS : F3_3<0b10, 0b110100, 0b001110001,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fnhadds $rs1, $rs2, $rd", []>;
+def FNHADDD : F3_3<0b10, 0b110100, 0b001110010,
+ (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+ "fnhaddd $rs1, $rs2, $rd", []>;
+
+def FNMULS : F3_3<0b10, 0b110100, 0b001011001,
+                  (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                  "fnmuls $rs1, $rs2, $rd", []>;
+def FNMULD : F3_3<0b10, 0b110100, 0b001011010,
+                  (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                  "fnmuld $rs1, $rs2, $rd", []>;
+def FNSMULD : F3_3<0b10, 0b110100, 0b001111001,
+                  (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                  "fnsmuld $rs1, $rs2, $rd", []>;
+
+def FPADD64 : VISInst<0b001000010, "fpadd64">;
+
+def FSLL16 : VISInst<0b00100001, "fsll16">;
+def FSRL16 : VISInst<0b00100011, "fsrl16">;
+def FSLL32 : VISInst<0b00100101, "fsll32">;
+def FSRL32 : VISInst<0b00100111, "fsrl32">;
+def FSLAS16 : VISInst<0b00101001, "fslas16">;
+def FSRA16 : VISInst<0b00101011, "fsra16">;
+def FSLAS32 : VISInst<0b00101101, "fslas32">;
+def FSRA32 : VISInst<0b00101111, "fsra32">;
+
+let rs1 = 0 in
+def LZCNT : VISInstFormat<0b000010111, (outs I64Regs:$rd),
+ (ins I64Regs:$rs2), "lzcnt $rs2, $rd", []>;
+
+let rs1 = 0 in {
+def MOVSTOSW : VISInstFormat<0b100010011, (outs I64Regs:$rd),
+ (ins DFPRegs:$rs2), "movstosw $rs2, $rd", []>;
+def MOVSTOUW : VISInstFormat<0b100010001, (outs I64Regs:$rd),
+ (ins DFPRegs:$rs2), "movstouw $rs2, $rd", []>;
+def MOVDTOX : VISInstFormat<0b100010000, (outs I64Regs:$rd),
+ (ins DFPRegs:$rs2), "movdtox $rs2, $rd", []>;
+def MOVWTOS : VISInstFormat<0b100011001, (outs DFPRegs:$rd),
+                      (ins I64Regs:$rs2), "movwtos $rs2, $rd", []>;
+def MOVXTOD : VISInstFormat<0b100011000, (outs DFPRegs:$rd),
+                      (ins I64Regs:$rs2), "movxtod $rs2, $rd", []>;
+}
+
+def PDISTN : VISInst<0b000111111, "pdistn">;
+
+def UMULXHI : VISInst<0b000010110, "umulxhi", I64Regs>;
+def XMULX : VISInst<0b100010101, "xmulx", I64Regs>;
+def XMULXHI : VISInst<0b100010111, "xmulxhi", I64Regs>;
+} // Predicates = [HasVIS3]
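The format classes above vary only in which operand fields are tied off; the encoding itself is fixed by the F3_3 parameters (op = 0b10, op3 = 0b110110) plus the 9-bit opf value. A minimal sketch of how such a word is assembled, assuming the usual SPARC F3 field layout (op in bits 31-30, rd in 29-25, op3 in 24-19, rs1 in 18-14, opf in 13-5, rs2 in 4-0); encodeVIS is a hypothetical helper, not backend code:

// Minimal sketch, assuming the F3_3 field layout described above.
#include <cassert>
#include <cstdint>

static uint32_t encodeVIS(uint32_t rd, uint32_t rs1, uint32_t opf, uint32_t rs2) {
  const uint32_t op = 0b10, op3 = 0b110110;  // the op/op3 space used by VISInstFormat
  return (op << 30) | (rd << 25) | (op3 << 19) | (rs1 << 14) | (opf << 5) | rs2;
}

int main() {
  // fpadd16 (opf = 0b001010000) with raw register fields rd=2, rs1=0, rs2=1.
  assert(encodeVIS(2, 0, 0b001010000, 1) == 0x85B00A01u);
  return 0;
}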
diff --git a/lib/Target/Sparc/SparcJITInfo.cpp b/lib/Target/Sparc/SparcJITInfo.cpp
index 6493c7d..959d12f 100644
--- a/lib/Target/Sparc/SparcJITInfo.cpp
+++ b/lib/Target/Sparc/SparcJITInfo.cpp
@@ -12,8 +12,9 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "jit"
#include "SparcJITInfo.h"
+#include "Sparc.h"
#include "SparcRelocations.h"
-
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
#include "llvm/Support/Memory.h"
@@ -27,26 +28,67 @@ extern "C" void SparcCompilationCallback();
extern "C" {
#if defined (__sparc__)
+
+#if defined(__arch64__)
+#define FRAME_PTR(X) #X "+2047"
+#else
+#define FRAME_PTR(X) #X
+#endif
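FRAME_PTR folds the V9 stack bias into the operand string: via stringification, FRAME_PTR(%fp) becomes "%fp+2047" on 64-bit SPARC (the ABI biases %sp and %fp by 2047, as the SparcSubtarget comment further down notes) and plain "%fp" on 32-bit. A quick sketch of the expansion; FRAME_PTR_32/FRAME_PTR_64 are illustrative copies of the two branches above:

// Sketch of the FRAME_PTR expansion only.
#include <cassert>
#include <cstring>

#define FRAME_PTR_64(X) #X "+2047"
#define FRAME_PTR_32(X) #X

int main() {
  assert(std::strcmp(FRAME_PTR_64(%fp), "%fp+2047") == 0);  // biased V9 frame pointer
  assert(std::strcmp(FRAME_PTR_32(%fp), "%fp") == 0);       // unbiased V8 frame pointer
  return 0;
}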
+
asm(
".text\n"
"\t.align 4\n"
"\t.global SparcCompilationCallback\n"
"\t.type SparcCompilationCallback, #function\n"
"SparcCompilationCallback:\n"
- // Save current register window.
- "\tsave %sp, -192, %sp\n"
- // stubaddr+4 is in %g1.
+ // Save current register window and create stack.
+ // 128 (save area) + 6*8 (for arguments) + 16*8 (for float regfile) = 304
+ "\tsave %sp, -304, %sp\n"
+ // save float regfile to the stack.
+ "\tstd %f0, [" FRAME_PTR(%fp) "-0]\n"
+ "\tstd %f2, [" FRAME_PTR(%fp) "-8]\n"
+ "\tstd %f4, [" FRAME_PTR(%fp) "-16]\n"
+ "\tstd %f6, [" FRAME_PTR(%fp) "-24]\n"
+ "\tstd %f8, [" FRAME_PTR(%fp) "-32]\n"
+ "\tstd %f10, [" FRAME_PTR(%fp) "-40]\n"
+ "\tstd %f12, [" FRAME_PTR(%fp) "-48]\n"
+ "\tstd %f14, [" FRAME_PTR(%fp) "-56]\n"
+ "\tstd %f16, [" FRAME_PTR(%fp) "-64]\n"
+ "\tstd %f18, [" FRAME_PTR(%fp) "-72]\n"
+ "\tstd %f20, [" FRAME_PTR(%fp) "-80]\n"
+ "\tstd %f22, [" FRAME_PTR(%fp) "-88]\n"
+ "\tstd %f24, [" FRAME_PTR(%fp) "-96]\n"
+ "\tstd %f26, [" FRAME_PTR(%fp) "-104]\n"
+ "\tstd %f28, [" FRAME_PTR(%fp) "-112]\n"
+ "\tstd %f30, [" FRAME_PTR(%fp) "-120]\n"
+ // stubaddr is in %g1.
"\tcall SparcCompilationCallbackC\n"
- "\t sub %g1, 4, %o0\n"
+ "\t mov %g1, %o0\n"
+ // restore float regfile from the stack.
+ "\tldd [" FRAME_PTR(%fp) "-0], %f0\n"
+ "\tldd [" FRAME_PTR(%fp) "-8], %f2\n"
+ "\tldd [" FRAME_PTR(%fp) "-16], %f4\n"
+ "\tldd [" FRAME_PTR(%fp) "-24], %f6\n"
+ "\tldd [" FRAME_PTR(%fp) "-32], %f8\n"
+ "\tldd [" FRAME_PTR(%fp) "-40], %f10\n"
+ "\tldd [" FRAME_PTR(%fp) "-48], %f12\n"
+ "\tldd [" FRAME_PTR(%fp) "-56], %f14\n"
+ "\tldd [" FRAME_PTR(%fp) "-64], %f16\n"
+ "\tldd [" FRAME_PTR(%fp) "-72], %f18\n"
+ "\tldd [" FRAME_PTR(%fp) "-80], %f20\n"
+ "\tldd [" FRAME_PTR(%fp) "-88], %f22\n"
+ "\tldd [" FRAME_PTR(%fp) "-96], %f24\n"
+ "\tldd [" FRAME_PTR(%fp) "-104], %f26\n"
+ "\tldd [" FRAME_PTR(%fp) "-112], %f28\n"
+ "\tldd [" FRAME_PTR(%fp) "-120], %f30\n"
// restore original register window and
// copy %o0 to %g1
- "\t restore %o0, 0, %g1\n"
+ "\trestore %o0, 0, %g1\n"
// call the new stub
"\tjmp %g1\n"
"\t nop\n"
"\t.size SparcCompilationCallback, .-SparcCompilationCallback"
);
-
#else
void SparcCompilationCallback() {
llvm_unreachable(
@@ -55,75 +97,171 @@ extern "C" {
#endif
}
-#define HI(Val) (((unsigned)(Val)) >> 10)
-#define LO(Val) (((unsigned)(Val)) & 0x3FF)
#define SETHI_INST(imm, rd) (0x01000000 | ((rd) << 25) | ((imm) & 0x3FFFFF))
#define JMP_INST(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x38 << 19) \
| ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
#define NOP_INST SETHI_INST(0, 0)
+#define OR_INST_I(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x02 << 19) \
+ | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
+#define OR_INST_R(rs1, rs2, rd) (0x80000000 | ((rd) << 25) | (0x02 << 19) \
+ | ((rs1) << 14) | (0 << 13) | ((rs2) & 0x1F))
+#define RDPC_INST(rd) (0x80000000 | ((rd) << 25) | (0x28 << 19) \
+ | (5 << 14))
+#define LDX_INST(rs1, imm, rd) (0xC0000000 | ((rd) << 25) | (0x0B << 19) \
+ | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
+#define SLLX_INST(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x25 << 19) \
+ | ((rs1) << 14) | (3 << 12) | ((imm) & 0x3F))
+#define SUB_INST(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x04 << 19) \
+ | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
+#define XOR_INST(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x03 << 19) \
+ | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
+#define BA_INST(tgt) (0x10800000 | ((tgt) & 0x3FFFFF))
+
+// Emit instructions that jump to Addr and leave the starting address of
+// the emitted sequence in the scratch register.
+static void emitInstrForIndirectJump(intptr_t Addr,
+ unsigned scratch,
+ SmallVectorImpl<uint32_t> &Insts) {
+
+ if (isInt<13>(Addr)) {
+ // Emit: jmpl %g0+Addr, <scratch>
+ // nop
+ Insts.push_back(JMP_INST(0, LO10(Addr), scratch));
+ Insts.push_back(NOP_INST);
+ return;
+ }
+
+ if (isUInt<32>(Addr)) {
+ // Emit: sethi %hi(Addr), scratch
+ // jmpl scratch+%lo(Addr), scratch
+ // sub scratch, 4, scratch
+ Insts.push_back(SETHI_INST(HI22(Addr), scratch));
+ Insts.push_back(JMP_INST(scratch, LO10(Addr), scratch));
+ Insts.push_back(SUB_INST(scratch, 4, scratch));
+ return;
+ }
+
+ if (Addr < 0 && isInt<33>(Addr)) {
+    // Emit: sethi %hix(Addr), scratch
+ // xor scratch, %lox(Addr), scratch
+ // jmpl scratch+0, scratch
+ // sub scratch, 8, scratch
+ Insts.push_back(SETHI_INST(HIX22(Addr), scratch));
+ Insts.push_back(XOR_INST(scratch, LOX10(Addr), scratch));
+ Insts.push_back(JMP_INST(scratch, 0, scratch));
+ Insts.push_back(SUB_INST(scratch, 8, scratch));
+ return;
+ }
+
+ // Emit: rd %pc, scratch
+ // ldx [scratch+16], scratch
+ // jmpl scratch+0, scratch
+ // sub scratch, 8, scratch
+ // <Addr: 8 byte>
+ Insts.push_back(RDPC_INST(scratch));
+ Insts.push_back(LDX_INST(scratch, 16, scratch));
+ Insts.push_back(JMP_INST(scratch, 0, scratch));
+ Insts.push_back(SUB_INST(scratch, 8, scratch));
+ Insts.push_back((uint32_t)(((int64_t)Addr) >> 32) & 0xffffffff);
+ Insts.push_back((uint32_t)(Addr & 0xffffffff));
+
+  // Alternative sequence without the rdpc instruction
+  // (7 instructions and 2 scratch registers):
+ // Emit: sethi %hh(Addr), scratch
+ // or scratch, %hm(Addr), scratch
+ // sllx scratch, 32, scratch
+ // sethi %hi(Addr), scratch2
+ // or scratch, scratch2, scratch
+ // jmpl scratch+%lo(Addr), scratch
+ // sub scratch, 20, scratch
+ // Insts.push_back(SETHI_INST(HH22(Addr), scratch));
+ // Insts.push_back(OR_INST_I(scratch, HM10(Addr), scratch));
+ // Insts.push_back(SLLX_INST(scratch, 32, scratch));
+ // Insts.push_back(SETHI_INST(HI22(Addr), scratch2));
+ // Insts.push_back(OR_INST_R(scratch, scratch2, scratch));
+ // Insts.push_back(JMP_INST(scratch, LO10(Addr), scratch));
+ // Insts.push_back(SUB_INST(scratch, 20, scratch));
+}
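In short, the helper emits the shortest sequence the target address allows: two words when the address fits a signed 13-bit immediate, three for a 32-bit address, four for a negative address that fits 33 bits, and six words (four instructions plus the 8-byte literal) in the general case. A sketch of just that size decision; jumpSequenceWords is an illustrative stand-in, not backend code:

// Sketch of the size trade-off made by emitInstrForIndirectJump above
// (word counts only; the real code emits the instruction encodings).
#include <cassert>
#include <cstdint>

static unsigned jumpSequenceWords(int64_t Addr) {
  if (Addr >= -4096 && Addr < 4096) return 2;       // isInt<13>: jmpl + nop
  if (Addr >= 0 && Addr <= UINT32_MAX) return 3;    // isUInt<32>: sethi/jmpl/sub
  if (Addr < 0 && Addr >= -(1LL << 32)) return 4;   // negative, isInt<33>: sethi/xor/jmpl/sub
  return 6;                                         // rd %pc + ldx + jmpl + sub + 8-byte addr
}

int main() {
  assert(jumpSequenceWords(0x100) == 2);
  assert(jumpSequenceWords(0x12345678) == 3);
  assert(jumpSequenceWords(-0x80000000LL) == 4);
  assert(jumpSequenceWords(0x123456789ALL) == 6);
  return 0;
}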
extern "C" void *SparcCompilationCallbackC(intptr_t StubAddr) {
// Get the address of the compiled code for this function.
intptr_t NewVal = (intptr_t) JITCompilerFunction((void*) StubAddr);
// Rewrite the function stub so that we don't end up here every time we
- // execute the call. We're replacing the first three instructions of the
- // stub with code that jumps to the compiled function:
- // sethi %hi(NewVal), %g1
- // jmp %g1+%lo(NewVal)
- // nop
+ // execute the call. We're replacing the stub instructions with code
+ // that jumps to the compiled function:
+
+ SmallVector<uint32_t, 8> Insts;
+ intptr_t diff = (NewVal - StubAddr) >> 2;
+ if (isInt<22>(diff)) {
+ // Use branch instruction to jump
+ Insts.push_back(BA_INST(diff));
+ Insts.push_back(NOP_INST);
+ } else {
+ // Otherwise, use indirect jump to the compiled function
+ emitInstrForIndirectJump(NewVal, 1, Insts);
+ }
- *(intptr_t *)(StubAddr) = SETHI_INST(HI(NewVal), 1);
- *(intptr_t *)(StubAddr + 4) = JMP_INST(1, LO(NewVal), 0);
- *(intptr_t *)(StubAddr + 8) = NOP_INST;
+ for (unsigned i = 0, e = Insts.size(); i != e; ++i)
+ *(uint32_t *)(StubAddr + i*4) = Insts[i];
- sys::Memory::InvalidateInstructionCache((void*) StubAddr, 12);
+ sys::Memory::InvalidateInstructionCache((void*) StubAddr, Insts.size() * 4);
return (void*)StubAddr;
}
+
void SparcJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
assert(0 && "FIXME: Implement SparcJITInfo::replaceMachineCodeForFunction");
}
TargetJITInfo::StubLayout SparcJITInfo::getStubLayout() {
- // The stub contains 3 4-byte instructions, aligned at 4 bytes. See
- // emitFunctionStub for details.
-
- StubLayout Result = { 3*4, 4 };
+  // The stub contains at most four 4-byte instructions plus 8 bytes for the
+  // address, and is aligned to 32 bytes.
+ // See emitFunctionStub and emitInstrForIndirectJump for details.
+ StubLayout Result = { 4*4 + 8, 32 };
return Result;
}
void *SparcJITInfo::emitFunctionStub(const Function *F, void *Fn,
JITCodeEmitter &JCE)
{
- JCE.emitAlignment(4);
+ JCE.emitAlignment(32);
void *Addr = (void*) (JCE.getCurrentPCValue());
- if (!sys::Memory::setRangeWritable(Addr, 12))
- llvm_unreachable("ERROR: Unable to mark stub writable.");
+ intptr_t CurrentAddr = (intptr_t)Addr;
intptr_t EmittedAddr;
- if (Fn != (void*)(intptr_t)SparcCompilationCallback)
+ SmallVector<uint32_t, 8> Insts;
+ if (Fn != (void*)(intptr_t)SparcCompilationCallback) {
EmittedAddr = (intptr_t)Fn;
- else
+ intptr_t diff = (EmittedAddr - CurrentAddr) >> 2;
+ if (isInt<22>(diff)) {
+ Insts.push_back(BA_INST(diff));
+ Insts.push_back(NOP_INST);
+ }
+ } else {
EmittedAddr = (intptr_t)SparcCompilationCallback;
+ }
+
+ if (Insts.size() == 0)
+ emitInstrForIndirectJump(EmittedAddr, 1, Insts);
- // sethi %hi(EmittedAddr), %g1
- // jmp %g1+%lo(EmittedAddr), %g1
- // nop
- JCE.emitWordBE(SETHI_INST(HI(EmittedAddr), 1));
- JCE.emitWordBE(JMP_INST(1, LO(EmittedAddr), 1));
- JCE.emitWordBE(NOP_INST);
+ if (!sys::Memory::setRangeWritable(Addr, 4 * Insts.size()))
+ llvm_unreachable("ERROR: Unable to mark stub writable.");
+
+ for (unsigned i = 0, e = Insts.size(); i != e; ++i)
+ JCE.emitWordBE(Insts[i]);
- sys::Memory::InvalidateInstructionCache(Addr, 12);
- if (!sys::Memory::setRangeExecutable(Addr, 12))
+ sys::Memory::InvalidateInstructionCache(Addr, 4 * Insts.size());
+ if (!sys::Memory::setRangeExecutable(Addr, 4 * Insts.size()))
llvm_unreachable("ERROR: Unable to mark stub executable.");
return Addr;
}
+
TargetJITInfo::LazyResolverFn
SparcJITInfo::getLazyResolverFunction(JITCompilerFn F) {
JITCompilerFunction = F;
@@ -159,6 +297,27 @@ void SparcJITInfo::relocate(void *Function, MachineRelocation *MR,
case SP::reloc_sparc_pc19:
ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x7ffff;
break;
+
+ case SP::reloc_sparc_h44:
+ ResultPtr = (ResultPtr >> 22) & 0x3fffff;
+ break;
+
+ case SP::reloc_sparc_m44:
+ ResultPtr = (ResultPtr >> 12) & 0x3ff;
+ break;
+
+ case SP::reloc_sparc_l44:
+ ResultPtr = (ResultPtr & 0xfff);
+ break;
+
+ case SP::reloc_sparc_hh:
+ ResultPtr = (((int64_t)ResultPtr) >> 42) & 0x3fffff;
+ break;
+
+ case SP::reloc_sparc_hm:
+ ResultPtr = (((int64_t)ResultPtr) >> 32) & 0x3ff;
+ break;
+
}
*((unsigned*) RelocPos) |= (unsigned) ResultPtr;
}
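The new cases extract the %h44/%m44/%l44 and %hh/%hm fields of a target address; together with the existing 22-bit %hi and 10-bit %lo split they cover 44-bit and full 64-bit addresses. A small sketch, with illustrative field names, showing that the pieces recombine losslessly:

// Sketch: reassembling an address from the relocation fields computed above.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Addr = 0x71234567890ULL;        // fits in 44 bits
  uint64_t h44 = (Addr >> 22) & 0x3fffff;  // bits 43-22
  uint64_t m44 = (Addr >> 12) & 0x3ff;     // bits 21-12
  uint64_t l44 = Addr & 0xfff;             // bits 11-0
  assert(((h44 << 22) | (m44 << 12) | l44) == Addr);

  uint64_t Full = 0x123456789abcdef0ULL;
  uint64_t hh = (Full >> 42) & 0x3fffff;   // bits 63-42
  uint64_t hm = (Full >> 32) & 0x3ff;      // bits 41-32
  uint64_t hi = (Full >> 10) & 0x3fffff;   // %hi of the low 32 bits
  uint64_t lo = Full & 0x3ff;              // %lo
  assert(((hh << 42) | (hm << 32) | (hi << 10) | lo) == Full);
  return 0;
}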
diff --git a/lib/Target/Sparc/SparcMCInstLower.cpp b/lib/Target/Sparc/SparcMCInstLower.cpp
new file mode 100644
index 0000000..737e378
--- /dev/null
+++ b/lib/Target/Sparc/SparcMCInstLower.cpp
@@ -0,0 +1,109 @@
+//===-- SparcMCInstLower.cpp - Convert Sparc MachineInstr to MCInst -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Sparc MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+
+static MCOperand LowerSymbolOperand(const MachineInstr *MI,
+ const MachineOperand &MO,
+ AsmPrinter &AP) {
+
+ SparcMCExpr::VariantKind Kind =
+ (SparcMCExpr::VariantKind)MO.getTargetFlags();
+ const MCSymbol *Symbol = 0;
+
+ switch(MO.getType()) {
+ default: llvm_unreachable("Unknown type in LowerSymbolOperand");
+ case MachineOperand::MO_MachineBasicBlock:
+ Symbol = MO.getMBB()->getSymbol();
+ break;
+
+ case MachineOperand::MO_GlobalAddress:
+ Symbol = AP.getSymbol(MO.getGlobal());
+ break;
+
+ case MachineOperand::MO_BlockAddress:
+ Symbol = AP.GetBlockAddressSymbol(MO.getBlockAddress());
+ break;
+
+ case MachineOperand::MO_ExternalSymbol:
+ Symbol = AP.GetExternalSymbolSymbol(MO.getSymbolName());
+ break;
+
+ case MachineOperand::MO_ConstantPoolIndex:
+ Symbol = AP.GetCPISymbol(MO.getIndex());
+ break;
+ }
+
+ const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol,
+ AP.OutContext);
+ const SparcMCExpr *expr = SparcMCExpr::Create(Kind, MCSym,
+ AP.OutContext);
+ return MCOperand::CreateExpr(expr);
+}
+
+static MCOperand LowerOperand(const MachineInstr *MI,
+ const MachineOperand &MO,
+ AsmPrinter &AP) {
+ switch(MO.getType()) {
+ default: llvm_unreachable("unknown operand type"); break;
+ case MachineOperand::MO_Register:
+ if (MO.isImplicit())
+ break;
+ return MCOperand::CreateReg(MO.getReg());
+
+ case MachineOperand::MO_Immediate:
+ return MCOperand::CreateImm(MO.getImm());
+
+ case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_BlockAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_ConstantPoolIndex:
+ return LowerSymbolOperand(MI, MO, AP);
+
+ case MachineOperand::MO_RegisterMask: break;
+
+ }
+ return MCOperand();
+}
+
+void llvm::LowerSparcMachineInstrToMCInst(const MachineInstr *MI,
+ MCInst &OutMI,
+ AsmPrinter &AP)
+{
+
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ MCOperand MCOp = LowerOperand(MI, MO, AP);
+
+ if (MCOp.isValid())
+ OutMI.addOperand(MCOp);
+ }
+}
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index c98613a..f222382 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -35,7 +35,7 @@ ReserveAppRegisters("sparc-reserve-app-registers", cl::Hidden, cl::init(false),
cl::desc("Reserve application registers (%g2-%g4)"));
SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st)
- : SparcGenRegisterInfo(SP::I7), Subtarget(st) {
+ : SparcGenRegisterInfo(SP::O7), Subtarget(st) {
}
const uint16_t* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index 2a575c0..2cadff1 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -16,7 +16,8 @@ class SparcReg<bits<16> Enc, string n> : Register<n> {
let Namespace = "SP";
}
-class SparcCtrlReg<string n>: Register<n> {
+class SparcCtrlReg<bits<16> Enc, string n>: Register<n> {
+ let HWEncoding = Enc;
let Namespace = "SP";
}
@@ -49,11 +50,12 @@ class Rq<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
}
// Control Registers
-def ICC : SparcCtrlReg<"ICC">; // This represents icc and xcc in 64-bit code.
-def FCC : SparcCtrlReg<"FCC">;
+def ICC : SparcCtrlReg<0, "ICC">; // This represents icc and xcc in 64-bit code.
+foreach I = 0-3 in
+ def FCC#I : SparcCtrlReg<I, "FCC"#I>;
// Y register
-def Y : SparcCtrlReg<"Y">;
+def Y : SparcCtrlReg<0, "Y">, DwarfRegNum<[64]>;
// Integer registers
def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>;
@@ -204,3 +206,6 @@ def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>;
def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>;
+
+// Floating point control register classes.
+def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>;
diff --git a/lib/Target/Sparc/SparcRelocations.h b/lib/Target/Sparc/SparcRelocations.h
index 388cfe7..c1ff78d 100644
--- a/lib/Target/Sparc/SparcRelocations.h
+++ b/lib/Target/Sparc/SparcRelocations.h
@@ -33,7 +33,22 @@ namespace llvm {
reloc_sparc_pc22 = 4,
// reloc_sparc_pc19 - pc rel. 19 bits for branch with icc/xcc
- reloc_sparc_pc19 = 5
+ reloc_sparc_pc19 = 5,
+
+ // reloc_sparc_h44 - 43-22 bits
+ reloc_sparc_h44 = 6,
+
+ // reloc_sparc_m44 - 21-12 bits
+ reloc_sparc_m44 = 7,
+
+ // reloc_sparc_l44 - lower 12 bits
+ reloc_sparc_l44 = 8,
+
+ // reloc_sparc_hh - 63-42 bits
+ reloc_sparc_hh = 9,
+
+ // reloc_sparc_hm - 41-32 bits
+ reloc_sparc_hm = 10
};
}
}
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index 7d09d0e..6fc9d56 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -31,20 +31,20 @@ SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
V8DeprecatedInsts(false),
IsVIS(false),
Is64Bit(is64Bit),
- HasHardQuad(false) {
+ HasHardQuad(false),
+ UsePopc(false) {
// Determine default and user specified characteristics
std::string CPUName = CPU;
- if (CPUName.empty()) {
- if (is64Bit)
- CPUName = "v9";
- else
- CPUName = "v8";
- }
- IsV9 = CPUName == "v9";
+ if (CPUName.empty())
+ CPUName = (is64Bit) ? "v9" : "v8";
// Parse features string.
ParseSubtargetFeatures(CPUName, FS);
+
+ // Popc is a v9-only instruction.
+ if (!IsV9)
+ UsePopc = false;
}
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index 0f81cc9..4025622 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -27,9 +27,10 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
virtual void anchor();
bool IsV9;
bool V8DeprecatedInsts;
- bool IsVIS;
+ bool IsVIS, IsVIS2, IsVIS3;
bool Is64Bit;
bool HasHardQuad;
+ bool UsePopc;
public:
SparcSubtarget(const std::string &TT, const std::string &CPU,
@@ -37,23 +38,17 @@ public:
bool isV9() const { return IsV9; }
bool isVIS() const { return IsVIS; }
+ bool isVIS2() const { return IsVIS2; }
+ bool isVIS3() const { return IsVIS3; }
bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
bool hasHardQuad() const { return HasHardQuad; }
+ bool usePopc() const { return UsePopc; }
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
bool is64Bit() const { return Is64Bit; }
- std::string getDataLayout() const {
- const char *p;
- if (is64Bit()) {
- p = "E-p:64:64:64-i64:64:64-f64:64:64-f128:128:128-n32:64";
- } else {
- p = "E-p:32:32:32-i64:64:64-f64:64:64-f128:64:64-n32";
- }
- return std::string(p);
- }
/// The 64-bit ABI uses biased stack and frame pointers, so the stack frame
/// of the current function is the area from [%sp+BIAS] to [%fp+BIAS].
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 0f93674..83f3474 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -23,6 +23,32 @@ extern "C" void LLVMInitializeSparcTarget() {
RegisterTargetMachine<SparcV9TargetMachine> Y(TheSparcV9Target);
}
+static std::string computeDataLayout(const SparcSubtarget &ST) {
+ // Sparc is big endian.
+ std::string Ret = "E-m:e";
+
+  // Some ABIs have 32-bit pointers.
+ if (!ST.is64Bit())
+ Ret += "-p:32:32";
+
+  // Alignments for 64-bit integers.
+ Ret += "-i64:64";
+
+  // On SparcV9, 128-bit floats are aligned to 128 bits; on others, only to 64.
+  // On SparcV9, registers can hold 64 or 32 bits; on others, only 32.
+ if (ST.is64Bit())
+ Ret += "-n32:64";
+ else
+ Ret += "-f128:64-n32";
+
+ if (ST.is64Bit())
+ Ret += "-S128";
+ else
+ Ret += "-S64";
+
+ return Ret;
+}
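Following the branches above, computeDataLayout yields "E-m:e-p:32:32-i64:64-f128:64-n32-S64" for 32-bit subtargets and "E-m:e-i64:64-n32:64-S128" for 64-bit ones. A sketch that re-derives and checks the two strings; layoutFor is an illustrative stand-in:

// Sketch: follows the same branches as computeDataLayout above and checks
// the strings it produces for 32-bit (V8) and 64-bit (V9) subtargets.
#include <cassert>
#include <string>

static std::string layoutFor(bool Is64Bit) {
  std::string Ret = "E-m:e";
  if (!Is64Bit) Ret += "-p:32:32";
  Ret += "-i64:64";
  Ret += Is64Bit ? "-n32:64" : "-f128:64-n32";
  Ret += Is64Bit ? "-S128" : "-S64";
  return Ret;
}

int main() {
  assert(layoutFor(false) == "E-m:e-p:32:32-i64:64-f128:64-n32-S64");
  assert(layoutFor(true)  == "E-m:e-i64:64-n32:64-S128");
  return 0;
}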
+
/// SparcTargetMachine ctor - Create an ILP32 architecture model
///
SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
@@ -33,7 +59,7 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
bool is64bit)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS, is64bit),
- DL(Subtarget.getDataLayout()),
+ DL(computeDataLayout(Subtarget)),
InstrInfo(Subtarget),
TLInfo(*this), TSInfo(*this),
FrameLowering(Subtarget) {
diff --git a/lib/Target/Sparc/SparcTargetObjectFile.cpp b/lib/Target/Sparc/SparcTargetObjectFile.cpp
new file mode 100644
index 0000000..f1630e0
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -0,0 +1,43 @@
+//===------- SparcTargetObjectFile.cpp - Sparc Object Info Impl -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetObjectFile.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+ const TargetMachine &TM, MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const {
+
+ if (Encoding & dwarf::DW_EH_PE_pcrel) {
+ MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+ MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, ".DW.stub", Mang, TM);
+
+ // Add information about the stub reference to ELFMMI so that the stub
+ // gets emitted by the asmprinter.
+ MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym);
+ if (StubSym.getPointer() == 0) {
+ MCSymbol *Sym = TM.getSymbol(GV, Mang);
+ StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+ }
+
+ MCContext &Ctx = getContext();
+ return SparcMCExpr::Create(SparcMCExpr::VK_Sparc_R_DISP32,
+ MCSymbolRefExpr::Create(SSym, Ctx), Ctx);
+ }
+
+ return TargetLoweringObjectFileELF::getTTypeGlobalReference(
+ GV, Encoding, Mang, TM, MMI, Streamer);
+}
diff --git a/lib/Target/Sparc/SparcTargetObjectFile.h b/lib/Target/Sparc/SparcTargetObjectFile.h
new file mode 100644
index 0000000..c60675b
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetObjectFile.h
@@ -0,0 +1,35 @@
+//===-- SparcTargetObjectFile.h - Sparc Object Info -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SPARC_TARGETOBJECTFILE_H
+#define LLVM_TARGET_SPARC_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+class MCContext;
+class TargetMachine;
+
+class SparcELFTargetObjectFile : public TargetLoweringObjectFileELF {
+public:
+ SparcELFTargetObjectFile() :
+ TargetLoweringObjectFileELF()
+ {}
+
+ const MCExpr *
+ getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding,
+ Mangler &Mang, const TargetMachine &TM,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/SparcTargetStreamer.h b/lib/Target/Sparc/SparcTargetStreamer.h
new file mode 100644
index 0000000..503ebd9
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetStreamer.h
@@ -0,0 +1,49 @@
+//===-- SparcTargetStreamer.h - Sparc Target Streamer ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETSTREAMER_H
+#define SPARCTARGETSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class SparcTargetStreamer : public MCTargetStreamer {
+ virtual void anchor();
+
+public:
+ SparcTargetStreamer(MCStreamer &S);
+ /// Emit ".register <reg>, #ignore".
+ virtual void emitSparcRegisterIgnore(unsigned reg) = 0;
+ /// Emit ".register <reg>, #scratch".
+ virtual void emitSparcRegisterScratch(unsigned reg) = 0;
+};
+
+// This part is for ASCII assembly output
+class SparcTargetAsmStreamer : public SparcTargetStreamer {
+ formatted_raw_ostream &OS;
+
+public:
+ SparcTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+ virtual void emitSparcRegisterIgnore(unsigned reg);
+ virtual void emitSparcRegisterScratch(unsigned reg);
+
+};
+
+// This part is for ELF object output
+class SparcTargetELFStreamer : public SparcTargetStreamer {
+public:
+ SparcTargetELFStreamer(MCStreamer &S);
+ MCELFStreamer &getStreamer();
+ virtual void emitSparcRegisterIgnore(unsigned reg) {}
+ virtual void emitSparcRegisterScratch(unsigned reg) {}
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/TargetInfo/CMakeLists.txt b/lib/Target/Sparc/TargetInfo/CMakeLists.txt
index b0d031e..9633e03 100644
--- a/lib/Target/Sparc/TargetInfo/CMakeLists.txt
+++ b/lib/Target/Sparc/TargetInfo/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMSparcInfo
SparcTargetInfo.cpp
)
-
-add_dependencies(LLVMSparcInfo SparcCommonTableGen)
diff --git a/lib/Target/Sparc/TargetInfo/LLVMBuild.txt b/lib/Target/Sparc/TargetInfo/LLVMBuild.txt
index b5c320f..e992d3e 100644
--- a/lib/Target/Sparc/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/Sparc/TargetInfo/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = SparcInfo
parent = Sparc
-required_libraries = MC Support Target
+required_libraries = Support
add_to_library_groups = Sparc
diff --git a/lib/Target/SystemZ/AsmParser/CMakeLists.txt b/lib/Target/SystemZ/AsmParser/CMakeLists.txt
index 78a5714..ad19a56 100644
--- a/lib/Target/SystemZ/AsmParser/CMakeLists.txt
+++ b/lib/Target/SystemZ/AsmParser/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMSystemZAsmParser
SystemZAsmParser.cpp
)
-
-add_dependencies(LLVMSystemZAsmParser SystemZCommonTableGen)
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 763f40c..a3dd4b6 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -22,7 +22,7 @@ using namespace llvm;
// Return true if Expr is in the range [MinValue, MaxValue].
static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) {
- if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) {
+ if (auto *CE = dyn_cast<MCConstantExpr>(Expr)) {
int64_t Value = CE->getValue();
return Value >= MinValue && Value <= MaxValue;
}
@@ -112,7 +112,7 @@ private:
// Add as immediates when possible. Null MCExpr = 0.
if (Expr == 0)
Inst.addOperand(MCOperand::CreateImm(0));
- else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ else if (auto *CE = dyn_cast<MCConstantExpr>(Expr))
Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
else
Inst.addOperand(MCOperand::CreateExpr(Expr));
@@ -162,7 +162,7 @@ public:
}
// Token operands
- virtual bool isToken() const LLVM_OVERRIDE {
+ bool isToken() const override {
return Kind == KindToken;
}
StringRef getToken() const {
@@ -171,13 +171,13 @@ public:
}
// Register operands.
- virtual bool isReg() const LLVM_OVERRIDE {
+ bool isReg() const override {
return Kind == KindReg;
}
bool isReg(RegisterKind RegKind) const {
return Kind == KindReg && Reg.Kind == RegKind;
}
- virtual unsigned getReg() const LLVM_OVERRIDE {
+ unsigned getReg() const override {
assert(Kind == KindReg && "Not a register");
return Reg.Num;
}
@@ -189,7 +189,7 @@ public:
}
// Immediate operands.
- virtual bool isImm() const LLVM_OVERRIDE {
+ bool isImm() const override {
return Kind == KindImm;
}
bool isImm(int64_t MinValue, int64_t MaxValue) const {
@@ -201,7 +201,7 @@ public:
}
// Memory operands.
- virtual bool isMem() const LLVM_OVERRIDE {
+ bool isMem() const override {
return Kind == KindMem;
}
bool isMem(RegisterKind RegKind, MemoryKind MemKind) const {
@@ -221,9 +221,9 @@ public:
}
// Override MCParsedAsmOperand.
- virtual SMLoc getStartLoc() const LLVM_OVERRIDE { return StartLoc; }
- virtual SMLoc getEndLoc() const LLVM_OVERRIDE { return EndLoc; }
- virtual void print(raw_ostream &OS) const LLVM_OVERRIDE;
+ SMLoc getStartLoc() const override { return StartLoc; }
+ SMLoc getEndLoc() const override { return EndLoc; }
+ void print(raw_ostream &OS) const override;
// Used by the TableGen code to add particular types of operand
// to an instruction.
@@ -340,18 +340,16 @@ public:
}
// Override MCTargetAsmParser.
- virtual bool ParseDirective(AsmToken DirectiveID) LLVM_OVERRIDE;
- virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
- SMLoc &EndLoc) LLVM_OVERRIDE;
- virtual bool ParseInstruction(ParseInstructionInfo &Info,
- StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands)
- LLVM_OVERRIDE;
- virtual bool
- MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm) LLVM_OVERRIDE;
+ bool ParseDirective(AsmToken DirectiveID) override;
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ bool ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands)
+ override;
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) override;
// Used by the TableGen code to parse particular operand types.
OperandMatchResultTy
@@ -428,7 +426,7 @@ public:
return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1);
}
};
-}
+} // end anonymous namespace
#define GET_REGISTER_MATCHER
#define GET_SUBTARGET_FEATURE_NAME
@@ -715,7 +713,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
default: break;
case Match_Success:
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst);
+ Out.EmitInstruction(Inst, STI);
return false;
case Match_MissingFeature: {
@@ -781,7 +779,7 @@ parsePCRel(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
// For consistency with the GNU assembler, treat immediates as offsets
// from ".".
- if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) {
+ if (auto *CE = dyn_cast<MCConstantExpr>(Expr)) {
int64_t Value = CE->getValue();
if ((Value & 1) || Value < MinVal || Value > MaxVal) {
Error(StartLoc, "offset out of range");
diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
index d21c0a8..4da2d0f 100644
--- a/lib/Target/SystemZ/CMakeLists.txt
+++ b/lib/Target/SystemZ/CMakeLists.txt
@@ -30,8 +30,6 @@ add_llvm_target(SystemZCodeGen
SystemZTargetMachine.cpp
)
-add_dependencies(LLVMSystemZCodeGen SystemZCommonTableGen intrinsics_gen)
-
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
diff --git a/lib/Target/SystemZ/Disassembler/CMakeLists.txt b/lib/Target/SystemZ/Disassembler/CMakeLists.txt
index 5bc1859..4b94bcd 100644
--- a/lib/Target/SystemZ/Disassembler/CMakeLists.txt
+++ b/lib/Target/SystemZ/Disassembler/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMSystemZDisassembler
SystemZDisassembler.cpp
)
-
-add_dependencies(LLVMSystemZDisassembler SystemZCommonTableGen)
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index fc3c38d..59a1fe9 100644
--- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -27,12 +27,10 @@ public:
virtual ~SystemZDisassembler() {}
// Override MCDisassembler.
- virtual DecodeStatus getInstruction(MCInst &instr,
- uint64_t &size,
- const MemoryObject &region,
- uint64_t address,
- raw_ostream &vStream,
- raw_ostream &cStream) const LLVM_OVERRIDE;
+ DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+ const MemoryObject &region, uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const override;
};
} // end anonymous namespace
diff --git a/lib/Target/SystemZ/InstPrinter/CMakeLists.txt b/lib/Target/SystemZ/InstPrinter/CMakeLists.txt
index ddbf82f..21b023c 100644
--- a/lib/Target/SystemZ/InstPrinter/CMakeLists.txt
+++ b/lib/Target/SystemZ/InstPrinter/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMSystemZAsmPrinter
SystemZInstPrinter.cpp
)
-
-add_dependencies(LLVMSystemZAsmPrinter SystemZCommonTableGen)
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
index 734ecf0..dce482b 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -38,10 +38,8 @@ public:
static void printOperand(const MCOperand &MO, raw_ostream &O);
// Override MCInstPrinter.
- virtual void printRegName(raw_ostream &O, unsigned RegNo) const
- LLVM_OVERRIDE;
- virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot)
- LLVM_OVERRIDE;
+ void printRegName(raw_ostream &O, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
private:
// Print various types of operand.
diff --git a/lib/Target/SystemZ/LLVMBuild.txt b/lib/Target/SystemZ/LLVMBuild.txt
index 95e657f..7781318 100644
--- a/lib/Target/SystemZ/LLVMBuild.txt
+++ b/lib/Target/SystemZ/LLVMBuild.txt
@@ -31,5 +31,5 @@ has_jit = 1
type = Library
name = SystemZCodeGen
parent = SystemZ
-required_libraries = AsmPrinter CodeGen Core MC SelectionDAG SystemZDesc SystemZInfo Support Target
+required_libraries = AsmPrinter CodeGen Core MC Scalar SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target
add_to_library_groups = SystemZ
diff --git a/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt b/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
index 3d13128..1aa8c76 100644
--- a/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
@@ -5,5 +5,3 @@ add_llvm_library(LLVMSystemZDesc
SystemZMCObjectWriter.cpp
SystemZMCTargetDesc.cpp
)
-
-add_dependencies(LLVMSystemZDesc SystemZCommonTableGen)
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 26a8fae..6e7268d 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -43,37 +43,27 @@ public:
: OSABI(osABI) {}
// Override MCAsmBackend
- virtual unsigned getNumFixupKinds() const LLVM_OVERRIDE {
+ unsigned getNumFixupKinds() const override {
return SystemZ::NumTargetFixupKinds;
}
- virtual const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const
- LLVM_OVERRIDE;
- virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value) const LLVM_OVERRIDE;
- virtual bool mayNeedRelaxation(const MCInst &Inst) const LLVM_OVERRIDE {
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+ void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value, bool IsPCRel) const override;
+ bool mayNeedRelaxation(const MCInst &Inst) const override {
return false;
}
- virtual bool fixupNeedsRelaxation(const MCFixup &Fixup,
- uint64_t Value,
- const MCRelaxableFragment *Fragment,
- const MCAsmLayout &Layout) const
- LLVM_OVERRIDE {
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *Fragment,
+ const MCAsmLayout &Layout) const override {
return false;
}
- virtual void relaxInstruction(const MCInst &Inst,
- MCInst &Res) const LLVM_OVERRIDE {
+ void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
llvm_unreachable("SystemZ does do not have assembler relaxation");
}
- virtual bool writeNopData(uint64_t Count,
- MCObjectWriter *OW) const LLVM_OVERRIDE;
- virtual MCObjectWriter *createObjectWriter(raw_ostream &OS) const
- LLVM_OVERRIDE {
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
return createSystemZObjectWriter(OS, OSABI);
}
- virtual bool doesSectionRequireSymbols(const MCSection &Section) const
- LLVM_OVERRIDE {
- return false;
- }
};
} // end anonymous namespace
@@ -95,7 +85,8 @@ SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
}
void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value) const {
+ unsigned DataSize, uint64_t Value,
+ bool IsPCRel) const {
MCFixupKind Kind = Fixup.getKind();
unsigned Offset = Fixup.getOffset();
unsigned Size = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index 965c41e..c46a36b 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -19,8 +19,6 @@ SystemZMCAsmInfo::SystemZMCAsmInfo(StringRef TT) {
IsLittleEndian = false;
CommentString = "#";
- GlobalPrefix = "";
- PrivateGlobalPrefix = ".L";
ZeroDirective = "\t.space\t";
Data64bitsDirective = "\t.quad\t";
UsesELFSectionDirectiveForBSS = true;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index b9ac92a..1de97af 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -21,10 +21,9 @@ public:
explicit SystemZMCAsmInfo(StringRef TT);
// Override MCAsmInfo;
- virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const
- LLVM_OVERRIDE;
+ const MCSection *getNonexecutableStackSection(MCContext &Ctx) const override;
};
-} // namespace llvm
+} // end namespace llvm
#endif
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index f07ea7b..df50863 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -34,34 +34,41 @@ public:
~SystemZMCCodeEmitter() {}
// Override MCCodeEmitter.
- virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const
- LLVM_OVERRIDE;
+ void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
private:
// Automatically generated by TableGen.
uint64_t getBinaryCodeForInstr(const MCInst &MI,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
// Called by the TableGen code to get the binary encoding of operand
// MO in MI. Fixups is the list of fixups against MI.
uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
// Called by the TableGen code to get the binary encoding of an address.
// The index or length, if any, is encoded first, followed by the base,
// followed by the displacement. In a 20-bit displacement,
// the low 12 bits are encoded before the high 8 bits.
uint64_t getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint64_t getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint64_t getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
// Operand OpNum of MI needs a PC-relative fixup of kind Kind at
// Offset bytes from the start of MI. Add the fixup to Fixups
@@ -72,15 +79,17 @@ private:
unsigned Kind, int64_t Offset) const;
uint64_t getPC16DBLEncoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC16DBL, 2);
}
uint64_t getPC32DBLEncoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC32DBL, 2);
}
};
-}
+} // end anonymous namespace
MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
@@ -91,8 +100,9 @@ MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
void SystemZMCCodeEmitter::
EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const {
- uint64_t Bits = getBinaryCodeForInstr(MI, Fixups);
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
unsigned Size = MCII.get(MI.getOpcode()).getSize();
// Big-endian insertion of Size bytes.
unsigned ShiftValue = (Size * 8) - 8;
@@ -104,7 +114,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
uint64_t SystemZMCCodeEmitter::
getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
if (MO.isReg())
return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
if (MO.isImm())
@@ -114,38 +125,42 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
uint64_t SystemZMCCodeEmitter::
getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
- uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
assert(isUInt<4>(Base) && isUInt<12>(Disp));
return (Base << 12) | Disp;
}
uint64_t SystemZMCCodeEmitter::
getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
- uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
assert(isUInt<4>(Base) && isInt<20>(Disp));
return (Base << 20) | ((Disp & 0xfff) << 8) | ((Disp & 0xff000) >> 12);
}
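As the comment above these declarations says, a 20-bit displacement is emitted with its low 12 bits ahead of its high 8 bits. A short sketch of that shuffle and its inverse for one value; the field layout follows getBDAddr20Encoding above:

// Sketch: the low-12/high-8 swap performed by getBDAddr20Encoding, plus its inverse.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Base = 0xB;   // 4-bit base register field
  int64_t  Disp = -4;    // 0xFFFFC as a 20-bit field
  uint64_t Enc = (Base << 20) | ((Disp & 0xfff) << 8) | ((Disp & 0xff000) >> 12);
  assert(Enc == 0xBFFCFF);  // bits 23-20 base, 19-8 low 12 of disp, 7-0 high 8

  // Decoding: put the two displacement pieces back in order.
  uint64_t Low12 = (Enc >> 8) & 0xfff, High8 = Enc & 0xff;
  assert(((High8 << 12) | Low12) == (Disp & 0xfffff));
  return 0;
}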
uint64_t SystemZMCCodeEmitter::
getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
- uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
- uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups);
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Index));
return (Index << 16) | (Base << 12) | Disp;
}
uint64_t SystemZMCCodeEmitter::
getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
- uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
- uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups);
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
assert(isUInt<4>(Base) && isInt<20>(Disp) && isUInt<4>(Index));
return (Index << 24) | (Base << 20) | ((Disp & 0xfff) << 8)
| ((Disp & 0xff000) >> 12);
@@ -153,10 +168,11 @@ getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
uint64_t SystemZMCCodeEmitter::
getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
- uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
- uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups) - 1;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1;
assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<8>(Len));
return (Len << 16) | (Base << 12) | Disp;
}
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
index 9c94ebb..a3aab71 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
@@ -14,18 +14,18 @@
namespace llvm {
namespace SystemZ {
- enum FixupKind {
- // These correspond directly to R_390_* relocations.
- FK_390_PC16DBL = FirstTargetFixupKind,
- FK_390_PC32DBL,
- FK_390_PLT16DBL,
- FK_390_PLT32DBL,
+enum FixupKind {
+ // These correspond directly to R_390_* relocations.
+ FK_390_PC16DBL = FirstTargetFixupKind,
+ FK_390_PC32DBL,
+ FK_390_PLT16DBL,
+ FK_390_PLT32DBL,
- // Marker
- LastTargetFixupKind,
- NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
- };
-}
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // end namespace SystemZ
} // end namespace llvm
#endif
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index 36e3d83..54c6987 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -24,16 +24,10 @@ public:
protected:
// Override MCELFObjectTargetWriter.
- virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel, bool IsRelocWithSymbol,
- int64_t Addend) const LLVM_OVERRIDE;
- virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
- const MCValue &Target,
- const MCFragment &F,
- const MCFixup &Fixup,
- bool IsPCRel) const LLVM_OVERRIDE;
+ unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel) const override;
};
-} // end anonymouse namespace
+} // end anonymous namespace
SystemZObjectWriter::SystemZObjectWriter(uint8_t OSABI)
: MCELFObjectTargetWriter(/*Is64Bit=*/true, OSABI, ELF::EM_S390,
@@ -87,12 +81,8 @@ static unsigned getPLTReloc(unsigned Kind) {
unsigned SystemZObjectWriter::GetRelocType(const MCValue &Target,
const MCFixup &Fixup,
- bool IsPCRel,
- bool IsRelocWithSymbol,
- int64_t Addend) const {
- MCSymbolRefExpr::VariantKind Modifier = (Target.isAbsolute() ?
- MCSymbolRefExpr::VK_None :
- Target.getSymA()->getKind());
+ bool IsPCRel) const {
+ MCSymbolRefExpr::VariantKind Modifier = Fixup.getAccessVariant();
unsigned Kind = Fixup.getKind();
switch (Modifier) {
case MCSymbolRefExpr::VK_None:
@@ -118,21 +108,6 @@ unsigned SystemZObjectWriter::GetRelocType(const MCValue &Target,
}
}
-const MCSymbol *SystemZObjectWriter::ExplicitRelSym(const MCAssembler &Asm,
- const MCValue &Target,
- const MCFragment &F,
- const MCFixup &Fixup,
- bool IsPCRel) const {
- // The addend in a PC-relative R_390_* relocation is always applied to
- // the PC-relative part of the address. If some kind of indirection
- // is applied to the symbol first, we can't use an addend there too.
- if (!Target.isAbsolute() &&
- Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None &&
- IsPCRel)
- return &Target.getSymA()->getSymbol().AliasedSymbol();
- return NULL;
-}
-
MCObjectWriter *llvm::createSystemZObjectWriter(raw_ostream &OS,
uint8_t OSABI) {
MCELFObjectTargetWriter *MOTW = new SystemZObjectWriter(OSABI);
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 9e1296b..8d1bac9 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -185,9 +185,10 @@ static MCStreamer *createSystemZMCObjectStreamer(const Target &T, StringRef TT,
MCAsmBackend &MAB,
raw_ostream &OS,
MCCodeEmitter *Emitter,
+ const MCSubtargetInfo &STI,
bool RelaxAll,
bool NoExecStack) {
- return createELFStreamer(Ctx, 0, MAB, OS, Emitter, RelaxAll, NoExecStack);
+ return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
}
extern "C" void LLVMInitializeSystemZTargetMC() {
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index 97e325b..cbaf9a8 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -28,47 +28,47 @@ class raw_ostream;
extern Target TheSystemZTarget;
namespace SystemZMC {
- // How many bytes are in the ABI-defined, caller-allocated part of
- // a stack frame.
- const int64_t CallFrameSize = 160;
-
- // The offset of the DWARF CFA from the incoming stack pointer.
- const int64_t CFAOffsetFromInitialSP = CallFrameSize;
-
- // Maps of asm register numbers to LLVM register numbers, with 0 indicating
- // an invalid register. In principle we could use 32-bit and 64-bit register
- // classes directly, provided that we relegated the GPR allocation order
- // in SystemZRegisterInfo.td to an AltOrder and left the default order
- // as %r0-%r15. It seems better to provide the same interface for
- // all classes though.
- extern const unsigned GR32Regs[16];
- extern const unsigned GRH32Regs[16];
- extern const unsigned GR64Regs[16];
- extern const unsigned GR128Regs[16];
- extern const unsigned FP32Regs[16];
- extern const unsigned FP64Regs[16];
- extern const unsigned FP128Regs[16];
-
- // Return the 0-based number of the first architectural register that
- // contains the given LLVM register. E.g. R1D -> 1.
- unsigned getFirstReg(unsigned Reg);
-
- // Return the given register as a GR64.
- inline unsigned getRegAsGR64(unsigned Reg) {
- return GR64Regs[getFirstReg(Reg)];
- }
-
- // Return the given register as a low GR32.
- inline unsigned getRegAsGR32(unsigned Reg) {
- return GR32Regs[getFirstReg(Reg)];
- }
-
- // Return the given register as a high GR32.
- inline unsigned getRegAsGRH32(unsigned Reg) {
- return GRH32Regs[getFirstReg(Reg)];
- }
+// How many bytes are in the ABI-defined, caller-allocated part of
+// a stack frame.
+const int64_t CallFrameSize = 160;
+
+// The offset of the DWARF CFA from the incoming stack pointer.
+const int64_t CFAOffsetFromInitialSP = CallFrameSize;
+
+// Maps of asm register numbers to LLVM register numbers, with 0 indicating
+// an invalid register. In principle we could use 32-bit and 64-bit register
+// classes directly, provided that we relegated the GPR allocation order
+// in SystemZRegisterInfo.td to an AltOrder and left the default order
+// as %r0-%r15. It seems better to provide the same interface for
+// all classes though.
+extern const unsigned GR32Regs[16];
+extern const unsigned GRH32Regs[16];
+extern const unsigned GR64Regs[16];
+extern const unsigned GR128Regs[16];
+extern const unsigned FP32Regs[16];
+extern const unsigned FP64Regs[16];
+extern const unsigned FP128Regs[16];
+
+// Return the 0-based number of the first architectural register that
+// contains the given LLVM register. E.g. R1D -> 1.
+unsigned getFirstReg(unsigned Reg);
+
+// Return the given register as a GR64.
+inline unsigned getRegAsGR64(unsigned Reg) {
+ return GR64Regs[getFirstReg(Reg)];
}
+// Return the given register as a low GR32.
+inline unsigned getRegAsGR32(unsigned Reg) {
+ return GR32Regs[getFirstReg(Reg)];
+}
+
+// Return the given register as a high GR32.
+inline unsigned getRegAsGRH32(unsigned Reg) {
+ return GRH32Regs[getFirstReg(Reg)];
+}
+} // end namespace SystemZMC
+
MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt
index afa6cf0..e089047 100644
--- a/lib/Target/SystemZ/README.txt
+++ b/lib/Target/SystemZ/README.txt
@@ -166,13 +166,6 @@ See CodeGen/SystemZ/alloca-01.ll for an example.
--
-Atomic loads and stores use the default compare-and-swap based implementation.
-This is much too conservative in practice, since the architecture guarantees
-that 1-, 2-, 4- and 8-byte loads and stores to aligned addresses are
-inherently atomic.
-
---
-
If needed, we can support 16-byte atomics using LPQ, STPQ and CSDG.
--
diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
index dcebbad..1579249 100644
--- a/lib/Target/SystemZ/SystemZ.h
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -19,97 +19,98 @@
#include "llvm/Support/CodeGen.h"
namespace llvm {
- class SystemZTargetMachine;
- class FunctionPass;
-
- namespace SystemZ {
- // Condition-code mask values.
- const unsigned CCMASK_0 = 1 << 3;
- const unsigned CCMASK_1 = 1 << 2;
- const unsigned CCMASK_2 = 1 << 1;
- const unsigned CCMASK_3 = 1 << 0;
- const unsigned CCMASK_ANY = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
-
- // Condition-code mask assignments for integer and floating-point
- // comparisons.
- const unsigned CCMASK_CMP_EQ = CCMASK_0;
- const unsigned CCMASK_CMP_LT = CCMASK_1;
- const unsigned CCMASK_CMP_GT = CCMASK_2;
- const unsigned CCMASK_CMP_NE = CCMASK_CMP_LT | CCMASK_CMP_GT;
- const unsigned CCMASK_CMP_LE = CCMASK_CMP_EQ | CCMASK_CMP_LT;
- const unsigned CCMASK_CMP_GE = CCMASK_CMP_EQ | CCMASK_CMP_GT;
-
- // Condition-code mask assignments for floating-point comparisons only.
- const unsigned CCMASK_CMP_UO = CCMASK_3;
- const unsigned CCMASK_CMP_O = CCMASK_ANY ^ CCMASK_CMP_UO;
-
- // All condition-code values produced by comparisons.
- const unsigned CCMASK_ICMP = CCMASK_0 | CCMASK_1 | CCMASK_2;
- const unsigned CCMASK_FCMP = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
-
- // Condition-code mask assignments for CS.
- const unsigned CCMASK_CS_EQ = CCMASK_0;
- const unsigned CCMASK_CS_NE = CCMASK_1;
- const unsigned CCMASK_CS = CCMASK_0 | CCMASK_1;
-
- // Condition-code mask assignments for a completed SRST loop.
- const unsigned CCMASK_SRST_FOUND = CCMASK_1;
- const unsigned CCMASK_SRST_NOTFOUND = CCMASK_2;
- const unsigned CCMASK_SRST = CCMASK_1 | CCMASK_2;
-
- // Condition-code mask assignments for TEST UNDER MASK.
- const unsigned CCMASK_TM_ALL_0 = CCMASK_0;
- const unsigned CCMASK_TM_MIXED_MSB_0 = CCMASK_1;
- const unsigned CCMASK_TM_MIXED_MSB_1 = CCMASK_2;
- const unsigned CCMASK_TM_ALL_1 = CCMASK_3;
- const unsigned CCMASK_TM_SOME_0 = CCMASK_TM_ALL_1 ^ CCMASK_ANY;
- const unsigned CCMASK_TM_SOME_1 = CCMASK_TM_ALL_0 ^ CCMASK_ANY;
- const unsigned CCMASK_TM_MSB_0 = CCMASK_0 | CCMASK_1;
- const unsigned CCMASK_TM_MSB_1 = CCMASK_2 | CCMASK_3;
- const unsigned CCMASK_TM = CCMASK_ANY;
-
- // The position of the low CC bit in an IPM result.
- const unsigned IPM_CC = 28;
-
- // Mask assignments for PFD.
- const unsigned PFD_READ = 1;
- const unsigned PFD_WRITE = 2;
-
- // Return true if Val fits an LLILL operand.
- static inline bool isImmLL(uint64_t Val) {
- return (Val & ~0x000000000000ffffULL) == 0;
- }
-
- // Return true if Val fits an LLILH operand.
- static inline bool isImmLH(uint64_t Val) {
- return (Val & ~0x00000000ffff0000ULL) == 0;
- }
-
- // Return true if Val fits an LLIHL operand.
- static inline bool isImmHL(uint64_t Val) {
-    return (Val & ~0x0000ffff00000000ULL) == 0;
- }
-
- // Return true if Val fits an LLIHH operand.
- static inline bool isImmHH(uint64_t Val) {
- return (Val & ~0xffff000000000000ULL) == 0;
- }
-
- // Return true if Val fits an LLILF operand.
- static inline bool isImmLF(uint64_t Val) {
- return (Val & ~0x00000000ffffffffULL) == 0;
- }
-
- // Return true if Val fits an LLIHF operand.
- static inline bool isImmHF(uint64_t Val) {
- return (Val & ~0xffffffff00000000ULL) == 0;
- }
- }
-
- FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
- CodeGenOpt::Level OptLevel);
- FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
- FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
- FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
-} // end namespace llvm;
+class SystemZTargetMachine;
+class FunctionPass;
+
+namespace SystemZ {
+// Condition-code mask values.
+const unsigned CCMASK_0 = 1 << 3;
+const unsigned CCMASK_1 = 1 << 2;
+const unsigned CCMASK_2 = 1 << 1;
+const unsigned CCMASK_3 = 1 << 0;
+const unsigned CCMASK_ANY = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
+
+// Condition-code mask assignments for integer and floating-point
+// comparisons.
+const unsigned CCMASK_CMP_EQ = CCMASK_0;
+const unsigned CCMASK_CMP_LT = CCMASK_1;
+const unsigned CCMASK_CMP_GT = CCMASK_2;
+const unsigned CCMASK_CMP_NE = CCMASK_CMP_LT | CCMASK_CMP_GT;
+const unsigned CCMASK_CMP_LE = CCMASK_CMP_EQ | CCMASK_CMP_LT;
+const unsigned CCMASK_CMP_GE = CCMASK_CMP_EQ | CCMASK_CMP_GT;
+
+// Condition-code mask assignments for floating-point comparisons only.
+const unsigned CCMASK_CMP_UO = CCMASK_3;
+const unsigned CCMASK_CMP_O = CCMASK_ANY ^ CCMASK_CMP_UO;
+
+// All condition-code values produced by comparisons.
+const unsigned CCMASK_ICMP = CCMASK_0 | CCMASK_1 | CCMASK_2;
+const unsigned CCMASK_FCMP = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
+
+// Condition-code mask assignments for CS.
+const unsigned CCMASK_CS_EQ = CCMASK_0;
+const unsigned CCMASK_CS_NE = CCMASK_1;
+const unsigned CCMASK_CS = CCMASK_0 | CCMASK_1;
+
+// Condition-code mask assignments for a completed SRST loop.
+const unsigned CCMASK_SRST_FOUND = CCMASK_1;
+const unsigned CCMASK_SRST_NOTFOUND = CCMASK_2;
+const unsigned CCMASK_SRST = CCMASK_1 | CCMASK_2;
+
+// Condition-code mask assignments for TEST UNDER MASK.
+const unsigned CCMASK_TM_ALL_0 = CCMASK_0;
+const unsigned CCMASK_TM_MIXED_MSB_0 = CCMASK_1;
+const unsigned CCMASK_TM_MIXED_MSB_1 = CCMASK_2;
+const unsigned CCMASK_TM_ALL_1 = CCMASK_3;
+const unsigned CCMASK_TM_SOME_0 = CCMASK_TM_ALL_1 ^ CCMASK_ANY;
+const unsigned CCMASK_TM_SOME_1 = CCMASK_TM_ALL_0 ^ CCMASK_ANY;
+const unsigned CCMASK_TM_MSB_0 = CCMASK_0 | CCMASK_1;
+const unsigned CCMASK_TM_MSB_1 = CCMASK_2 | CCMASK_3;
+const unsigned CCMASK_TM = CCMASK_ANY;
+
+// The position of the low CC bit in an IPM result.
+const unsigned IPM_CC = 28;
+
+// Mask assignments for PFD.
+const unsigned PFD_READ = 1;
+const unsigned PFD_WRITE = 2;
+
+// Return true if Val fits an LLILL operand.
+static inline bool isImmLL(uint64_t Val) {
+ return (Val & ~0x000000000000ffffULL) == 0;
+}
+
+// Return true if Val fits an LLILH operand.
+static inline bool isImmLH(uint64_t Val) {
+ return (Val & ~0x00000000ffff0000ULL) == 0;
+}
+
+// Return true if Val fits an LLIHL operand.
+static inline bool isImmHL(uint64_t Val) {
+  return (Val & ~0x0000ffff00000000ULL) == 0;
+}
+
+// Return true if Val fits an LLIHH operand.
+static inline bool isImmHH(uint64_t Val) {
+ return (Val & ~0xffff000000000000ULL) == 0;
+}
+
+// Return true if Val fits an LLILF operand.
+static inline bool isImmLF(uint64_t Val) {
+ return (Val & ~0x00000000ffffffffULL) == 0;
+}
+
+// Return true if Val fits an LLIHF operand.
+static inline bool isImmHF(uint64_t Val) {
+ return (Val & ~0xffffffff00000000ULL) == 0;
+}
+} // end namespace SystemZ
+
+FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
+} // end namespace llvm
+
#endif
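To make the role of the isImm* predicates concrete: each one tests whether a value occupies only the 16- or 32-bit field that one of the logical load-immediate instructions can set. A hypothetical helper, not part of the patch (it assumes this SystemZ.h is on the include path), that picks the single-instruction form if any:

#include "SystemZ.h"   // the header shown above; path assumed
#include <cstdint>
#include <string>
using namespace llvm;

// Returns the LLI* mnemonic that can materialize Val in one instruction,
// or an empty string if no single logical load-immediate fits.
static std::string pickLoadImmediate(uint64_t Val) {
  if (SystemZ::isImmLL(Val)) return "llill";  // only bits 0-15 set
  if (SystemZ::isImmLH(Val)) return "llilh";  // only bits 16-31 set
  if (SystemZ::isImmHL(Val)) return "llihl";  // only bits 32-47 set
  if (SystemZ::isImmHH(Val)) return "llihh";  // only bits 48-63 set
  if (SystemZ::isImmLF(Val)) return "llilf";  // only bits 0-31 set
  if (SystemZ::isImmHF(Val)) return "llihf";  // only bits 32-63 set
  return "";
}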
diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td
index abf5c8e..5f82903 100644
--- a/lib/Target/SystemZ/SystemZ.td
+++ b/lib/Target/SystemZ/SystemZ.td
@@ -53,20 +53,10 @@ def SystemZAsmParser : AsmParser {
}
//===----------------------------------------------------------------------===//
-// Assembly writer
-//===----------------------------------------------------------------------===//
-
-def SystemZAsmWriter : AsmWriter {
- string AsmWriterClassName = "InstPrinter";
- bit isMCAsmWriter = 1;
-}
-
-//===----------------------------------------------------------------------===//
// Top-level target declaration
//===----------------------------------------------------------------------===//
def SystemZ : Target {
let InstructionSet = SystemZInstrInfo;
let AssemblyParsers = [SystemZAsmParser];
- let AssemblyWriters = [SystemZAsmWriter];
}
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 75cbda4..8b18bc1 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -18,11 +18,11 @@
#include "SystemZMCInstLower.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/Mangler.h"
using namespace llvm;
@@ -151,11 +151,20 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
#undef LOWER_HIGH
+ case SystemZ::Serialize:
+ if (Subtarget->hasFastSerialization())
+ LoweredMI = MCInstBuilder(SystemZ::AsmBCR)
+ .addImm(14).addReg(SystemZ::R0D);
+ else
+ LoweredMI = MCInstBuilder(SystemZ::AsmBCR)
+ .addImm(15).addReg(SystemZ::R0D);
+ break;
+
default:
Lower.lower(MI, LoweredMI);
break;
}
- OutStreamer.EmitInstruction(LoweredMI);
+ EmitToStreamer(OutStreamer, LoweredMI);
}
// Convert a SystemZ-specific constant pool modifier into the associated
@@ -170,8 +179,7 @@ getModifierVariantKind(SystemZCP::SystemZCPModifier Modifier) {
void SystemZAsmPrinter::
EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
- SystemZConstantPoolValue *ZCPV =
- static_cast<SystemZConstantPoolValue*>(MCPV);
+ auto *ZCPV = static_cast<SystemZConstantPoolValue*>(MCPV);
const MCExpr *Expr =
MCSymbolRefExpr::Create(getSymbol(ZCPV->getGlobalValue()),
@@ -212,7 +220,7 @@ bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
void SystemZAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (Subtarget->isTargetELF()) {
- const TargetLoweringObjectFileELF &TLOFELF =
+ auto &TLOFELF =
static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
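For context on the Serialize lowering above: bcr 15,%r0 is the architected full serialization, while processors that provide the fast-BCR-serialization facility (which is what the subtarget's hasFastSerialization() is taken to report) accept bcr 14,%r0 as a cheaper barrier with the same memory-ordering effect. A trivial standalone sketch of the same choice, with a hypothetical name:

// Hypothetical mirror of the decision made in EmitInstruction above;
// 'hasFastSerialization' stands in for the subtarget feature test.
static const char *serializeAsm(bool hasFastSerialization) {
  return hasFastSerialization ? "bcr 14,%r0"   // fast-BCR-serialization form
                              : "bcr 15,%r0";  // full serialization
}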
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.h b/lib/Target/SystemZ/SystemZAsmPrinter.h
index 4b6c51b..20093bc 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -32,20 +32,18 @@ public:
}
// Override AsmPrinter.
- virtual const char *getPassName() const LLVM_OVERRIDE {
+ const char *getPassName() const override {
return "SystemZ Assembly Printer";
}
- virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE;
- virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV)
- LLVM_OVERRIDE;
- virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) LLVM_OVERRIDE;
- virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,
- const char *ExtraCode,
- raw_ostream &OS) LLVM_OVERRIDE;
- virtual void EmitEndOfAsmFile(Module &M) LLVM_OVERRIDE;
+ void EmitInstruction(const MachineInstr *MI) override;
+ void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+ void EmitEndOfAsmFile(Module &M) override;
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZCallingConv.h b/lib/Target/SystemZ/SystemZCallingConv.h
index 298985e..4b1569d 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/lib/Target/SystemZ/SystemZCallingConv.h
@@ -11,13 +11,13 @@
#define SYSTEMZCALLINGCONV_H
namespace llvm {
- namespace SystemZ {
- const unsigned NumArgGPRs = 5;
- extern const unsigned ArgGPRs[NumArgGPRs];
+namespace SystemZ {
+ const unsigned NumArgGPRs = 5;
+ extern const unsigned ArgGPRs[NumArgGPRs];
- const unsigned NumArgFPRs = 4;
- extern const unsigned ArgFPRs[NumArgFPRs];
- }
-}
+ const unsigned NumArgFPRs = 4;
+ extern const unsigned ArgFPRs[NumArgFPRs];
+} // end namespace SystemZ
+} // end namespace llvm
#endif
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
index 6c70811..19cec21 100644
--- a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -43,7 +43,7 @@ getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) {
for (unsigned I = 0, E = Constants.size(); I != E; ++I) {
if (Constants[I].isMachineConstantPoolEntry() &&
(Constants[I].getAlignment() & AlignMask) == 0) {
- SystemZConstantPoolValue *ZCPV =
+ auto *ZCPV =
static_cast<SystemZConstantPoolValue *>(Constants[I].Val.MachineCPVal);
if (ZCPV->GV == GV && ZCPV->Modifier == Modifier)
return I;
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.h b/lib/Target/SystemZ/SystemZConstantPoolValue.h
index 9927bdb..699718f 100644
--- a/lib/Target/SystemZ/SystemZConstantPoolValue.h
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.h
@@ -18,10 +18,10 @@ namespace llvm {
class GlobalValue;
namespace SystemZCP {
- enum SystemZCPModifier {
- NTPOFF
- };
-}
+enum SystemZCPModifier {
+ NTPOFF
+};
+} // end namespace SystemZCP
/// A SystemZ-specific constant pool value. At present, the only
/// defined constant pool values are offsets of thread-local variables
@@ -39,17 +39,17 @@ public:
Create(const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier);
// Override MachineConstantPoolValue.
- virtual unsigned getRelocationInfo() const LLVM_OVERRIDE;
- virtual int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) LLVM_OVERRIDE;
- virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID) LLVM_OVERRIDE;
- virtual void print(raw_ostream &O) const LLVM_OVERRIDE;
+ unsigned getRelocationInfo() const override;
+ int getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment) override;
+ void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+ void print(raw_ostream &O) const override;
// Access SystemZ-specific fields.
const GlobalValue *getGlobalValue() const { return GV; }
SystemZCP::SystemZCPModifier getModifier() const { return Modifier; }
};
-} // End llvm namespace
+} // end namespace llvm
#endif
diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
index b8a77db..fdf80a9 100644
--- a/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -33,73 +33,72 @@ STATISTIC(EliminatedComparisons, "Number of eliminated comparisons");
STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions");
namespace {
- // Represents the references to a particular register in one or more
- // instructions.
- struct Reference {
- Reference()
- : Def(false), Use(false), IndirectDef(false), IndirectUse(false) {}
-
- Reference &operator|=(const Reference &Other) {
- Def |= Other.Def;
- IndirectDef |= Other.IndirectDef;
- Use |= Other.Use;
- IndirectUse |= Other.IndirectUse;
- return *this;
- }
+// Represents the references to a particular register in one or more
+// instructions.
+struct Reference {
+ Reference()
+ : Def(false), Use(false), IndirectDef(false), IndirectUse(false) {}
+
+ Reference &operator|=(const Reference &Other) {
+ Def |= Other.Def;
+ IndirectDef |= Other.IndirectDef;
+ Use |= Other.Use;
+ IndirectUse |= Other.IndirectUse;
+ return *this;
+ }
- operator bool() const { return Def || Use; }
+ operator bool() const { return Def || Use; }
- // True if the register is defined or used in some form, either directly or
- // via a sub- or super-register.
- bool Def;
- bool Use;
+ // True if the register is defined or used in some form, either directly or
+ // via a sub- or super-register.
+ bool Def;
+ bool Use;
- // True if the register is defined or used indirectly, by a sub- or
- // super-register.
- bool IndirectDef;
- bool IndirectUse;
- };
+ // True if the register is defined or used indirectly, by a sub- or
+ // super-register.
+ bool IndirectDef;
+ bool IndirectUse;
+};
- class SystemZElimCompare : public MachineFunctionPass {
- public:
- static char ID;
- SystemZElimCompare(const SystemZTargetMachine &tm)
- : MachineFunctionPass(ID), TII(0), TRI(0) {}
+class SystemZElimCompare : public MachineFunctionPass {
+public:
+ static char ID;
+ SystemZElimCompare(const SystemZTargetMachine &tm)
+ : MachineFunctionPass(ID), TII(0), TRI(0) {}
- virtual const char *getPassName() const {
- return "SystemZ Comparison Elimination";
- }
+ const char *getPassName() const override {
+ return "SystemZ Comparison Elimination";
+ }
+
+ bool processBlock(MachineBasicBlock &MBB);
+ bool runOnMachineFunction(MachineFunction &F);
- bool processBlock(MachineBasicBlock *MBB);
- bool runOnMachineFunction(MachineFunction &F);
-
- private:
- Reference getRegReferences(MachineInstr *MI, unsigned Reg);
- bool convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
- SmallVectorImpl<MachineInstr *> &CCUsers);
- bool convertToLoadAndTest(MachineInstr *MI);
- bool adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
- SmallVectorImpl<MachineInstr *> &CCUsers);
- bool optimizeCompareZero(MachineInstr *Compare,
+private:
+ Reference getRegReferences(MachineInstr *MI, unsigned Reg);
+ bool convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers);
+ bool convertToLoadAndTest(MachineInstr *MI);
+ bool adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
- bool fuseCompareAndBranch(MachineInstr *Compare,
- SmallVectorImpl<MachineInstr *> &CCUsers);
+ bool optimizeCompareZero(MachineInstr *Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers);
+ bool fuseCompareAndBranch(MachineInstr *Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers);
- const SystemZInstrInfo *TII;
- const TargetRegisterInfo *TRI;
- };
+ const SystemZInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+};
- char SystemZElimCompare::ID = 0;
-} // end of anonymous namespace
+char SystemZElimCompare::ID = 0;
+} // end anonymous namespace
FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) {
return new SystemZElimCompare(TM);
}
// Return true if CC is live out of MBB.
-static bool isCCLiveOut(MachineBasicBlock *MBB) {
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end(); SI != SE; ++SI)
+static bool isCCLiveOut(MachineBasicBlock &MBB) {
+ for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
if ((*SI)->isLiveIn(SystemZ::CC))
return true;
return false;
@@ -328,8 +327,8 @@ optimizeCompareZero(MachineInstr *Compare,
// Search back for CC results that are based on the first operand.
unsigned SrcReg = Compare->getOperand(0).getReg();
unsigned SrcSubReg = Compare->getOperand(0).getSubReg();
- MachineBasicBlock *MBB = Compare->getParent();
- MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB->begin();
+ MachineBasicBlock &MBB = *Compare->getParent();
+ MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB.begin();
Reference CCRefs;
Reference SrcRefs;
while (MBBI != MBBE) {
@@ -424,7 +423,7 @@ fuseCompareAndBranch(MachineInstr *Compare,
// Process all comparison instructions in MBB. Return true if something
// changed.
-bool SystemZElimCompare::processBlock(MachineBasicBlock *MBB) {
+bool SystemZElimCompare::processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
// Walk backwards through the block looking for comparisons, recording
@@ -432,8 +431,8 @@ bool SystemZElimCompare::processBlock(MachineBasicBlock *MBB) {
// instructions before it.
bool CompleteCCUsers = !isCCLiveOut(MBB);
SmallVector<MachineInstr *, 4> CCUsers;
- MachineBasicBlock::iterator MBBI = MBB->end();
- while (MBBI != MBB->begin()) {
+ MachineBasicBlock::iterator MBBI = MBB.end();
+ while (MBBI != MBB.begin()) {
MachineInstr *MI = --MBBI;
if (CompleteCCUsers &&
MI->isCompare() &&
@@ -463,9 +462,8 @@ bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) {
TRI = &TII->getRegisterInfo();
bool Changed = false;
- for (MachineFunction::iterator MFI = F.begin(), MFE = F.end();
- MFI != MFE; ++MFI)
- Changed |= processBlock(MFI);
+ for (auto &MBB : F)
+ Changed |= processBlock(MBB);
return Changed;
}
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index acfb491..c856955 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -20,29 +20,29 @@
using namespace llvm;
namespace {
- // The ABI-defined register save slots, relative to the incoming stack
- // pointer.
- static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
- { SystemZ::R2D, 0x10 },
- { SystemZ::R3D, 0x18 },
- { SystemZ::R4D, 0x20 },
- { SystemZ::R5D, 0x28 },
- { SystemZ::R6D, 0x30 },
- { SystemZ::R7D, 0x38 },
- { SystemZ::R8D, 0x40 },
- { SystemZ::R9D, 0x48 },
- { SystemZ::R10D, 0x50 },
- { SystemZ::R11D, 0x58 },
- { SystemZ::R12D, 0x60 },
- { SystemZ::R13D, 0x68 },
- { SystemZ::R14D, 0x70 },
- { SystemZ::R15D, 0x78 },
- { SystemZ::F0D, 0x80 },
- { SystemZ::F2D, 0x88 },
- { SystemZ::F4D, 0x90 },
- { SystemZ::F6D, 0x98 }
- };
-}
+// The ABI-defined register save slots, relative to the incoming stack
+// pointer.
+static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
+ { SystemZ::R2D, 0x10 },
+ { SystemZ::R3D, 0x18 },
+ { SystemZ::R4D, 0x20 },
+ { SystemZ::R5D, 0x28 },
+ { SystemZ::R6D, 0x30 },
+ { SystemZ::R7D, 0x38 },
+ { SystemZ::R8D, 0x40 },
+ { SystemZ::R9D, 0x48 },
+ { SystemZ::R10D, 0x50 },
+ { SystemZ::R11D, 0x58 },
+ { SystemZ::R12D, 0x60 },
+ { SystemZ::R13D, 0x68 },
+ { SystemZ::R14D, 0x70 },
+ { SystemZ::R15D, 0x78 },
+ { SystemZ::F0D, 0x80 },
+ { SystemZ::F2D, 0x88 },
+ { SystemZ::F4D, 0x90 },
+ { SystemZ::F6D, 0x98 }
+};
+} // end anonymous namespace
SystemZFrameLowering::SystemZFrameLowering(const SystemZTargetMachine &tm,
const SystemZSubtarget &sti)
@@ -312,7 +312,7 @@ static void emitIncrement(MachineBasicBlock &MBB,
void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock &MBB = MF.front();
MachineFrameInfo *MFFrame = MF.getFrameInfo();
- const SystemZInstrInfo *ZII =
+ auto *ZII =
static_cast<const SystemZInstrInfo*>(MF.getTarget().getInstrInfo());
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -333,16 +333,14 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
llvm_unreachable("Couldn't skip over GPR saves");
// Add CFI for the GPR saves.
- MCSymbol *GPRSaveLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL,
- ZII->get(TargetOpcode::PROLOG_LABEL)).addSym(GPRSaveLabel);
- for (std::vector<CalleeSavedInfo>::const_iterator
- I = CSI.begin(), E = CSI.end(); I != E; ++I) {
- unsigned Reg = I->getReg();
+ for (auto &Save : CSI) {
+ unsigned Reg = Save.getReg();
if (SystemZ::GR64BitRegClass.contains(Reg)) {
int64_t Offset = SPOffsetFromCFA + RegSpillOffsets[Reg];
- MMI.addFrameInst(MCCFIInstruction::createOffset(
- GPRSaveLabel, MRI->getDwarfRegNum(Reg, true), Offset));
+ unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
}
}
@@ -354,11 +352,10 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
// Add CFI for the allocation.
- MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::PROLOG_LABEL))
- .addSym(AdjustSPLabel);
- MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(
- AdjustSPLabel, SPOffsetFromCFA + Delta));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, SPOffsetFromCFA + Delta));
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
SPOffsetFromCFA += Delta;
}
@@ -368,26 +365,23 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
.addReg(SystemZ::R15D);
// Add CFI for the new frame location.
- MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::PROLOG_LABEL))
- .addSym(SetFPLabel);
unsigned HardFP = MRI->getDwarfRegNum(SystemZ::R11D, true);
- MMI.addFrameInst(
- MCCFIInstruction::createDefCfaRegister(SetFPLabel, HardFP));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(nullptr, HardFP));
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
// Mark the FramePtr as live at the beginning of every block except
// the entry block. (We'll have marked R11 as live on entry when
// saving the GPRs.)
- for (MachineFunction::iterator
- I = llvm::next(MF.begin()), E = MF.end(); I != E; ++I)
+ for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I)
I->addLiveIn(SystemZ::R11D);
}
// Skip over the FPR saves.
- MCSymbol *FPRSaveLabel = 0;
- for (std::vector<CalleeSavedInfo>::const_iterator
- I = CSI.begin(), E = CSI.end(); I != E; ++I) {
- unsigned Reg = I->getReg();
+ SmallVector<unsigned, 8> CFIIndexes;
+ for (auto &Save : CSI) {
+ unsigned Reg = Save.getReg();
if (SystemZ::FP64BitRegClass.contains(Reg)) {
if (MBBI != MBB.end() &&
(MBBI->getOpcode() == SystemZ::STD ||
@@ -397,25 +391,25 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
llvm_unreachable("Couldn't skip over FPR save");
       // Add CFI for this save.
- if (!FPRSaveLabel)
- FPRSaveLabel = MMI.getContext().CreateTempSymbol();
- unsigned Reg = MRI->getDwarfRegNum(I->getReg(), true);
- int64_t Offset = getFrameIndexOffset(MF, I->getFrameIdx());
- MMI.addFrameInst(MCCFIInstruction::createOffset(
- FPRSaveLabel, Reg, SPOffsetFromCFA + Offset));
+ unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+ int64_t Offset = getFrameIndexOffset(MF, Save.getFrameIdx());
+ unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, DwarfReg, SPOffsetFromCFA + Offset));
+ CFIIndexes.push_back(CFIIndex);
}
}
// Complete the CFI for the FPR saves, modelling them as taking effect
// after the last save.
- if (FPRSaveLabel)
- BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::PROLOG_LABEL))
- .addSym(FPRSaveLabel);
+ for (auto CFIIndex : CFIIndexes) {
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
}
void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
- const SystemZInstrInfo *ZII =
+ auto *ZII =
static_cast<const SystemZInstrInfo*>(MF.getTarget().getInstrInfo());
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
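The prologue changes above all follow the same new CFI pattern: the frame instruction is registered with MachineModuleInfo, and a CFI_INSTRUCTION referring to it by index is inserted, replacing the old temporary-symbol/PROLOG_LABEL scheme. Isolated as a helper it looks roughly like this; a sketch only, with placeholder parameter names and include paths assumed for this LLVM version:

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
using namespace llvm;

// Record a .cfi_offset for DwarfReg at Offset and emit the CFI_INSTRUCTION
// that points at it, in the style used by emitPrologue above.
static void emitOffsetCFI(MachineModuleInfo &MMI, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI, DebugLoc DL,
                          const TargetInstrInfo *TII, unsigned DwarfReg,
                          int64_t Offset) {
  unsigned CFIIndex = MMI.addFrameInst(
      MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
}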
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
index 9b0a1d5..70e25fb 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -30,39 +30,31 @@ public:
const SystemZSubtarget &sti);
// Override TargetFrameLowering.
- virtual bool isFPCloseToIncomingSP() const LLVM_OVERRIDE { return false; }
- virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const
- LLVM_OVERRIDE;
- virtual void
- processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const LLVM_OVERRIDE;
- virtual bool
- spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const
- LLVM_OVERRIDE;
- virtual bool
- restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBII,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const
- LLVM_OVERRIDE;
- virtual void processFunctionBeforeFrameFinalized(MachineFunction &MF,
- RegScavenger *RS) const;
- virtual void emitPrologue(MachineFunction &MF) const LLVM_OVERRIDE;
- virtual void emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const LLVM_OVERRIDE;
- virtual bool hasFP(const MachineFunction &MF) const LLVM_OVERRIDE;
- virtual int getFrameIndexOffset(const MachineFunction &MF,
- int FI) const LLVM_OVERRIDE;
- virtual bool hasReservedCallFrame(const MachineFunction &MF) const
- LLVM_OVERRIDE;
- virtual void
- eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const
- LLVM_OVERRIDE;
+ bool isFPCloseToIncomingSP() const override { return false; }
+ const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const
+ override;
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const override;
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBII,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const
+ override;
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS) const override;
+ void emitPrologue(MachineFunction &MF) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ bool hasFP(const MachineFunction &MF) const override;
+ int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const
+ override;
// Return the number of bytes in the callee-allocated part of the frame.
uint64_t getAllocatedStackSize(const MachineFunction &MF) const;
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index f4a2773..f46eb16 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -318,16 +318,14 @@ public:
Subtarget(*TM.getSubtargetImpl()) { }
// Override MachineFunctionPass.
- virtual const char *getPassName() const LLVM_OVERRIDE {
+ const char *getPassName() const override {
return "SystemZ DAG->DAG Pattern Instruction Selection";
}
// Override SelectionDAGISel.
- virtual SDNode *Select(SDNode *Node) LLVM_OVERRIDE;
- virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
- char ConstraintCode,
- std::vector<SDValue> &OutOps)
- LLVM_OVERRIDE;
+ SDNode *Select(SDNode *Node) override;
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+ std::vector<SDValue> &OutOps) override;
// Include the pieces autogenerated from the target description.
#include "SystemZGenDAGISel.inc"
@@ -651,8 +649,7 @@ bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op,
return false;
// We need a constant mask.
- ConstantSDNode *MaskNode =
- dyn_cast<ConstantSDNode>(Op.getOperand(1).getNode());
+ auto *MaskNode = dyn_cast<ConstantSDNode>(Op.getOperand(1).getNode());
if (!MaskNode)
return false;
@@ -704,8 +701,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
if (RxSBG.Opcode == SystemZ::RNSBG)
return false;
- ConstantSDNode *MaskNode =
- dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+ auto *MaskNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
if (!MaskNode)
return false;
@@ -729,8 +725,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
if (RxSBG.Opcode != SystemZ::RNSBG)
return false;
- ConstantSDNode *MaskNode =
- dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+ auto *MaskNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
if (!MaskNode)
return false;
@@ -754,8 +749,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
// Any 64-bit rotate left can be merged into the RxSBG.
if (RxSBG.BitSize != 64 || N.getValueType() != MVT::i64)
return false;
- ConstantSDNode *CountNode
- = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+ auto *CountNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
if (!CountNode)
return false;
@@ -764,9 +758,24 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
return true;
}
- case ISD::SIGN_EXTEND:
+ case ISD::ANY_EXTEND:
+ // Bits above the extended operand are don't-care.
+ RxSBG.Input = N.getOperand(0);
+ return true;
+
case ISD::ZERO_EXTEND:
- case ISD::ANY_EXTEND: {
+ if (RxSBG.Opcode != SystemZ::RNSBG) {
+ // Restrict the mask to the extended operand.
+ unsigned InnerBitSize = N.getOperand(0).getValueType().getSizeInBits();
+ if (!refineRxSBGMask(RxSBG, allOnes(InnerBitSize)))
+ return false;
+
+ RxSBG.Input = N.getOperand(0);
+ return true;
+ }
+ // Fall through.
+
+ case ISD::SIGN_EXTEND: {
// Check that the extension bits are don't-care (i.e. are masked out
// by the final mask).
unsigned InnerBitSize = N.getOperand(0).getValueType().getSizeInBits();
@@ -778,8 +787,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
}
case ISD::SHL: {
- ConstantSDNode *CountNode =
- dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+ auto *CountNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
if (!CountNode)
return false;
@@ -806,8 +814,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
case ISD::SRL:
case ISD::SRA: {
- ConstantSDNode *CountNode =
- dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+ auto *CountNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
if (!CountNode)
return false;
@@ -876,7 +883,7 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
SystemZ::isImmLF(~RISBG.Mask) ||
SystemZ::isImmHF(~RISBG.Mask)) {
// Force the new mask into the DAG, since it may include known-one bits.
- ConstantSDNode *MaskN = cast<ConstantSDNode>(N->getOperand(1).getNode());
+ auto *MaskN = cast<ConstantSDNode>(N->getOperand(1).getNode());
if (MaskN->getZExtValue() != RISBG.Mask) {
SDValue NewMask = CurDAG->getConstant(RISBG.Mask, VT);
N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), NewMask);
@@ -928,7 +935,7 @@ SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
// Prefer IC for character insertions from memory.
if (Opcode == SystemZ::ROSBG && (RxSBG[I].Mask & 0xff) == 0)
- if (LoadSDNode *Load = dyn_cast<LoadSDNode>(Op0.getNode()))
+ if (auto *Load = dyn_cast<LoadSDNode>(Op0.getNode()))
if (Load->getMemoryVT() == MVT::i8)
return 0;
@@ -996,8 +1003,8 @@ bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
}
bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const {
- StoreSDNode *Store = cast<StoreSDNode>(N);
- LoadSDNode *Load = cast<LoadSDNode>(Store->getValue());
+ auto *Store = cast<StoreSDNode>(N);
+ auto *Load = cast<LoadSDNode>(Store->getValue());
// Prefer not to use MVC if either address can use ... RELATIVE LONG
// instructions.
@@ -1016,9 +1023,9 @@ bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const {
bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
unsigned I) const {
- StoreSDNode *StoreA = cast<StoreSDNode>(N);
- LoadSDNode *LoadA = cast<LoadSDNode>(StoreA->getValue().getOperand(1 - I));
- LoadSDNode *LoadB = cast<LoadSDNode>(StoreA->getValue().getOperand(I));
+ auto *StoreA = cast<StoreSDNode>(N);
+ auto *LoadA = cast<LoadSDNode>(StoreA->getValue().getOperand(1 - I));
+ auto *LoadB = cast<LoadSDNode>(StoreA->getValue().getOperand(I));
return !LoadA->isVolatile() && canUseBlockOperation(StoreA, LoadB);
}
@@ -1049,7 +1056,7 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
// If this is a 64-bit operation in which both 32-bit halves are nonzero,
// split the operation into two.
if (!ResNode && Node->getValueType(0) == MVT::i64)
- if (ConstantSDNode *Op1 = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
+ if (auto *Op1 = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
uint64_t Val = Op1->getZExtValue();
if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val))
Node = splitLargeImmediate(Opcode, Node, Node->getOperand(0),
@@ -1064,6 +1071,7 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
case ISD::ROTL:
case ISD::SHL:
case ISD::SRL:
+ case ISD::ZERO_EXTEND:
if (!ResNode)
ResNode = tryRISBGZero(Node);
break;
@@ -1079,20 +1087,6 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
}
break;
- case ISD::ATOMIC_LOAD_SUB:
- // Try to convert subtractions of constants to additions.
- if (ConstantSDNode *Op2 = dyn_cast<ConstantSDNode>(Node->getOperand(2))) {
- uint64_t Value = -Op2->getZExtValue();
- EVT VT = Node->getValueType(0);
- if (VT == MVT::i32 || isInt<32>(Value)) {
- SDValue Ops[] = { Node->getOperand(0), Node->getOperand(1),
- CurDAG->getConstant(int32_t(Value), VT) };
- Node = CurDAG->MorphNodeTo(Node, ISD::ATOMIC_LOAD_ADD,
- Node->getVTList(), Ops, array_lengthof(Ops));
- }
- }
- break;
-
case SystemZISD::SELECT_CCMASK: {
SDValue Op0 = Node->getOperand(0);
SDValue Op1 = Node->getOperand(1);
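The new ISD::ZERO_EXTEND handling in expandRxSBG works because a zero-extension is just a mask to the width of the inner type, so it can be folded into the rotate-and-insert mask. A small sketch of that mask computation (the helper name here is illustrative; the in-tree code uses its own allOnes and refineRxSBGMask helpers):

#include <cstdint>

// All-ones mask covering the low BitSize bits, as used when restricting an
// RxSBG mask to a zero-extended operand's inner width.
static uint64_t lowBitsMask(unsigned BitSize) {
  return BitSize == 64 ? ~uint64_t(0) : (uint64_t(1) << BitSize) - 1;
}
// e.g. zero-extending an i32 operand restricts the mask with
// lowBitsMask(32) == 0x00000000ffffffff.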
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index f6e1853..714b6c9 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -22,7 +22,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-
#include <cctype>
using namespace llvm;
@@ -38,7 +37,28 @@ struct IPMConversion {
int64_t AddValue;
unsigned Bit;
};
-}
+
+// Represents information about a comparison.
+struct Comparison {
+ Comparison(SDValue Op0In, SDValue Op1In)
+ : Op0(Op0In), Op1(Op1In), Opcode(0), ICmpType(0), CCValid(0), CCMask(0) {}
+
+ // The operands to the comparison.
+ SDValue Op0, Op1;
+
+ // The opcode that should be used to compare Op0 and Op1.
+ unsigned Opcode;
+
+ // A SystemZICMP value. Only used for integer comparisons.
+ unsigned ICmpType;
+
+ // The mask of CC values that Opcode can produce.
+ unsigned CCValid;
+
+ // The mask of CC values for which the original condition is true.
+ unsigned CCMask;
+};
+} // end anonymous namespace
// Classify VT as either 32 or 64 bit.
static bool is32Bit(EVT VT) {
@@ -134,10 +154,14 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
setOperationAction(ISD::SDIVREM, VT, Custom);
setOperationAction(ISD::UDIVREM, VT, Custom);
- // Expand ATOMIC_LOAD and ATOMIC_STORE using ATOMIC_CMP_SWAP.
- // FIXME: probably much too conservative.
- setOperationAction(ISD::ATOMIC_LOAD, VT, Expand);
- setOperationAction(ISD::ATOMIC_STORE, VT, Expand);
+ // Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
+ // stores, putting a serialization instruction after the stores.
+ setOperationAction(ISD::ATOMIC_LOAD, VT, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
+
+ // Lower ATOMIC_LOAD_SUB into ATOMIC_LOAD_ADD if LAA and LAAG are
+ // available, or if the operand is constant.
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
// No special instructions for these.
setOperationAction(ISD::CTPOP, VT, Expand);
@@ -152,8 +176,9 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
setOperationAction(ISD::SMUL_LOHI, VT, Custom);
setOperationAction(ISD::UMUL_LOHI, VT, Custom);
- // We have instructions for signed but not unsigned FP conversion.
- setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ // Only z196 and above have native support for conversions to unsigned.
+ if (!Subtarget.hasFPExtension())
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
}
}
@@ -173,10 +198,12 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
- // We have instructions for signed but not unsigned FP conversion.
+ // z10 has instructions for signed but not unsigned FP conversion.
// Handle unsigned 32-bit types as signed 64-bit types.
- setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
- setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+ if (!Subtarget.hasFPExtension()) {
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+ }
// We have native support for a 64-bit CTLZ, via FLOGR.
setOperationAction(ISD::CTLZ, MVT::i32, Promote);
@@ -266,6 +293,9 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
setOperationAction(ISD::VACOPY, MVT::Other, Custom);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ // Codes for which we want to perform some z-specific combinations.
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+
// We want to use MVC in preference to even a single load/store pair.
MaxStoresPerMemcpy = 0;
MaxStoresPerMemcpyOptSize = 0;
@@ -310,6 +340,7 @@ bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
}
bool SystemZTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+ unsigned,
bool *Fast) const {
// Unaligned accesses should never be slower than the expanded version.
// We check specifically for aligned accesses in the few cases where
@@ -416,31 +447,31 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
break;
case 'I': // Unsigned 8-bit constant
- if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal))
+ if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
if (isUInt<8>(C->getZExtValue()))
weight = CW_Constant;
break;
case 'J': // Unsigned 12-bit constant
- if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal))
+ if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
if (isUInt<12>(C->getZExtValue()))
weight = CW_Constant;
break;
case 'K': // Signed 16-bit constant
- if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal))
+ if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
if (isInt<16>(C->getSExtValue()))
weight = CW_Constant;
break;
case 'L': // Signed 20-bit displacement (on all targets we support)
- if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal))
+ if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
if (isInt<20>(C->getSExtValue()))
weight = CW_Constant;
break;
case 'M': // 0x7fffffff
- if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal))
+ if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
if (C->getZExtValue() == 0x7fffffff)
weight = CW_Constant;
break;
@@ -533,35 +564,35 @@ LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
if (Constraint.length() == 1) {
switch (Constraint[0]) {
case 'I': // Unsigned 8-bit constant
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (auto *C = dyn_cast<ConstantSDNode>(Op))
if (isUInt<8>(C->getZExtValue()))
Ops.push_back(DAG.getTargetConstant(C->getZExtValue(),
Op.getValueType()));
return;
case 'J': // Unsigned 12-bit constant
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (auto *C = dyn_cast<ConstantSDNode>(Op))
if (isUInt<12>(C->getZExtValue()))
Ops.push_back(DAG.getTargetConstant(C->getZExtValue(),
Op.getValueType()));
return;
case 'K': // Signed 16-bit constant
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (auto *C = dyn_cast<ConstantSDNode>(Op))
if (isInt<16>(C->getSExtValue()))
Ops.push_back(DAG.getTargetConstant(C->getSExtValue(),
Op.getValueType()));
return;
case 'L': // Signed 20-bit displacement (on all targets we support)
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (auto *C = dyn_cast<ConstantSDNode>(Op))
if (isInt<20>(C->getSExtValue()))
Ops.push_back(DAG.getTargetConstant(C->getSExtValue(),
Op.getValueType()));
return;
case 'M': // 0x7fffffff
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (auto *C = dyn_cast<ConstantSDNode>(Op))
if (C->getZExtValue() == 0x7fffffff)
Ops.push_back(DAG.getTargetConstant(C->getZExtValue(),
Op.getValueType()));
@@ -642,8 +673,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
MachineRegisterInfo &MRI = MF.getRegInfo();
SystemZMachineFunctionInfo *FuncInfo =
MF.getInfo<SystemZMachineFunctionInfo>();
- const SystemZFrameLowering *TFL =
- static_cast<const SystemZFrameLowering *>(TM.getFrameLowering());
+ auto *TFL = static_cast<const SystemZFrameLowering *>(TM.getFrameLowering());
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -852,10 +882,10 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// associated Target* opcodes. Force %r1 to be used for indirect
// tail calls.
SDValue Glue;
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
- } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ } else if (auto *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
} else if (IsTailCall) {
@@ -970,6 +1000,11 @@ SystemZTargetLowering::LowerReturn(SDValue Chain,
RetOps.data(), RetOps.size());
}
+SDValue SystemZTargetLowering::
+prepareVolatileOrAtomicLoad(SDValue Chain, SDLoc DL, SelectionDAG &DAG) const {
+ return DAG.getNode(SystemZISD::SERIALIZE, DL, MVT::Other, Chain);
+}
+
// CC is a comparison that will be implemented using an integer or
// floating-point comparison. Return the condition code mask for
// a branch on true. In the integer case, CCMASK_CMP_UO is set for
@@ -1044,7 +1079,7 @@ static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_3)))
return IPMConversion(0, -(1 << SystemZ::IPM_CC), SystemZ::IPM_CC + 1);
- // The remaing cases are 1, 2, 0/1/3 and 0/2/3. All these are
+  // The remaining cases are 1, 2, 0/1/3 and 0/2/3.  All these
// can be done by inverting the low CC bit and applying one of the
// sign-based extractions above.
if (CCMask == (CCValid & SystemZ::CCMASK_1))
@@ -1065,109 +1100,100 @@ static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
llvm_unreachable("Unexpected CC combination");
}
-// If a comparison described by IsUnsigned, CCMask, CmpOp0 and CmpOp1
-// can be converted to a comparison against zero, adjust the operands
+// If C can be converted to a comparison against zero, adjust the operands
// as necessary.
-static void adjustZeroCmp(SelectionDAG &DAG, bool &IsUnsigned,
- SDValue &CmpOp0, SDValue &CmpOp1,
- unsigned &CCMask) {
- if (IsUnsigned)
+static void adjustZeroCmp(SelectionDAG &DAG, Comparison &C) {
+ if (C.ICmpType == SystemZICMP::UnsignedOnly)
return;
- ConstantSDNode *ConstOp1 = dyn_cast<ConstantSDNode>(CmpOp1.getNode());
+ auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1.getNode());
if (!ConstOp1)
return;
int64_t Value = ConstOp1->getSExtValue();
- if ((Value == -1 && CCMask == SystemZ::CCMASK_CMP_GT) ||
- (Value == -1 && CCMask == SystemZ::CCMASK_CMP_LE) ||
- (Value == 1 && CCMask == SystemZ::CCMASK_CMP_LT) ||
- (Value == 1 && CCMask == SystemZ::CCMASK_CMP_GE)) {
- CCMask ^= SystemZ::CCMASK_CMP_EQ;
- CmpOp1 = DAG.getConstant(0, CmpOp1.getValueType());
+ if ((Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_GT) ||
+ (Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_LE) ||
+ (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_LT) ||
+ (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_GE)) {
+ C.CCMask ^= SystemZ::CCMASK_CMP_EQ;
+ C.Op1 = DAG.getConstant(0, C.Op1.getValueType());
}
}
-// If a comparison described by IsUnsigned, CCMask, CmpOp0 and CmpOp1
-// is suitable for CLI(Y), CHHSI or CLHHSI, adjust the operands as necessary.
-static void adjustSubwordCmp(SelectionDAG &DAG, bool &IsUnsigned,
- SDValue &CmpOp0, SDValue &CmpOp1,
- unsigned &CCMask) {
+// If a comparison described by C is suitable for CLI(Y), CHHSI or CLHHSI,
+// adjust the operands as necessary.
+static void adjustSubwordCmp(SelectionDAG &DAG, Comparison &C) {
  // For us to make any changes, it must be a comparison between a single-use
// load and a constant.
- if (!CmpOp0.hasOneUse() ||
- CmpOp0.getOpcode() != ISD::LOAD ||
- CmpOp1.getOpcode() != ISD::Constant)
+ if (!C.Op0.hasOneUse() ||
+ C.Op0.getOpcode() != ISD::LOAD ||
+ C.Op1.getOpcode() != ISD::Constant)
return;
// We must have an 8- or 16-bit load.
- LoadSDNode *Load = cast<LoadSDNode>(CmpOp0);
+ auto *Load = cast<LoadSDNode>(C.Op0);
unsigned NumBits = Load->getMemoryVT().getStoreSizeInBits();
if (NumBits != 8 && NumBits != 16)
return;
// The load must be an extending one and the constant must be within the
// range of the unextended value.
- ConstantSDNode *Constant = cast<ConstantSDNode>(CmpOp1);
- uint64_t Value = Constant->getZExtValue();
+ auto *ConstOp1 = cast<ConstantSDNode>(C.Op1);
+ uint64_t Value = ConstOp1->getZExtValue();
uint64_t Mask = (1 << NumBits) - 1;
if (Load->getExtensionType() == ISD::SEXTLOAD) {
- int64_t SignedValue = Constant->getSExtValue();
- if (uint64_t(SignedValue) + (1ULL << (NumBits - 1)) > Mask)
+ // Make sure that ConstOp1 is in range of C.Op0.
+ int64_t SignedValue = ConstOp1->getSExtValue();
+ if (uint64_t(SignedValue) + (uint64_t(1) << (NumBits - 1)) > Mask)
return;
- // Unsigned comparison between two sign-extended values is equivalent
- // to unsigned comparison between two zero-extended values.
- if (IsUnsigned)
+ if (C.ICmpType != SystemZICMP::SignedOnly) {
+ // Unsigned comparison between two sign-extended values is equivalent
+ // to unsigned comparison between two zero-extended values.
Value &= Mask;
- else if (CCMask == SystemZ::CCMASK_CMP_EQ ||
- CCMask == SystemZ::CCMASK_CMP_NE)
- // Any choice of IsUnsigned is OK for equality comparisons.
- // We could use either CHHSI or CLHHSI for 16-bit comparisons,
- // but since we use CLHHSI for zero extensions, it seems better
- // to be consistent and do the same here.
- Value &= Mask, IsUnsigned = true;
- else if (NumBits == 8) {
+ } else if (NumBits == 8) {
// Try to treat the comparison as unsigned, so that we can use CLI.
// Adjust CCMask and Value as necessary.
- if (Value == 0 && CCMask == SystemZ::CCMASK_CMP_LT)
+ if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_LT)
// Test whether the high bit of the byte is set.
- Value = 127, CCMask = SystemZ::CCMASK_CMP_GT, IsUnsigned = true;
- else if (Value == 0 && CCMask == SystemZ::CCMASK_CMP_GE)
+ Value = 127, C.CCMask = SystemZ::CCMASK_CMP_GT;
+ else if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_GE)
// Test whether the high bit of the byte is clear.
- Value = 128, CCMask = SystemZ::CCMASK_CMP_LT, IsUnsigned = true;
+ Value = 128, C.CCMask = SystemZ::CCMASK_CMP_LT;
else
// No instruction exists for this combination.
return;
+ C.ICmpType = SystemZICMP::UnsignedOnly;
}
} else if (Load->getExtensionType() == ISD::ZEXTLOAD) {
if (Value > Mask)
return;
- // Signed comparison between two zero-extended values is equivalent
- // to unsigned comparison.
- IsUnsigned = true;
+ assert(C.ICmpType == SystemZICMP::Any &&
+ "Signedness shouldn't matter here.");
} else
return;
// Make sure that the first operand is an i32 of the right extension type.
- ISD::LoadExtType ExtType = IsUnsigned ? ISD::ZEXTLOAD : ISD::SEXTLOAD;
- if (CmpOp0.getValueType() != MVT::i32 ||
+ ISD::LoadExtType ExtType = (C.ICmpType == SystemZICMP::SignedOnly ?
+ ISD::SEXTLOAD :
+ ISD::ZEXTLOAD);
+ if (C.Op0.getValueType() != MVT::i32 ||
Load->getExtensionType() != ExtType)
- CmpOp0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32,
- Load->getChain(), Load->getBasePtr(),
- Load->getPointerInfo(), Load->getMemoryVT(),
- Load->isVolatile(), Load->isNonTemporal(),
- Load->getAlignment());
+ C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32,
+ Load->getChain(), Load->getBasePtr(),
+ Load->getPointerInfo(), Load->getMemoryVT(),
+ Load->isVolatile(), Load->isNonTemporal(),
+ Load->getAlignment());
// Make sure that the second operand is an i32 with the right value.
- if (CmpOp1.getValueType() != MVT::i32 ||
- Value != Constant->getZExtValue())
- CmpOp1 = DAG.getConstant(Value, MVT::i32);
+ if (C.Op1.getValueType() != MVT::i32 ||
+ Value != ConstOp1->getZExtValue())
+ C.Op1 = DAG.getConstant(Value, MVT::i32);
}
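A minimal C++-level sketch, not taken from this patch, of the kind of source adjustSubwordCmp is aimed at: a single-use extending i8 or i16 load compared with a constant that fits the unextended range. The function name and constant below are illustrative assumptions.

// Hypothetical example: the zero-extending i8 load has one use, so the
// comparison can be rewritten to use the unextended value and selected as
// a compare-immediate (CLI style) rather than a load plus register compare.
int has_flag(const unsigned char *p) {
  return *p == 0x40;
}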
// Return true if Op is either an unextended load, or a load suitable
// for integer register-memory comparisons of type ICmpType.
static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
- LoadSDNode *Load = dyn_cast<LoadSDNode>(Op.getNode());
+ auto *Load = dyn_cast<LoadSDNode>(Op.getNode());
if (Load) {
// There are no instructions to compare a register with a memory byte.
if (Load->getMemoryVT() == MVT::i8)
@@ -1187,53 +1213,163 @@ static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
return false;
}
-// Return true if it is better to swap comparison operands Op0 and Op1.
-// ICmpType is the type of an integer comparison.
-static bool shouldSwapCmpOperands(SDValue Op0, SDValue Op1,
- unsigned ICmpType) {
+// Return true if it is better to swap the operands of C.
+static bool shouldSwapCmpOperands(const Comparison &C) {
// Leave f128 comparisons alone, since they have no memory forms.
- if (Op0.getValueType() == MVT::f128)
+ if (C.Op0.getValueType() == MVT::f128)
return false;
// Always keep a floating-point constant second, since comparisons with
// zero can use LOAD TEST and comparisons with other constants make a
// natural memory operand.
- if (isa<ConstantFPSDNode>(Op1))
+ if (isa<ConstantFPSDNode>(C.Op1))
return false;
// Never swap comparisons with zero since there are many ways to optimize
// those later.
- ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
- if (COp1 && COp1->getZExtValue() == 0)
+ auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
+ if (ConstOp1 && ConstOp1->getZExtValue() == 0)
+ return false;
+
+ // Also keep natural memory operands second if the loaded value is
+ // only used here. Several comparisons have memory forms.
+ if (isNaturalMemoryOperand(C.Op1, C.ICmpType) && C.Op1.hasOneUse())
return false;
// Look for cases where Cmp0 is a single-use load and Cmp1 isn't.
// In that case we generally prefer the memory to be second.
- if ((isNaturalMemoryOperand(Op0, ICmpType) && Op0.hasOneUse()) &&
- !(isNaturalMemoryOperand(Op1, ICmpType) && Op1.hasOneUse())) {
+ if (isNaturalMemoryOperand(C.Op0, C.ICmpType) && C.Op0.hasOneUse()) {
// The only exceptions are when the second operand is a constant and
// we can use things like CHHSI.
- if (!COp1)
+ if (!ConstOp1)
return true;
// The unsigned memory-immediate instructions can handle 16-bit
// unsigned integers.
- if (ICmpType != SystemZICMP::SignedOnly &&
- isUInt<16>(COp1->getZExtValue()))
+ if (C.ICmpType != SystemZICMP::SignedOnly &&
+ isUInt<16>(ConstOp1->getZExtValue()))
return false;
// The signed memory-immediate instructions can handle 16-bit
// signed integers.
- if (ICmpType != SystemZICMP::UnsignedOnly &&
- isInt<16>(COp1->getSExtValue()))
+ if (C.ICmpType != SystemZICMP::UnsignedOnly &&
+ isInt<16>(ConstOp1->getSExtValue()))
return false;
return true;
}
+
+ // Try to promote the use of CGFR and CLGFR.
+ unsigned Opcode0 = C.Op0.getOpcode();
+ if (C.ICmpType != SystemZICMP::UnsignedOnly && Opcode0 == ISD::SIGN_EXTEND)
+ return true;
+ if (C.ICmpType != SystemZICMP::SignedOnly && Opcode0 == ISD::ZERO_EXTEND)
+ return true;
+ if (C.ICmpType != SystemZICMP::SignedOnly &&
+ Opcode0 == ISD::AND &&
+ C.Op0.getOperand(1).getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(C.Op0.getOperand(1))->getZExtValue() == 0xffffffff)
+ return true;
+
return false;
}
+// Return a version of comparison CC mask CCMask in which the LT and GT
+// actions are swapped.
+static unsigned reverseCCMask(unsigned CCMask) {
+ return ((CCMask & SystemZ::CCMASK_CMP_EQ) |
+ (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) |
+ (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) |
+ (CCMask & SystemZ::CCMASK_CMP_UO));
+}
+
+// Check whether C tests for equality between X and Y and whether X - Y
+// or Y - X is also computed. In that case it's better to compare the
+// result of the subtraction against zero.
+static void adjustForSubtraction(SelectionDAG &DAG, Comparison &C) {
+ if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
+ C.CCMask == SystemZ::CCMASK_CMP_NE) {
+ for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
+ SDNode *N = *I;
+ if (N->getOpcode() == ISD::SUB &&
+ ((N->getOperand(0) == C.Op0 && N->getOperand(1) == C.Op1) ||
+ (N->getOperand(0) == C.Op1 && N->getOperand(1) == C.Op0))) {
+ C.Op0 = SDValue(N, 0);
+ C.Op1 = DAG.getConstant(0, N->getValueType(0));
+ return;
+ }
+ }
+ }
+}
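A rough sketch, using an assumed helper function, of the equality-plus-subtraction shape adjustForSubtraction looks for; the equality test is redirected at the subtraction result so the condition code set by the subtraction can be reused.

// Hypothetical example: a - b is computed anyway, so the equality test is
// turned into a comparison of that subtraction result with zero.
long diff_or_sentinel(long a, long b) {
  long d = a - b;
  return (a == b) ? -1 : d;
}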
+
+// Check whether C compares a floating-point value with zero and if that
+// floating-point value is also negated. In this case we can use the
+// negation to set CC, so avoiding separate LOAD AND TEST and
+// LOAD (NEGATIVE/COMPLEMENT) instructions.
+static void adjustForFNeg(Comparison &C) {
+ auto *C1 = dyn_cast<ConstantFPSDNode>(C.Op1);
+ if (C1 && C1->isZero()) {
+ for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
+ SDNode *N = *I;
+ if (N->getOpcode() == ISD::FNEG) {
+ C.Op0 = SDValue(N, 0);
+ C.CCMask = reverseCCMask(C.CCMask);
+ return;
+ }
+ }
+ }
+}
+
+// Check whether C compares (shl X, 32) with 0 and whether X is
+// also sign-extended. In that case it is better to test the result
+// of the sign extension using LTGFR.
+//
+// This case is important because InstCombine transforms a comparison
+// with (sext (trunc X)) into a comparison with (shl X, 32).
+static void adjustForLTGFR(Comparison &C) {
+ // Check for a comparison between (shl X, 32) and 0.
+ if (C.Op0.getOpcode() == ISD::SHL &&
+ C.Op0.getValueType() == MVT::i64 &&
+ C.Op1.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
+ auto *C1 = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
+ if (C1 && C1->getZExtValue() == 32) {
+ SDValue ShlOp0 = C.Op0.getOperand(0);
+ // See whether X has any SIGN_EXTEND_INREG uses.
+ for (auto I = ShlOp0->use_begin(), E = ShlOp0->use_end(); I != E; ++I) {
+ SDNode *N = *I;
+ if (N->getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32) {
+ C.Op0 = SDValue(N, 0);
+ return;
+ }
+ }
+ }
+ }
+}
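A loose C++ sketch of the pattern behind adjustForLTGFR, under the assumption that InstCombine has rewritten the comparison of the sign-extended truncation into a comparison of (shl X, 32) with zero while the sign extension itself is still used elsewhere; the exact IR InstCombine produces may differ.

// Hypothetical example: the low word of x is sign-extended and also tested
// against zero; testing the result of the sign extension (LTGFR) sets CC
// as part of the extension.
long scale_if_negative(long x) {
  long s = (long)(int)x;       // sign extension of the low 32 bits
  return (s < 0) ? s * 3 : s;  // the "< 0" test is what gets adjusted
}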
+
+// If C compares the truncation of an extending load, try to compare
+// the untruncated value instead. This exposes more opportunities to
+// reuse CC.
+static void adjustICmpTruncate(SelectionDAG &DAG, Comparison &C) {
+ if (C.Op0.getOpcode() == ISD::TRUNCATE &&
+ C.Op0.getOperand(0).getOpcode() == ISD::LOAD &&
+ C.Op1.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
+ auto *L = cast<LoadSDNode>(C.Op0.getOperand(0));
+ if (L->getMemoryVT().getStoreSizeInBits()
+ <= C.Op0.getValueType().getSizeInBits()) {
+ unsigned Type = L->getExtensionType();
+ if ((Type == ISD::ZEXTLOAD && C.ICmpType != SystemZICMP::SignedOnly) ||
+ (Type == ISD::SEXTLOAD && C.ICmpType != SystemZICMP::UnsignedOnly)) {
+ C.Op0 = C.Op0.getOperand(0);
+ C.Op1 = DAG.getConstant(0, C.Op0.getValueType());
+ }
+ }
+ }
+}
+
// Return true if shift operation N has an in-range constant shift value.
// Store it in ShiftVal if so.
static bool isSimpleShift(SDValue N, unsigned &ShiftVal) {
- ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ auto *Shift = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!Shift)
return false;
@@ -1341,118 +1477,143 @@ static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
return 0;
}
-// See whether the comparison (Opcode CmpOp0, CmpOp1, ICmpType) can be
-// implemented as a TEST UNDER MASK instruction when the condition being
-// tested is as described by CCValid and CCMask. Update the arguments
-// with the TM version if so.
-static void adjustForTestUnderMask(SelectionDAG &DAG, unsigned &Opcode,
- SDValue &CmpOp0, SDValue &CmpOp1,
- unsigned &CCValid, unsigned &CCMask,
- unsigned &ICmpType) {
+// See whether C can be implemented as a TEST UNDER MASK instruction.
+// Update the arguments with the TM version if so.
+static void adjustForTestUnderMask(SelectionDAG &DAG, Comparison &C) {
// Check that we have a comparison with a constant.
- ConstantSDNode *ConstCmpOp1 = dyn_cast<ConstantSDNode>(CmpOp1);
- if (!ConstCmpOp1)
+ auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
+ if (!ConstOp1)
return;
- uint64_t CmpVal = ConstCmpOp1->getZExtValue();
+ uint64_t CmpVal = ConstOp1->getZExtValue();
// Check whether the nonconstant input is an AND with a constant mask.
- if (CmpOp0.getOpcode() != ISD::AND)
- return;
- SDValue AndOp0 = CmpOp0.getOperand(0);
- SDValue AndOp1 = CmpOp0.getOperand(1);
- ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(AndOp1.getNode());
- if (!Mask)
- return;
- uint64_t MaskVal = Mask->getZExtValue();
+ Comparison NewC(C);
+ uint64_t MaskVal;
+ ConstantSDNode *Mask = 0;
+ if (C.Op0.getOpcode() == ISD::AND) {
+ NewC.Op0 = C.Op0.getOperand(0);
+ NewC.Op1 = C.Op0.getOperand(1);
+ Mask = dyn_cast<ConstantSDNode>(NewC.Op1);
+ if (!Mask)
+ return;
+ MaskVal = Mask->getZExtValue();
+ } else {
+ // There is no instruction to compare with a 64-bit immediate
+ // so use TMHH instead if possible. We need an unsigned ordered
+ // comparison with an i64 immediate.
+ if (NewC.Op0.getValueType() != MVT::i64 ||
+ NewC.CCMask == SystemZ::CCMASK_CMP_EQ ||
+ NewC.CCMask == SystemZ::CCMASK_CMP_NE ||
+ NewC.ICmpType == SystemZICMP::SignedOnly)
+ return;
+ // Convert LE and GT comparisons into LT and GE.
+ if (NewC.CCMask == SystemZ::CCMASK_CMP_LE ||
+ NewC.CCMask == SystemZ::CCMASK_CMP_GT) {
+ if (CmpVal == uint64_t(-1))
+ return;
+ CmpVal += 1;
+ NewC.CCMask ^= SystemZ::CCMASK_CMP_EQ;
+ }
+    // If the low N bits of Op1 are zero, then the low N bits of Op0 can
+ // be masked off without changing the result.
+ MaskVal = -(CmpVal & -CmpVal);
+ NewC.ICmpType = SystemZICMP::UnsignedOnly;
+ }
// Check whether the combination of mask, comparison value and comparison
// type are suitable.
- unsigned BitSize = CmpOp0.getValueType().getSizeInBits();
+ unsigned BitSize = NewC.Op0.getValueType().getSizeInBits();
unsigned NewCCMask, ShiftVal;
- if (ICmpType != SystemZICMP::SignedOnly &&
- AndOp0.getOpcode() == ISD::SHL &&
- isSimpleShift(AndOp0, ShiftVal) &&
- (NewCCMask = getTestUnderMaskCond(BitSize, CCMask, MaskVal >> ShiftVal,
+ if (NewC.ICmpType != SystemZICMP::SignedOnly &&
+ NewC.Op0.getOpcode() == ISD::SHL &&
+ isSimpleShift(NewC.Op0, ShiftVal) &&
+ (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
+ MaskVal >> ShiftVal,
CmpVal >> ShiftVal,
SystemZICMP::Any))) {
- AndOp0 = AndOp0.getOperand(0);
- AndOp1 = DAG.getConstant(MaskVal >> ShiftVal, AndOp0.getValueType());
- } else if (ICmpType != SystemZICMP::SignedOnly &&
- AndOp0.getOpcode() == ISD::SRL &&
- isSimpleShift(AndOp0, ShiftVal) &&
- (NewCCMask = getTestUnderMaskCond(BitSize, CCMask,
+ NewC.Op0 = NewC.Op0.getOperand(0);
+ MaskVal >>= ShiftVal;
+ } else if (NewC.ICmpType != SystemZICMP::SignedOnly &&
+ NewC.Op0.getOpcode() == ISD::SRL &&
+ isSimpleShift(NewC.Op0, ShiftVal) &&
+ (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
MaskVal << ShiftVal,
CmpVal << ShiftVal,
SystemZICMP::UnsignedOnly))) {
- AndOp0 = AndOp0.getOperand(0);
- AndOp1 = DAG.getConstant(MaskVal << ShiftVal, AndOp0.getValueType());
+ NewC.Op0 = NewC.Op0.getOperand(0);
+ MaskVal <<= ShiftVal;
} else {
- NewCCMask = getTestUnderMaskCond(BitSize, CCMask, MaskVal, CmpVal,
- ICmpType);
+ NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal, CmpVal,
+ NewC.ICmpType);
if (!NewCCMask)
return;
}
// Go ahead and make the change.
- Opcode = SystemZISD::TM;
- CmpOp0 = AndOp0;
- CmpOp1 = AndOp1;
- ICmpType = (bool(NewCCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
- bool(NewCCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
- CCValid = SystemZ::CCMASK_TM;
- CCMask = NewCCMask;
-}
-
-// Return a target node that compares CmpOp0 with CmpOp1 and stores a
-// 2-bit result in CC. Set CCValid to the CCMASK_* of all possible
-// 2-bit results and CCMask to the subset of those results that are
-// associated with Cond.
-static SDValue emitCmp(const SystemZTargetMachine &TM, SelectionDAG &DAG,
- SDLoc DL, SDValue CmpOp0, SDValue CmpOp1,
- ISD::CondCode Cond, unsigned &CCValid,
- unsigned &CCMask) {
- bool IsUnsigned = false;
- CCMask = CCMaskForCondCode(Cond);
- unsigned Opcode, ICmpType = 0;
- if (CmpOp0.getValueType().isFloatingPoint()) {
- CCValid = SystemZ::CCMASK_FCMP;
- Opcode = SystemZISD::FCMP;
+ C.Opcode = SystemZISD::TM;
+ C.Op0 = NewC.Op0;
+ if (Mask && Mask->getZExtValue() == MaskVal)
+ C.Op1 = SDValue(Mask, 0);
+ else
+ C.Op1 = DAG.getConstant(MaskVal, C.Op0.getValueType());
+ C.CCValid = SystemZ::CCMASK_TM;
+ C.CCMask = NewCCMask;
+}
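A hedged example of the new non-AND path: a 64-bit unsigned ordered comparison against a constant with trailing zero bits has no compare-immediate form, so it may instead be expressible as a TEST UNDER MASK (for example TMHH) over the bits selected by -(CmpVal & -CmpVal). The function below is an illustrative assumption, not part of the patch.

// Hypothetical example: "len >= 2^32" depends only on the high 32 bits,
// so it can become a test-under-mask of those bits rather than a compare.
bool needs_64bit_length(unsigned long long len) {
  return len >= (1ULL << 32);
}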
+
+// Decide how to implement a comparison of type Cond between CmpOp0 and CmpOp1.
+static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
+ ISD::CondCode Cond) {
+ Comparison C(CmpOp0, CmpOp1);
+ C.CCMask = CCMaskForCondCode(Cond);
+ if (C.Op0.getValueType().isFloatingPoint()) {
+ C.CCValid = SystemZ::CCMASK_FCMP;
+ C.Opcode = SystemZISD::FCMP;
+ adjustForFNeg(C);
} else {
- IsUnsigned = CCMask & SystemZ::CCMASK_CMP_UO;
- CCValid = SystemZ::CCMASK_ICMP;
- CCMask &= CCValid;
- adjustZeroCmp(DAG, IsUnsigned, CmpOp0, CmpOp1, CCMask);
- adjustSubwordCmp(DAG, IsUnsigned, CmpOp0, CmpOp1, CCMask);
- Opcode = SystemZISD::ICMP;
+ C.CCValid = SystemZ::CCMASK_ICMP;
+ C.Opcode = SystemZISD::ICMP;
// Choose the type of comparison. Equality and inequality tests can
// use either signed or unsigned comparisons. The choice also doesn't
// matter if both sign bits are known to be clear. In those cases we
// want to give the main isel code the freedom to choose whichever
// form fits best.
- if (CCMask == SystemZ::CCMASK_CMP_EQ ||
- CCMask == SystemZ::CCMASK_CMP_NE ||
- (DAG.SignBitIsZero(CmpOp0) && DAG.SignBitIsZero(CmpOp1)))
- ICmpType = SystemZICMP::Any;
- else if (IsUnsigned)
- ICmpType = SystemZICMP::UnsignedOnly;
+ if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
+ C.CCMask == SystemZ::CCMASK_CMP_NE ||
+ (DAG.SignBitIsZero(C.Op0) && DAG.SignBitIsZero(C.Op1)))
+ C.ICmpType = SystemZICMP::Any;
+ else if (C.CCMask & SystemZ::CCMASK_CMP_UO)
+ C.ICmpType = SystemZICMP::UnsignedOnly;
else
- ICmpType = SystemZICMP::SignedOnly;
+ C.ICmpType = SystemZICMP::SignedOnly;
+ C.CCMask &= ~SystemZ::CCMASK_CMP_UO;
+ adjustZeroCmp(DAG, C);
+ adjustSubwordCmp(DAG, C);
+ adjustForSubtraction(DAG, C);
+ adjustForLTGFR(C);
+ adjustICmpTruncate(DAG, C);
}
- if (shouldSwapCmpOperands(CmpOp0, CmpOp1, ICmpType)) {
- std::swap(CmpOp0, CmpOp1);
- CCMask = ((CCMask & SystemZ::CCMASK_CMP_EQ) |
- (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) |
- (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) |
- (CCMask & SystemZ::CCMASK_CMP_UO));
+ if (shouldSwapCmpOperands(C)) {
+ std::swap(C.Op0, C.Op1);
+ C.CCMask = reverseCCMask(C.CCMask);
}
- adjustForTestUnderMask(DAG, Opcode, CmpOp0, CmpOp1, CCValid, CCMask,
- ICmpType);
- if (Opcode == SystemZISD::ICMP || Opcode == SystemZISD::TM)
- return DAG.getNode(Opcode, DL, MVT::Glue, CmpOp0, CmpOp1,
- DAG.getConstant(ICmpType, MVT::i32));
- return DAG.getNode(Opcode, DL, MVT::Glue, CmpOp0, CmpOp1);
+ adjustForTestUnderMask(DAG, C);
+ return C;
+}
+
+// Emit the comparison instruction described by C.
+static SDValue emitCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
+ if (C.Opcode == SystemZISD::ICMP)
+ return DAG.getNode(SystemZISD::ICMP, DL, MVT::Glue, C.Op0, C.Op1,
+ DAG.getConstant(C.ICmpType, MVT::i32));
+ if (C.Opcode == SystemZISD::TM) {
+ bool RegisterOnly = (bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
+ bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
+ return DAG.getNode(SystemZISD::TM, DL, MVT::Glue, C.Op0, C.Op1,
+ DAG.getConstant(RegisterOnly, MVT::i32));
+ }
+ return DAG.getNode(C.Opcode, DL, MVT::Glue, C.Op0, C.Op1);
}
// Implement a 32-bit *MUL_LOHI operation by extending both operands to
@@ -1486,16 +1647,11 @@ static void lowerGR128Binary(SelectionDAG &DAG, SDLoc DL, EVT VT,
Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
}
-SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
- SelectionDAG &DAG) const {
- SDValue CmpOp0 = Op.getOperand(0);
- SDValue CmpOp1 = Op.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
- SDLoc DL(Op);
-
- unsigned CCValid, CCMask;
- SDValue Glue = emitCmp(TM, DAG, DL, CmpOp0, CmpOp1, CC, CCValid, CCMask);
-
+// Return an i32 value that is 1 if the CC value produced by Glue is
+// in the mask CCMask and 0 otherwise. CC is known to have a value
+// in CCValid, so other values can be ignored.
+static SDValue emitSETCC(SelectionDAG &DAG, SDLoc DL, SDValue Glue,
+ unsigned CCValid, unsigned CCMask) {
IPMConversion Conversion = getIPMConversion(CCValid, CCMask);
SDValue Result = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
@@ -1516,6 +1672,18 @@ SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
return Result;
}
+SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue CmpOp0 = Op.getOperand(0);
+ SDValue CmpOp1 = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ SDLoc DL(Op);
+
+ Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC));
+ SDValue Glue = emitCmp(DAG, DL, C);
+ return emitSETCC(DAG, DL, Glue, C.CCValid, C.CCMask);
+}
+
SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
@@ -1524,11 +1692,33 @@ SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Dest = Op.getOperand(4);
SDLoc DL(Op);
- unsigned CCValid, CCMask;
- SDValue Flags = emitCmp(TM, DAG, DL, CmpOp0, CmpOp1, CC, CCValid, CCMask);
+ Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC));
+ SDValue Glue = emitCmp(DAG, DL, C);
return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(),
- Chain, DAG.getConstant(CCValid, MVT::i32),
- DAG.getConstant(CCMask, MVT::i32), Dest, Flags);
+ Chain, DAG.getConstant(C.CCValid, MVT::i32),
+ DAG.getConstant(C.CCMask, MVT::i32), Dest, Glue);
+}
+
+// Return true if Pos is CmpOp and Neg is the negative of CmpOp,
+// allowing Pos and Neg to be wider than CmpOp.
+static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) {
+ return (Neg.getOpcode() == ISD::SUB &&
+ Neg.getOperand(0).getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(Neg.getOperand(0))->getZExtValue() == 0 &&
+ Neg.getOperand(1) == Pos &&
+ (Pos == CmpOp ||
+ (Pos.getOpcode() == ISD::SIGN_EXTEND &&
+ Pos.getOperand(0) == CmpOp)));
+}
+
+// Return the absolute or negative absolute of Op; IsNegative decides which.
+static SDValue getAbsolute(SelectionDAG &DAG, SDLoc DL, SDValue Op,
+ bool IsNegative) {
+ Op = DAG.getNode(SystemZISD::IABS, DL, Op.getValueType(), Op);
+ if (IsNegative)
+ Op = DAG.getNode(ISD::SUB, DL, Op.getValueType(),
+ DAG.getConstant(0, Op.getValueType()), Op);
+ return Op;
}
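A short sketch, with an assumed function name, of the select shape that isAbsolute and getAbsolute above are meant to recognize; such a select can be lowered to the new SystemZISD::IABS node (a load-positive style operation, with the sign-extended variants covering LPGFR and LNGFR) instead of a compare plus conditional move.

// Hypothetical example: matched by the absolute-value check in
// lowerSELECT_CC and emitted via the IABS node rather than a select.
long abs64(long x) {
  return (x < 0) ? -x : x;
}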
SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
@@ -1540,15 +1730,53 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDLoc DL(Op);
- unsigned CCValid, CCMask;
- SDValue Flags = emitCmp(TM, DAG, DL, CmpOp0, CmpOp1, CC, CCValid, CCMask);
+ Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC));
+
+ // Check for absolute and negative-absolute selections, including those
+ // where the comparison value is sign-extended (for LPGFR and LNGFR).
+ // This check supplements the one in DAGCombiner.
+ if (C.Opcode == SystemZISD::ICMP &&
+ C.CCMask != SystemZ::CCMASK_CMP_EQ &&
+ C.CCMask != SystemZ::CCMASK_CMP_NE &&
+ C.Op1.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
+ if (isAbsolute(C.Op0, TrueOp, FalseOp))
+ return getAbsolute(DAG, DL, TrueOp, C.CCMask & SystemZ::CCMASK_CMP_LT);
+ if (isAbsolute(C.Op0, FalseOp, TrueOp))
+ return getAbsolute(DAG, DL, FalseOp, C.CCMask & SystemZ::CCMASK_CMP_GT);
+ }
+
+ SDValue Glue = emitCmp(DAG, DL, C);
+
+ // Special case for handling -1/0 results. The shifts we use here
+ // should get optimized with the IPM conversion sequence.
+ auto *TrueC = dyn_cast<ConstantSDNode>(TrueOp);
+ auto *FalseC = dyn_cast<ConstantSDNode>(FalseOp);
+ if (TrueC && FalseC) {
+ int64_t TrueVal = TrueC->getSExtValue();
+ int64_t FalseVal = FalseC->getSExtValue();
+ if ((TrueVal == -1 && FalseVal == 0) || (TrueVal == 0 && FalseVal == -1)) {
+ // Invert the condition if we want -1 on false.
+ if (TrueVal == 0)
+ C.CCMask ^= C.CCValid;
+ SDValue Result = emitSETCC(DAG, DL, Glue, C.CCValid, C.CCMask);
+ EVT VT = Op.getValueType();
+ // Extend the result to VT. Upper bits are ignored.
+ if (!is32Bit(VT))
+ Result = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Result);
+ // Sign-extend from the low bit.
+ SDValue ShAmt = DAG.getConstant(VT.getSizeInBits() - 1, MVT::i32);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Result, ShAmt);
+ return DAG.getNode(ISD::SRA, DL, VT, Shl, ShAmt);
+ }
+ }
SmallVector<SDValue, 5> Ops;
Ops.push_back(TrueOp);
Ops.push_back(FalseOp);
- Ops.push_back(DAG.getConstant(CCValid, MVT::i32));
- Ops.push_back(DAG.getConstant(CCMask, MVT::i32));
- Ops.push_back(Flags);
+ Ops.push_back(DAG.getConstant(C.CCValid, MVT::i32));
+ Ops.push_back(DAG.getConstant(C.CCMask, MVT::i32));
+ Ops.push_back(Glue);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, &Ops[0], Ops.size());
@@ -1949,12 +2177,33 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
MVT::i64, HighOp, Low32);
}
+// Op is an atomic load. Lower it into a normal volatile load.
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
+ return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
+ Node->getChain(), Node->getBasePtr(),
+ Node->getMemoryVT(), Node->getMemOperand());
+}
+
+// Op is an atomic store. Lower it into a normal volatile store followed
+// by a serialization.
+SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
+ SDValue Chain = DAG.getTruncStore(Node->getChain(), SDLoc(Op), Node->getVal(),
+ Node->getBasePtr(), Node->getMemoryVT(),
+ Node->getMemOperand());
+ return SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op), MVT::Other,
+ Chain), 0);
+}
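A small C++ illustration of what these two lowerings cover, assuming the usual mapping of C++ atomics: the atomic load becomes an ordinary volatile load, and the atomic store becomes a normal store followed by the new serialization node (BCR 15,0 or BCR 14,0). The variable and function names are assumptions for illustration.

#include <atomic>

std::atomic<int> Flag;

// Hypothetical example: lowered to a plain store plus a serialization.
void publish() { Flag.store(1, std::memory_order_seq_cst); }

// Hypothetical example: lowered to a normal load; no extra fencing needed.
int observe() { return Flag.load(std::memory_order_acquire); }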
+
// Op is an 8-, 16- or 32-bit ATOMIC_LOAD_* operation.  Lower the first
// two into the fullword ATOMIC_LOADW_* operation given by Opcode.
-SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
- SelectionDAG &DAG,
- unsigned Opcode) const {
- AtomicSDNode *Node = cast<AtomicSDNode>(Op.getNode());
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op,
+ SelectionDAG &DAG,
+ unsigned Opcode) const {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
// 32-bit operations need no code outside the main loop.
EVT NarrowVT = Node->getMemoryVT();
@@ -1972,7 +2221,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
// Convert atomic subtracts of constants into additions.
if (Opcode == SystemZISD::ATOMIC_LOADW_SUB)
- if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Src2)) {
+ if (auto *Const = dyn_cast<ConstantSDNode>(Src2)) {
Opcode = SystemZISD::ATOMIC_LOADW_ADD;
Src2 = DAG.getConstant(-Const->getSExtValue(), Src2.getValueType());
}
@@ -2023,11 +2272,49 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
return DAG.getMergeValues(RetOps, 2, DL);
}
+// Op is an ATOMIC_LOAD_SUB operation. Lower 8- and 16-bit operations
+// into ATOMIC_LOADW_SUBs and decide whether to convert 32- and 64-bit
+// operations into additions.
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
+ EVT MemVT = Node->getMemoryVT();
+ if (MemVT == MVT::i32 || MemVT == MVT::i64) {
+ // A full-width operation.
+ assert(Op.getValueType() == MemVT && "Mismatched VTs");
+ SDValue Src2 = Node->getVal();
+ SDValue NegSrc2;
+ SDLoc DL(Src2);
+
+ if (auto *Op2 = dyn_cast<ConstantSDNode>(Src2)) {
+ // Use an addition if the operand is constant and either LAA(G) is
+ // available or the negative value is in the range of A(G)FHI.
+ int64_t Value = (-Op2->getAPIntValue()).getSExtValue();
+ if (isInt<32>(Value) || TM.getSubtargetImpl()->hasInterlockedAccess1())
+ NegSrc2 = DAG.getConstant(Value, MemVT);
+ } else if (TM.getSubtargetImpl()->hasInterlockedAccess1())
+ // Use LAA(G) if available.
+ NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, MemVT),
+ Src2);
+
+ if (NegSrc2.getNode())
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, MemVT,
+ Node->getChain(), Node->getBasePtr(), NegSrc2,
+ Node->getMemOperand(), Node->getOrdering(),
+ Node->getSynchScope());
+
+ // Use the node as-is.
+ return Op;
+ }
+
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);
+}
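A brief sketch, with an assumed counter variable, of the full-width case this handles: an atomic fetch-and-subtract whose operand is constant (or whose negation is cheap when the interlocked-access facility provides LAA/LAAG) is rewritten as an atomic add of the negated value.

#include <atomic>

std::atomic<long> Counter;

// Hypothetical example: rewritten as an atomic add of -8, so it can use an
// A(G)FHI immediate or LAA(G) when the facility is available.
long retire_one() { return Counter.fetch_sub(8); }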
+
// Node is an 8- or 16-bit ATOMIC_CMP_SWAP operation. Lower the first two
// into a fullword ATOMIC_CMP_SWAPW operation.
SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
SelectionDAG &DAG) const {
- AtomicSDNode *Node = cast<AtomicSDNode>(Op.getNode());
+ auto *Node = cast<AtomicSDNode>(Op.getNode());
// We have native support for 32-bit compare and swap.
EVT NarrowVT = Node->getMemoryVT();
@@ -2094,7 +2381,7 @@ SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ;
- MemIntrinsicSDNode *Node = cast<MemIntrinsicSDNode>(Op.getNode());
+ auto *Node = cast<MemIntrinsicSDNode>(Op.getNode());
SDValue Ops[] = {
Op.getOperand(0),
DAG.getConstant(Code, MVT::i32),
@@ -2143,27 +2430,31 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
case ISD::OR:
return lowerOR(Op, DAG);
case ISD::ATOMIC_SWAP:
- return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_SWAPW);
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
+ case ISD::ATOMIC_STORE:
+ return lowerATOMIC_STORE(Op, DAG);
+ case ISD::ATOMIC_LOAD:
+ return lowerATOMIC_LOAD(Op, DAG);
case ISD::ATOMIC_LOAD_ADD:
- return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
case ISD::ATOMIC_LOAD_SUB:
- return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);
+ return lowerATOMIC_LOAD_SUB(Op, DAG);
case ISD::ATOMIC_LOAD_AND:
- return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_AND);
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_AND);
case ISD::ATOMIC_LOAD_OR:
- return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_OR);
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_OR);
case ISD::ATOMIC_LOAD_XOR:
- return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_XOR);
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_XOR);
case ISD::ATOMIC_LOAD_NAND:
- return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_NAND);
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_NAND);
case ISD::ATOMIC_LOAD_MIN:
- return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_MIN);
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MIN);
case ISD::ATOMIC_LOAD_MAX:
- return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_MAX);
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MAX);
case ISD::ATOMIC_LOAD_UMIN:
- return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
case ISD::ATOMIC_LOAD_UMAX:
- return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
+ return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
case ISD::ATOMIC_CMP_SWAP:
return lowerATOMIC_CMP_SWAP(Op, DAG);
case ISD::STACKSAVE:
@@ -2185,6 +2476,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(SIBCALL);
OPCODE(PCREL_WRAPPER);
OPCODE(PCREL_OFFSET);
+ OPCODE(IABS);
OPCODE(ICMP);
OPCODE(FCMP);
OPCODE(TM);
@@ -2210,6 +2502,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(STPCPY);
OPCODE(SEARCH_STRING);
OPCODE(IPM);
+ OPCODE(SERIALIZE);
OPCODE(ATOMIC_SWAPW);
OPCODE(ATOMIC_LOADW_ADD);
OPCODE(ATOMIC_LOADW_SUB);
@@ -2228,6 +2521,39 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
#undef OPCODE
}
+SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::SIGN_EXTEND) {
+ // Convert (sext (ashr (shl X, C1), C2)) to
+ // (ashr (shl (anyext X), C1'), C2')), since wider shifts are as
+ // cheap as narrower ones.
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ if (N0.hasOneUse() && N0.getOpcode() == ISD::SRA) {
+ auto *SraAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ SDValue Inner = N0.getOperand(0);
+ if (SraAmt && Inner.hasOneUse() && Inner.getOpcode() == ISD::SHL) {
+ if (auto *ShlAmt = dyn_cast<ConstantSDNode>(Inner.getOperand(1))) {
+ unsigned Extra = (VT.getSizeInBits() -
+ N0.getValueType().getSizeInBits());
+ unsigned NewShlAmt = ShlAmt->getZExtValue() + Extra;
+ unsigned NewSraAmt = SraAmt->getZExtValue() + Extra;
+ EVT ShiftVT = N0.getOperand(1).getValueType();
+ SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SDLoc(Inner), VT,
+ Inner.getOperand(0));
+ SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(Inner), VT, Ext,
+ DAG.getConstant(NewShlAmt, ShiftVT));
+ return DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl,
+ DAG.getConstant(NewSraAmt, ShiftVT));
+ }
+ }
+ }
+ }
+ return SDValue();
+}
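A compact sketch of the shift pattern this combine rewrites; the constants are assumptions. Extracting a signed bitfield from an i32 and sign-extending it to i64 yields (sext (ashr (shl X, C1), C2)), which the combine replaces with a single pair of 64-bit shifts.

// Hypothetical example: instead of a 32-bit shl/ashr pair followed by a
// separate sign extension, the combine emits a 64-bit shl by C1+32 and
// an ashr by C2+32.
long sign_extend_field(int x) {
  return (long)((x << 20) >> 24);
}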
+
//===----------------------------------------------------------------------===//
// Custom insertion
//===----------------------------------------------------------------------===//
@@ -2236,7 +2562,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
MachineFunction &MF = *MBB->getParent();
MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
- MF.insert(llvm::next(MachineFunction::iterator(MBB)), NewMBB);
+ MF.insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
return NewMBB;
}
@@ -2246,8 +2572,7 @@ static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
MachineBasicBlock *MBB) {
MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
NewMBB->splice(NewMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
return NewMBB;
}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index c6dcca6..bceb25e 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -22,170 +22,176 @@
namespace llvm {
namespace SystemZISD {
- enum {
- FIRST_NUMBER = ISD::BUILTIN_OP_END,
-
- // Return with a flag operand. Operand 0 is the chain operand.
- RET_FLAG,
-
- // Calls a function. Operand 0 is the chain operand and operand 1
- // is the target address. The arguments start at operand 2.
- // There is an optional glue operand at the end.
- CALL,
- SIBCALL,
-
- // Wraps a TargetGlobalAddress that should be loaded using PC-relative
- // accesses (LARL). Operand 0 is the address.
- PCREL_WRAPPER,
-
- // Used in cases where an offset is applied to a TargetGlobalAddress.
- // Operand 0 is the full TargetGlobalAddress and operand 1 is a
- // PCREL_WRAPPER for an anchor point. This is used so that we can
- // cheaply refer to either the full address or the anchor point
- // as a register base.
- PCREL_OFFSET,
-
- // Integer comparisons. There are three operands: the two values
- // to compare, and an integer of type SystemZICMP.
- ICMP,
-
- // Floating-point comparisons. The two operands are the values to compare.
- FCMP,
-
- // Test under mask. The first operand is ANDed with the second operand
- // and the condition codes are set on the result. The third operand is
- // a boolean that is true if the condition codes need to distinguish
- // between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the
- // register forms do but the memory forms don't).
- TM,
-
- // Branches if a condition is true. Operand 0 is the chain operand;
- // operand 1 is the 4-bit condition-code mask, with bit N in
- // big-endian order meaning "branch if CC=N"; operand 2 is the
- // target block and operand 3 is the flag operand.
- BR_CCMASK,
-
- // Selects between operand 0 and operand 1. Operand 2 is the
- // mask of condition-code values for which operand 0 should be
- // chosen over operand 1; it has the same form as BR_CCMASK.
- // Operand 3 is the flag operand.
- SELECT_CCMASK,
-
- // Evaluates to the gap between the stack pointer and the
- // base of the dynamically-allocatable area.
- ADJDYNALLOC,
-
- // Extracts the value of a 32-bit access register. Operand 0 is
- // the number of the register.
- EXTRACT_ACCESS,
-
- // Wrappers around the ISD opcodes of the same name. The output and
- // first input operands are GR128s. The trailing numbers are the
- // widths of the second operand in bits.
- UMUL_LOHI64,
- SDIVREM32,
- SDIVREM64,
- UDIVREM32,
- UDIVREM64,
-
- // Use a series of MVCs to copy bytes from one memory location to another.
- // The operands are:
- // - the target address
- // - the source address
- // - the constant length
- //
- // This isn't a memory opcode because we'd need to attach two
- // MachineMemOperands rather than one.
- MVC,
-
- // Like MVC, but implemented as a loop that handles X*256 bytes
- // followed by straight-line code to handle the rest (if any).
- // The value of X is passed as an additional operand.
- MVC_LOOP,
-
- // Similar to MVC and MVC_LOOP, but for logic operations (AND, OR, XOR).
- NC,
- NC_LOOP,
- OC,
- OC_LOOP,
- XC,
- XC_LOOP,
-
- // Use CLC to compare two blocks of memory, with the same comments
- // as for MVC and MVC_LOOP.
- CLC,
- CLC_LOOP,
-
- // Use an MVST-based sequence to implement stpcpy().
- STPCPY,
-
- // Use a CLST-based sequence to implement strcmp(). The two input operands
- // are the addresses of the strings to compare.
- STRCMP,
-
- // Use an SRST-based sequence to search a block of memory. The first
- // operand is the end address, the second is the start, and the third
- // is the character to search for. CC is set to 1 on success and 2
- // on failure.
- SEARCH_STRING,
-
- // Store the CC value in bits 29 and 28 of an integer.
- IPM,
-
- // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
- // ATOMIC_LOAD_<op>.
- //
- // Operand 0: the address of the containing 32-bit-aligned field
- // Operand 1: the second operand of <op>, in the high bits of an i32
- // for everything except ATOMIC_SWAPW
- // Operand 2: how many bits to rotate the i32 left to bring the first
- // operand into the high bits
- // Operand 3: the negative of operand 2, for rotating the other way
- // Operand 4: the width of the field in bits (8 or 16)
- ATOMIC_SWAPW = ISD::FIRST_TARGET_MEMORY_OPCODE,
- ATOMIC_LOADW_ADD,
- ATOMIC_LOADW_SUB,
- ATOMIC_LOADW_AND,
- ATOMIC_LOADW_OR,
- ATOMIC_LOADW_XOR,
- ATOMIC_LOADW_NAND,
- ATOMIC_LOADW_MIN,
- ATOMIC_LOADW_MAX,
- ATOMIC_LOADW_UMIN,
- ATOMIC_LOADW_UMAX,
-
- // A wrapper around the inner loop of an ATOMIC_CMP_SWAP.
- //
- // Operand 0: the address of the containing 32-bit-aligned field
- // Operand 1: the compare value, in the low bits of an i32
- // Operand 2: the swap value, in the low bits of an i32
- // Operand 3: how many bits to rotate the i32 left to bring the first
- // operand into the high bits
- // Operand 4: the negative of operand 2, for rotating the other way
- // Operand 5: the width of the field in bits (8 or 16)
- ATOMIC_CMP_SWAPW,
-
- // Prefetch from the second operand using the 4-bit control code in
- // the first operand. The code is 1 for a load prefetch and 2 for
- // a store prefetch.
- PREFETCH
- };
-
- // Return true if OPCODE is some kind of PC-relative address.
- inline bool isPCREL(unsigned Opcode) {
- return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET;
- }
+enum {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ // Return with a flag operand. Operand 0 is the chain operand.
+ RET_FLAG,
+
+ // Calls a function. Operand 0 is the chain operand and operand 1
+ // is the target address. The arguments start at operand 2.
+ // There is an optional glue operand at the end.
+ CALL,
+ SIBCALL,
+
+ // Wraps a TargetGlobalAddress that should be loaded using PC-relative
+ // accesses (LARL). Operand 0 is the address.
+ PCREL_WRAPPER,
+
+ // Used in cases where an offset is applied to a TargetGlobalAddress.
+ // Operand 0 is the full TargetGlobalAddress and operand 1 is a
+ // PCREL_WRAPPER for an anchor point. This is used so that we can
+ // cheaply refer to either the full address or the anchor point
+ // as a register base.
+ PCREL_OFFSET,
+
+ // Integer absolute.
+ IABS,
+
+ // Integer comparisons. There are three operands: the two values
+ // to compare, and an integer of type SystemZICMP.
+ ICMP,
+
+ // Floating-point comparisons. The two operands are the values to compare.
+ FCMP,
+
+ // Test under mask. The first operand is ANDed with the second operand
+ // and the condition codes are set on the result. The third operand is
+ // a boolean that is true if the condition codes need to distinguish
+ // between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the
+ // register forms do but the memory forms don't).
+ TM,
+
+ // Branches if a condition is true. Operand 0 is the chain operand;
+ // operand 1 is the 4-bit condition-code mask, with bit N in
+ // big-endian order meaning "branch if CC=N"; operand 2 is the
+ // target block and operand 3 is the flag operand.
+ BR_CCMASK,
+
+ // Selects between operand 0 and operand 1. Operand 2 is the
+ // mask of condition-code values for which operand 0 should be
+ // chosen over operand 1; it has the same form as BR_CCMASK.
+ // Operand 3 is the flag operand.
+ SELECT_CCMASK,
+
+ // Evaluates to the gap between the stack pointer and the
+ // base of the dynamically-allocatable area.
+ ADJDYNALLOC,
+
+ // Extracts the value of a 32-bit access register. Operand 0 is
+ // the number of the register.
+ EXTRACT_ACCESS,
+
+ // Wrappers around the ISD opcodes of the same name. The output and
+ // first input operands are GR128s. The trailing numbers are the
+ // widths of the second operand in bits.
+ UMUL_LOHI64,
+ SDIVREM32,
+ SDIVREM64,
+ UDIVREM32,
+ UDIVREM64,
+
+ // Use a series of MVCs to copy bytes from one memory location to another.
+ // The operands are:
+ // - the target address
+ // - the source address
+ // - the constant length
+ //
+ // This isn't a memory opcode because we'd need to attach two
+ // MachineMemOperands rather than one.
+ MVC,
+
+ // Like MVC, but implemented as a loop that handles X*256 bytes
+ // followed by straight-line code to handle the rest (if any).
+ // The value of X is passed as an additional operand.
+ MVC_LOOP,
+
+ // Similar to MVC and MVC_LOOP, but for logic operations (AND, OR, XOR).
+ NC,
+ NC_LOOP,
+ OC,
+ OC_LOOP,
+ XC,
+ XC_LOOP,
+
+ // Use CLC to compare two blocks of memory, with the same comments
+ // as for MVC and MVC_LOOP.
+ CLC,
+ CLC_LOOP,
+
+ // Use an MVST-based sequence to implement stpcpy().
+ STPCPY,
+
+ // Use a CLST-based sequence to implement strcmp(). The two input operands
+ // are the addresses of the strings to compare.
+ STRCMP,
+
+ // Use an SRST-based sequence to search a block of memory. The first
+ // operand is the end address, the second is the start, and the third
+ // is the character to search for. CC is set to 1 on success and 2
+ // on failure.
+ SEARCH_STRING,
+
+ // Store the CC value in bits 29 and 28 of an integer.
+ IPM,
+
+ // Perform a serialization operation. (BCR 15,0 or BCR 14,0.)
+ SERIALIZE,
+
+ // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
+ // ATOMIC_LOAD_<op>.
+ //
+ // Operand 0: the address of the containing 32-bit-aligned field
+ // Operand 1: the second operand of <op>, in the high bits of an i32
+ // for everything except ATOMIC_SWAPW
+ // Operand 2: how many bits to rotate the i32 left to bring the first
+ // operand into the high bits
+ // Operand 3: the negative of operand 2, for rotating the other way
+ // Operand 4: the width of the field in bits (8 or 16)
+ ATOMIC_SWAPW = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ ATOMIC_LOADW_ADD,
+ ATOMIC_LOADW_SUB,
+ ATOMIC_LOADW_AND,
+ ATOMIC_LOADW_OR,
+ ATOMIC_LOADW_XOR,
+ ATOMIC_LOADW_NAND,
+ ATOMIC_LOADW_MIN,
+ ATOMIC_LOADW_MAX,
+ ATOMIC_LOADW_UMIN,
+ ATOMIC_LOADW_UMAX,
+
+ // A wrapper around the inner loop of an ATOMIC_CMP_SWAP.
+ //
+ // Operand 0: the address of the containing 32-bit-aligned field
+ // Operand 1: the compare value, in the low bits of an i32
+ // Operand 2: the swap value, in the low bits of an i32
+ // Operand 3: how many bits to rotate the i32 left to bring the first
+ // operand into the high bits
+ // Operand 4: the negative of operand 2, for rotating the other way
+ // Operand 5: the width of the field in bits (8 or 16)
+ ATOMIC_CMP_SWAPW,
+
+ // Prefetch from the second operand using the 4-bit control code in
+ // the first operand. The code is 1 for a load prefetch and 2 for
+ // a store prefetch.
+ PREFETCH
+};
+
+// Return true if OPCODE is some kind of PC-relative address.
+inline bool isPCREL(unsigned Opcode) {
+ return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET;
}
+} // end namespace SystemZISD
namespace SystemZICMP {
- // Describes whether an integer comparison needs to be signed or unsigned,
- // or whether either type is OK.
- enum {
- Any,
- UnsignedOnly,
- SignedOnly
- };
-}
+// Describes whether an integer comparison needs to be signed or unsigned,
+// or whether either type is OK.
+enum {
+ Any,
+ UnsignedOnly,
+ SignedOnly
+};
+} // end namespace SystemZICMP
class SystemZSubtarget;
class SystemZTargetMachine;
@@ -195,55 +201,51 @@ public:
explicit SystemZTargetLowering(SystemZTargetMachine &TM);
// Override TargetLowering.
- virtual MVT getScalarShiftAmountTy(EVT LHSTy) const LLVM_OVERRIDE {
+ MVT getScalarShiftAmountTy(EVT LHSTy) const override {
return MVT::i32;
}
- virtual EVT getSetCCResultType(LLVMContext &, EVT) const LLVM_OVERRIDE;
- virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const LLVM_OVERRIDE;
- virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const LLVM_OVERRIDE;
- virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const
- LLVM_OVERRIDE;
- virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const
- LLVM_OVERRIDE;
- virtual bool isTruncateFree(Type *, Type *) const LLVM_OVERRIDE;
- virtual bool isTruncateFree(EVT, EVT) const LLVM_OVERRIDE;
- virtual const char *getTargetNodeName(unsigned Opcode) const LLVM_OVERRIDE;
- virtual std::pair<unsigned, const TargetRegisterClass *>
+ EVT getSetCCResultType(LLVMContext &, EVT) const override;
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+ bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
+ bool *Fast) const override;
+ bool isTruncateFree(Type *, Type *) const override;
+ bool isTruncateFree(EVT, EVT) const override;
+ const char *getTargetNodeName(unsigned Opcode) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const LLVM_OVERRIDE;
- virtual TargetLowering::ConstraintType
- getConstraintType(const std::string &Constraint) const LLVM_OVERRIDE;
- virtual TargetLowering::ConstraintWeight
+ MVT VT) const override;
+ TargetLowering::ConstraintType
+ getConstraintType(const std::string &Constraint) const override;
+ TargetLowering::ConstraintWeight
getSingleConstraintMatchWeight(AsmOperandInfo &info,
- const char *constraint) const LLVM_OVERRIDE;
- virtual void
- LowerAsmOperandForConstraint(SDValue Op,
- std::string &Constraint,
- std::vector<SDValue> &Ops,
- SelectionDAG &DAG) const LLVM_OVERRIDE;
- virtual MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *BB) const LLVM_OVERRIDE;
- virtual SDValue LowerOperation(SDValue Op,
- SelectionDAG &DAG) const LLVM_OVERRIDE;
- virtual bool allowTruncateForTailCall(Type *, Type *) const LLVM_OVERRIDE;
- virtual bool mayBeEmittedAsTailCall(CallInst *CI) const LLVM_OVERRIDE;
- virtual SDValue
- LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const LLVM_OVERRIDE;
- virtual SDValue
- LowerCall(CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const LLVM_OVERRIDE;
-
- virtual SDValue
- LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool IsVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const LLVM_OVERRIDE;
+ const char *constraint) const override;
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+ MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const
+ override;
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ bool allowTruncateForTailCall(Type *, Type *) const override;
+ bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SDLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc DL, SelectionDAG &DAG) const override;
+ SDValue prepareVolatileOrAtomicLoad(SDValue Chain, SDLoc DL,
+ SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
private:
const SystemZSubtarget &Subtarget;
@@ -270,9 +272,13 @@ private:
SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG,
- unsigned Opcode) const;
+ SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
+ unsigned Opcode) const;
+ SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerLOAD_SEQUENCE_POINT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/SystemZ/SystemZInstrBuilder.h b/lib/Target/SystemZ/SystemZInstrBuilder.h
index fb699b9..84196e9 100644
--- a/lib/Target/SystemZ/SystemZInstrBuilder.h
+++ b/lib/Target/SystemZ/SystemZInstrBuilder.h
@@ -43,6 +43,6 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) {
return MIB.addFrameIndex(FI).addImm(Offset).addReg(0).addMemOperand(MMO);
}
-} // End llvm namespace
+} // end namespace llvm
#endif
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index 6080046..a1e782c 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -46,9 +46,9 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>;
defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>;
}
-def : CompareZeroFP<LTEBRCompare, FP32>;
-def : CompareZeroFP<LTDBRCompare, FP64>;
-def : CompareZeroFP<LTXBRCompare, FP128>;
+defm : CompareZeroFP<LTEBRCompare, FP32>;
+defm : CompareZeroFP<LTDBRCompare, FP64>;
+defm : CompareZeroFP<LTXBRCompare, FP128>;
// Moves between 64-bit integer and floating-point registers.
def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>;
@@ -157,6 +157,25 @@ def CEGBR : UnaryRRE<"cegb", 0xB3A4, sint_to_fp, FP32, GR64>;
def CDGBR : UnaryRRE<"cdgb", 0xB3A5, sint_to_fp, FP64, GR64>;
def CXGBR : UnaryRRE<"cxgb", 0xB3A6, sint_to_fp, FP128, GR64>;
+// Convert an unsigned integer register value to a floating-point one.
+let Predicates = [FeatureFPExtension] in {
+ def CELFBR : UnaryRRF4<"celfbr", 0xB390, FP32, GR32>;
+ def CDLFBR : UnaryRRF4<"cdlfbr", 0xB391, FP64, GR32>;
+ def CXLFBR : UnaryRRF4<"cxlfbr", 0xB392, FP128, GR32>;
+
+ def CELGBR : UnaryRRF4<"celgbr", 0xB3A0, FP32, GR64>;
+ def CDLGBR : UnaryRRF4<"cdlgbr", 0xB3A1, FP64, GR64>;
+ def CXLGBR : UnaryRRF4<"cxlgbr", 0xB3A2, FP128, GR64>;
+
+ def : Pat<(f32 (uint_to_fp GR32:$src)), (CELFBR 0, GR32:$src, 0)>;
+ def : Pat<(f64 (uint_to_fp GR32:$src)), (CDLFBR 0, GR32:$src, 0)>;
+ def : Pat<(f128 (uint_to_fp GR32:$src)), (CXLFBR 0, GR32:$src, 0)>;
+
+ def : Pat<(f32 (uint_to_fp GR64:$src)), (CELGBR 0, GR64:$src, 0)>;
+ def : Pat<(f64 (uint_to_fp GR64:$src)), (CDLGBR 0, GR64:$src, 0)>;
+ def : Pat<(f128 (uint_to_fp GR64:$src)), (CXLGBR 0, GR64:$src, 0)>;
+}
+
// Convert a floating-point register value to a signed integer value,
// with the second operand (modifier M3) specifying the rounding mode.
let Defs = [CC] in {
@@ -178,6 +197,28 @@ def : Pat<(i64 (fp_to_sint FP32:$src)), (CGEBR 5, FP32:$src)>;
def : Pat<(i64 (fp_to_sint FP64:$src)), (CGDBR 5, FP64:$src)>;
def : Pat<(i64 (fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>;
+// Convert a floating-point register value to an unsigned integer value.
+let Predicates = [FeatureFPExtension] in {
+ let Defs = [CC] in {
+ def CLFEBR : UnaryRRF4<"clfebr", 0xB39C, GR32, FP32>;
+ def CLFDBR : UnaryRRF4<"clfdbr", 0xB39D, GR32, FP64>;
+ def CLFXBR : UnaryRRF4<"clfxbr", 0xB39E, GR32, FP128>;
+
+ def CLGEBR : UnaryRRF4<"clgebr", 0xB3AC, GR64, FP32>;
+ def CLGDBR : UnaryRRF4<"clgdbr", 0xB3AD, GR64, FP64>;
+ def CLGXBR : UnaryRRF4<"clgxbr", 0xB3AE, GR64, FP128>;
+ }
+
+ def : Pat<(i32 (fp_to_uint FP32:$src)), (CLFEBR 5, FP32:$src, 0)>;
+ def : Pat<(i32 (fp_to_uint FP64:$src)), (CLFDBR 5, FP64:$src, 0)>;
+ def : Pat<(i32 (fp_to_uint FP128:$src)), (CLFXBR 5, FP128:$src, 0)>;
+
+ def : Pat<(i64 (fp_to_uint FP32:$src)), (CLGEBR 5, FP32:$src, 0)>;
+ def : Pat<(i64 (fp_to_uint FP64:$src)), (CLGDBR 5, FP64:$src, 0)>;
+ def : Pat<(i64 (fp_to_uint FP128:$src)), (CLGXBR 5, FP128:$src, 0)>;
+}
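A short C++ sketch of what these new patterns make directly selectable, assuming the FP-extension facility is enabled: unsigned-to-float conversions map onto the C*L*BR forms above (for example CDLGBR) and float-to-unsigned onto the CL*BR forms (for example CLGDBR), instead of longer sequences or library calls.

// Hypothetical examples of conversions covered by the patterns above.
double u64_to_f64(unsigned long long x) { return (double)x; }
unsigned long long f64_to_u64(double d) { return (unsigned long long)d; }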
+
+
//===----------------------------------------------------------------------===//
// Unary arithmetic
//===----------------------------------------------------------------------===//
@@ -217,15 +258,6 @@ def FIEBR : UnaryRRF<"fieb", 0xB357, FP32, FP32>;
def FIDBR : UnaryRRF<"fidb", 0xB35F, FP64, FP64>;
def FIXBR : UnaryRRF<"fixb", 0xB347, FP128, FP128>;
-// Extended forms of the previous three instructions. M4 can be set to 4
-// to suppress detection of inexact conditions.
-def FIEBRA : UnaryRRF4<"fiebra", 0xB357, FP32, FP32>,
- Requires<[FeatureFPExtension]>;
-def FIDBRA : UnaryRRF4<"fidbra", 0xB35F, FP64, FP64>,
- Requires<[FeatureFPExtension]>;
-def FIXBRA : UnaryRRF4<"fixbra", 0xB347, FP128, FP128>,
- Requires<[FeatureFPExtension]>;
-
// frint rounds according to the current mode (modifier 0) and detects
// inexact conditions.
def : Pat<(frint FP32:$src), (FIEBR 0, FP32:$src)>;
@@ -233,6 +265,12 @@ def : Pat<(frint FP64:$src), (FIDBR 0, FP64:$src)>;
def : Pat<(frint FP128:$src), (FIXBR 0, FP128:$src)>;
let Predicates = [FeatureFPExtension] in {
+ // Extended forms of the FIxBR instructions. M4 can be set to 4
+ // to suppress detection of inexact conditions.
+ def FIEBRA : UnaryRRF4<"fiebra", 0xB357, FP32, FP32>;
+ def FIDBRA : UnaryRRF4<"fidbra", 0xB35F, FP64, FP64>;
+ def FIXBRA : UnaryRRF4<"fixbra", 0xB347, FP128, FP128>;
+
// fnearbyint is like frint but does not detect inexact conditions.
def : Pat<(fnearbyint FP32:$src), (FIEBRA 0, FP32:$src, 4)>;
def : Pat<(fnearbyint FP64:$src), (FIDBRA 0, FP64:$src, 4)>;
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index a8efe16..50badf8 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -531,6 +531,10 @@ class InstSS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
// Ternary:
// One register output operand and three register input operands.
//
+// LoadAndOp:
+// One output operand and two input operands. The first input operand
+// is a register and the second is an address.
+//
// CmpSwap:
// One output operand and three input operands. The first two
// operands are registers and the third is an address. The instruction
@@ -1267,6 +1271,15 @@ class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let AccessBytes = bytes;
}
+class LoadAndOpRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, AddressingMode mode = bdaddr20only>
+ : InstRSY<opcode, (outs cls:$R1), (ins cls:$R3, mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2",
+ [(set cls:$R1, (operator mode:$BD2, cls:$R3))]> {
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
RegisterOperand cls, AddressingMode mode = bdaddr12only>
: InstRS<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index acfeed8..e20834c 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "SystemZInstrInfo.h"
-#include "SystemZTargetMachine.h"
#include "SystemZInstrBuilder.h"
+#include "SystemZTargetMachine.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -53,7 +53,7 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
MachineFunction &MF = *MBB->getParent();
// Get two load or store instructions. Use the original instruction for one
- // of them (arbitarily the second here) and create a clone for the other.
+ // of them (arbitrarily the second here) and create a clone for the other.
MachineInstr *EarlierMI = MF.CloneMachineInstr(MI);
MBB->insert(MI, EarlierMI);
@@ -280,8 +280,8 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
}
// If the block has any instructions after a JMP, delete them.
- while (llvm::next(I) != MBB.end())
- llvm::next(I)->eraseFromParent();
+ while (std::next(I) != MBB.end())
+ std::next(I)->eraseFromParent();
Cond.clear();
FBB = 0;
@@ -628,16 +628,16 @@ static bool isSimpleBD12Move(const MachineInstr *MI, unsigned Flag) {
}
namespace {
- struct LogicOp {
- LogicOp() : RegSize(0), ImmLSB(0), ImmSize(0) {}
- LogicOp(unsigned regSize, unsigned immLSB, unsigned immSize)
- : RegSize(regSize), ImmLSB(immLSB), ImmSize(immSize) {}
+struct LogicOp {
+ LogicOp() : RegSize(0), ImmLSB(0), ImmSize(0) {}
+ LogicOp(unsigned regSize, unsigned immLSB, unsigned immSize)
+ : RegSize(regSize), ImmLSB(immLSB), ImmSize(immSize) {}
- operator bool() const { return RegSize; }
+ operator bool() const { return RegSize; }
- unsigned RegSize, ImmLSB, ImmSize;
- };
-}
+ unsigned RegSize, ImmLSB, ImmSize;
+};
+} // end anonymous namespace
static LogicOp interpretAndImmediate(unsigned Opcode) {
switch (Opcode) {
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index be4c8fe..55f80af 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -26,89 +26,89 @@ namespace llvm {
class SystemZTargetMachine;
namespace SystemZII {
- enum {
- // See comments in SystemZInstrFormats.td.
- SimpleBDXLoad = (1 << 0),
- SimpleBDXStore = (1 << 1),
- Has20BitOffset = (1 << 2),
- HasIndex = (1 << 3),
- Is128Bit = (1 << 4),
- AccessSizeMask = (31 << 5),
- AccessSizeShift = 5,
- CCValuesMask = (15 << 10),
- CCValuesShift = 10,
- CompareZeroCCMaskMask = (15 << 14),
- CompareZeroCCMaskShift = 14,
- CCMaskFirst = (1 << 18),
- CCMaskLast = (1 << 19),
- IsLogical = (1 << 20)
- };
- static inline unsigned getAccessSize(unsigned int Flags) {
- return (Flags & AccessSizeMask) >> AccessSizeShift;
- }
- static inline unsigned getCCValues(unsigned int Flags) {
- return (Flags & CCValuesMask) >> CCValuesShift;
- }
- static inline unsigned getCompareZeroCCMask(unsigned int Flags) {
- return (Flags & CompareZeroCCMaskMask) >> CompareZeroCCMaskShift;
- }
-
- // SystemZ MachineOperand target flags.
- enum {
- // Masks out the bits for the access model.
- MO_SYMBOL_MODIFIER = (1 << 0),
-
- // @GOT (aka @GOTENT)
- MO_GOT = (1 << 0)
- };
- // Classifies a branch.
- enum BranchType {
- // An instruction that branches on the current value of CC.
- BranchNormal,
-
-    // An instruction that performs a 32-bit signed comparison and branches
- // on the result.
- BranchC,
-
-    // An instruction that performs a 32-bit unsigned comparison and branches
- // on the result.
- BranchCL,
-
-    // An instruction that performs a 64-bit signed comparison and branches
- // on the result.
- BranchCG,
-
-    // An instruction that performs a 64-bit unsigned comparison and branches
- // on the result.
- BranchCLG,
+enum {
+ // See comments in SystemZInstrFormats.td.
+ SimpleBDXLoad = (1 << 0),
+ SimpleBDXStore = (1 << 1),
+ Has20BitOffset = (1 << 2),
+ HasIndex = (1 << 3),
+ Is128Bit = (1 << 4),
+ AccessSizeMask = (31 << 5),
+ AccessSizeShift = 5,
+ CCValuesMask = (15 << 10),
+ CCValuesShift = 10,
+ CompareZeroCCMaskMask = (15 << 14),
+ CompareZeroCCMaskShift = 14,
+ CCMaskFirst = (1 << 18),
+ CCMaskLast = (1 << 19),
+ IsLogical = (1 << 20)
+};
+static inline unsigned getAccessSize(unsigned int Flags) {
+ return (Flags & AccessSizeMask) >> AccessSizeShift;
+}
+static inline unsigned getCCValues(unsigned int Flags) {
+ return (Flags & CCValuesMask) >> CCValuesShift;
+}
+static inline unsigned getCompareZeroCCMask(unsigned int Flags) {
+ return (Flags & CompareZeroCCMaskMask) >> CompareZeroCCMaskShift;
+}
- // An instruction that decrements a 32-bit register and branches if
- // the result is nonzero.
- BranchCT,
+// SystemZ MachineOperand target flags.
+enum {
+ // Masks out the bits for the access model.
+ MO_SYMBOL_MODIFIER = (1 << 0),
- // An instruction that decrements a 64-bit register and branches if
- // the result is nonzero.
- BranchCTG
- };
- // Information about a branch instruction.
- struct Branch {
- // The type of the branch.
- BranchType Type;
+ // @GOT (aka @GOTENT)
+ MO_GOT = (1 << 0)
+};
+// Classifies a branch.
+enum BranchType {
+ // An instruction that branches on the current value of CC.
+ BranchNormal,
+
+  // An instruction that performs a 32-bit signed comparison and branches
+ // on the result.
+ BranchC,
+
+  // An instruction that performs a 32-bit unsigned comparison and branches
+ // on the result.
+ BranchCL,
+
+  // An instruction that performs a 64-bit signed comparison and branches
+ // on the result.
+ BranchCG,
+
+  // An instruction that performs a 64-bit unsigned comparison and branches
+ // on the result.
+ BranchCLG,
+
+ // An instruction that decrements a 32-bit register and branches if
+ // the result is nonzero.
+ BranchCT,
+
+ // An instruction that decrements a 64-bit register and branches if
+ // the result is nonzero.
+ BranchCTG
+};
+// Information about a branch instruction.
+struct Branch {
+ // The type of the branch.
+ BranchType Type;
- // CCMASK_<N> is set if CC might be equal to N.
- unsigned CCValid;
+ // CCMASK_<N> is set if CC might be equal to N.
+ unsigned CCValid;
- // CCMASK_<N> is set if the branch should be taken when CC == N.
- unsigned CCMask;
+ // CCMASK_<N> is set if the branch should be taken when CC == N.
+ unsigned CCMask;
- // The target of the branch.
- const MachineOperand *Target;
+ // The target of the branch.
+ const MachineOperand *Target;
- Branch(BranchType type, unsigned ccValid, unsigned ccMask,
- const MachineOperand *target)
- : Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {}
- };
-}
+ Branch(BranchType type, unsigned ccValid, unsigned ccMask,
+ const MachineOperand *target)
+ : Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {}
+};
+} // end namespace SystemZII
class SystemZInstrInfo : public SystemZGenInstrInfo {
const SystemZRegisterInfo RI;
@@ -133,78 +133,63 @@ public:
explicit SystemZInstrInfo(SystemZTargetMachine &TM);
// Override TargetInstrInfo.
- virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
- int &FrameIndex) const LLVM_OVERRIDE;
- virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
- int &FrameIndex) const LLVM_OVERRIDE;
- virtual bool isStackSlotCopy(const MachineInstr *MI, int &DestFrameIndex,
- int &SrcFrameIndex) const LLVM_OVERRIDE;
- virtual bool AnalyzeBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const LLVM_OVERRIDE;
- virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const LLVM_OVERRIDE;
- virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- const SmallVectorImpl<MachineOperand> &Cond,
- DebugLoc DL) const LLVM_OVERRIDE;
+ unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
+ bool isStackSlotCopy(const MachineInstr *MI, int &DestFrameIndex,
+ int &SrcFrameIndex) const override;
+ bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ DebugLoc DL) const override;
bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &Mask, int &Value) const
- LLVM_OVERRIDE;
+ unsigned &SrcReg2, int &Mask, int &Value) const override;
bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int Mask, int Value,
- const MachineRegisterInfo *MRI) const LLVM_OVERRIDE;
- virtual bool isPredicable(MachineInstr *MI) const LLVM_OVERRIDE;
- virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
- unsigned ExtraPredCycles,
- const BranchProbability &Probability) const
- LLVM_OVERRIDE;
- virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
- unsigned NumCyclesT,
- unsigned ExtraPredCyclesT,
- MachineBasicBlock &FMBB,
- unsigned NumCyclesF,
- unsigned ExtraPredCyclesF,
- const BranchProbability &Probability) const
- LLVM_OVERRIDE;
- virtual bool
- PredicateInstruction(MachineInstr *MI,
- const SmallVectorImpl<MachineOperand> &Pred) const
- LLVM_OVERRIDE;
- virtual void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const LLVM_OVERRIDE;
- virtual void
- storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const LLVM_OVERRIDE;
- virtual void
- loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const LLVM_OVERRIDE;
- virtual MachineInstr *
- convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const;
- virtual MachineInstr *
- foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops,
- int FrameIndex) const;
- virtual MachineInstr *
- foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- MachineInstr* LoadMI) const;
- virtual bool
- expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const LLVM_OVERRIDE;
- virtual bool
- ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
- LLVM_OVERRIDE;
+ const MachineRegisterInfo *MRI) const override;
+ bool isPredicable(MachineInstr *MI) const override;
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ unsigned ExtraPredCycles,
+ const BranchProbability &Probability) const override;
+ bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+ unsigned NumCyclesT, unsigned ExtraPredCyclesT,
+ MachineBasicBlock &FMBB,
+ unsigned NumCyclesF, unsigned ExtraPredCyclesF,
+ const BranchProbability &Probability) const override;
+ bool PredicateInstruction(MachineInstr *MI,
+ const SmallVectorImpl<MachineOperand> &Pred) const
+ override;
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const override;
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const override;
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const override;
+ bool expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const override;
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
+ override;
// Return the SystemZRegisterInfo, which this class owns.
const SystemZRegisterInfo &getRegisterInfo() const { return RI; }
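
The header changes above are part of a tree-wide cleanup: the LLVM_OVERRIDE macro is replaced by the C++11 override keyword and the now-redundant virtual is dropped from overriding declarations. A reduced sketch of the idiom, with invented names rather than code from the tree:

struct TargetHooksBase {
  virtual unsigned frameSize() const { return 0; }
  virtual ~TargetHooksBase() {}
};

struct MyTargetHooks : TargetHooksBase {
  // "override" makes the compiler reject this method if the base-class
  // signature ever changes; repeating "virtual" adds nothing.
  unsigned frameSize() const override { return 160; }
};
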
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index 6524e44..e70df92 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -595,22 +595,28 @@ let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
let Defs = [CC] in {
let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
- def LPR : UnaryRR <"lp", 0x10, z_iabs32, GR32, GR32>;
- def LPGR : UnaryRRE<"lpg", 0xB900, z_iabs64, GR64, GR64>;
+ def LPR : UnaryRR <"lp", 0x10, z_iabs, GR32, GR32>;
+ def LPGR : UnaryRRE<"lpg", 0xB900, z_iabs, GR64, GR64>;
}
let CCValues = 0xE, CompareZeroCCMask = 0xE in
def LPGFR : UnaryRRE<"lpgf", 0xB910, null_frag, GR64, GR32>;
}
+def : Pat<(z_iabs32 GR32:$src), (LPR GR32:$src)>;
+def : Pat<(z_iabs64 GR64:$src), (LPGR GR64:$src)>;
+defm : SXU<z_iabs, LPGFR>;
defm : SXU<z_iabs64, LPGFR>;
let Defs = [CC] in {
let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
- def LNR : UnaryRR <"ln", 0x11, z_inegabs32, GR32, GR32>;
- def LNGR : UnaryRRE<"lng", 0xB901, z_inegabs64, GR64, GR64>;
+ def LNR : UnaryRR <"ln", 0x11, z_inegabs, GR32, GR32>;
+ def LNGR : UnaryRRE<"lng", 0xB901, z_inegabs, GR64, GR64>;
}
let CCValues = 0xE, CompareZeroCCMask = 0xE in
def LNGFR : UnaryRRE<"lngf", 0xB911, null_frag, GR64, GR32>;
}
+def : Pat<(z_inegabs32 GR32:$src), (LNR GR32:$src)>;
+def : Pat<(z_inegabs64 GR64:$src), (LNGR GR64:$src)>;
+defm : SXU<z_inegabs, LNGFR>;
defm : SXU<z_inegabs64, LNGFR>;
let Defs = [CC] in {
@@ -753,7 +759,7 @@ let Defs = [CC], Uses = [CC] in {
// Subtraction
//===----------------------------------------------------------------------===//
-// Plain substraction. Although immediate forms exist, we use the
+// Plain subtraction. Although immediate forms exist, we use the
// add-immediate instruction instead.
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
// Subtraction of a register.
@@ -1043,15 +1049,15 @@ let Defs = [CC] in {
// Forms of RISBG that only affect one word of the destination register.
// They do not set CC.
-def RISBMux : RotateSelectRIEfPseudo<GRX32, GRX32>, Requires<[FeatureHighWord]>;
-def RISBLL : RotateSelectAliasRIEf<GR32, GR32>, Requires<[FeatureHighWord]>;
-def RISBLH : RotateSelectAliasRIEf<GR32, GRH32>, Requires<[FeatureHighWord]>;
-def RISBHL : RotateSelectAliasRIEf<GRH32, GR32>, Requires<[FeatureHighWord]>;
-def RISBHH : RotateSelectAliasRIEf<GRH32, GRH32>, Requires<[FeatureHighWord]>;
-def RISBLG : RotateSelectRIEf<"risblg", 0xEC51, GR32, GR64>,
- Requires<[FeatureHighWord]>;
-def RISBHG : RotateSelectRIEf<"risbhg", 0xEC5D, GRH32, GR64>,
- Requires<[FeatureHighWord]>;
+let Predicates = [FeatureHighWord] in {
+ def RISBMux : RotateSelectRIEfPseudo<GRX32, GRX32>;
+ def RISBLL : RotateSelectAliasRIEf<GR32, GR32>;
+ def RISBLH : RotateSelectAliasRIEf<GR32, GRH32>;
+ def RISBHL : RotateSelectAliasRIEf<GRH32, GR32>;
+ def RISBHH : RotateSelectAliasRIEf<GRH32, GRH32>;
+ def RISBLG : RotateSelectRIEf<"risblg", 0xEC51, GR32, GR64>;
+ def RISBHG : RotateSelectRIEf<"risbhg", 0xEC5D, GRH32, GR64>;
+}
// Rotate second operand left and perform a logical operation with selected
// bits of the first operand. The CC result only describes the selected bits,
@@ -1195,58 +1201,89 @@ def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>;
// Atomic operations
//===----------------------------------------------------------------------===//
-def ATOMIC_SWAPW : AtomicLoadWBinaryReg<z_atomic_swapw>;
-def ATOMIC_SWAP_32 : AtomicLoadBinaryReg32<atomic_swap_32>;
-def ATOMIC_SWAP_64 : AtomicLoadBinaryReg64<atomic_swap_64>;
-
-def ATOMIC_LOADW_AR : AtomicLoadWBinaryReg<z_atomic_loadw_add>;
-def ATOMIC_LOADW_AFI : AtomicLoadWBinaryImm<z_atomic_loadw_add, simm32>;
-def ATOMIC_LOAD_AR : AtomicLoadBinaryReg32<atomic_load_add_32>;
-def ATOMIC_LOAD_AHI : AtomicLoadBinaryImm32<atomic_load_add_32, imm32sx16>;
-def ATOMIC_LOAD_AFI : AtomicLoadBinaryImm32<atomic_load_add_32, simm32>;
-def ATOMIC_LOAD_AGR : AtomicLoadBinaryReg64<atomic_load_add_64>;
-def ATOMIC_LOAD_AGHI : AtomicLoadBinaryImm64<atomic_load_add_64, imm64sx16>;
-def ATOMIC_LOAD_AGFI : AtomicLoadBinaryImm64<atomic_load_add_64, imm64sx32>;
-
-def ATOMIC_LOADW_SR : AtomicLoadWBinaryReg<z_atomic_loadw_sub>;
-def ATOMIC_LOAD_SR : AtomicLoadBinaryReg32<atomic_load_sub_32>;
-def ATOMIC_LOAD_SGR : AtomicLoadBinaryReg64<atomic_load_sub_64>;
-
-def ATOMIC_LOADW_NR : AtomicLoadWBinaryReg<z_atomic_loadw_and>;
-def ATOMIC_LOADW_NILH : AtomicLoadWBinaryImm<z_atomic_loadw_and, imm32lh16c>;
-def ATOMIC_LOAD_NR : AtomicLoadBinaryReg32<atomic_load_and_32>;
-def ATOMIC_LOAD_NILL : AtomicLoadBinaryImm32<atomic_load_and_32, imm32ll16c>;
-def ATOMIC_LOAD_NILH : AtomicLoadBinaryImm32<atomic_load_and_32, imm32lh16c>;
-def ATOMIC_LOAD_NILF : AtomicLoadBinaryImm32<atomic_load_and_32, uimm32>;
-def ATOMIC_LOAD_NGR : AtomicLoadBinaryReg64<atomic_load_and_64>;
-def ATOMIC_LOAD_NILL64 : AtomicLoadBinaryImm64<atomic_load_and_64, imm64ll16c>;
-def ATOMIC_LOAD_NILH64 : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lh16c>;
-def ATOMIC_LOAD_NIHL64 : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hl16c>;
-def ATOMIC_LOAD_NIHH64 : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hh16c>;
-def ATOMIC_LOAD_NILF64 : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lf32c>;
-def ATOMIC_LOAD_NIHF64 : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hf32c>;
+def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>;
+
+let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
+ def LAA : LoadAndOpRSY<"laa", 0xEBF8, atomic_load_add_32, GR32>;
+ def LAAG : LoadAndOpRSY<"laag", 0xEBE8, atomic_load_add_64, GR64>;
+ def LAAL : LoadAndOpRSY<"laal", 0xEBFA, null_frag, GR32>;
+ def LAALG : LoadAndOpRSY<"laalg", 0xEBEA, null_frag, GR64>;
+ def LAN : LoadAndOpRSY<"lan", 0xEBF4, atomic_load_and_32, GR32>;
+ def LANG : LoadAndOpRSY<"lang", 0xEBE4, atomic_load_and_64, GR64>;
+ def LAO : LoadAndOpRSY<"lao", 0xEBF6, atomic_load_or_32, GR32>;
+ def LAOG : LoadAndOpRSY<"laog", 0xEBE6, atomic_load_or_64, GR64>;
+ def LAX : LoadAndOpRSY<"lax", 0xEBF7, atomic_load_xor_32, GR32>;
+ def LAXG : LoadAndOpRSY<"laxg", 0xEBE7, atomic_load_xor_64, GR64>;
+}
+
+def ATOMIC_SWAPW : AtomicLoadWBinaryReg<z_atomic_swapw>;
+def ATOMIC_SWAP_32 : AtomicLoadBinaryReg32<atomic_swap_32>;
+def ATOMIC_SWAP_64 : AtomicLoadBinaryReg64<atomic_swap_64>;
+
+def ATOMIC_LOADW_AR : AtomicLoadWBinaryReg<z_atomic_loadw_add>;
+def ATOMIC_LOADW_AFI : AtomicLoadWBinaryImm<z_atomic_loadw_add, simm32>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+ def ATOMIC_LOAD_AR : AtomicLoadBinaryReg32<atomic_load_add_32>;
+ def ATOMIC_LOAD_AHI : AtomicLoadBinaryImm32<atomic_load_add_32, imm32sx16>;
+ def ATOMIC_LOAD_AFI : AtomicLoadBinaryImm32<atomic_load_add_32, simm32>;
+ def ATOMIC_LOAD_AGR : AtomicLoadBinaryReg64<atomic_load_add_64>;
+ def ATOMIC_LOAD_AGHI : AtomicLoadBinaryImm64<atomic_load_add_64, imm64sx16>;
+ def ATOMIC_LOAD_AGFI : AtomicLoadBinaryImm64<atomic_load_add_64, imm64sx32>;
+}
+
+def ATOMIC_LOADW_SR : AtomicLoadWBinaryReg<z_atomic_loadw_sub>;
+def ATOMIC_LOAD_SR : AtomicLoadBinaryReg32<atomic_load_sub_32>;
+def ATOMIC_LOAD_SGR : AtomicLoadBinaryReg64<atomic_load_sub_64>;
+
+def ATOMIC_LOADW_NR : AtomicLoadWBinaryReg<z_atomic_loadw_and>;
+def ATOMIC_LOADW_NILH : AtomicLoadWBinaryImm<z_atomic_loadw_and, imm32lh16c>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+ def ATOMIC_LOAD_NR : AtomicLoadBinaryReg32<atomic_load_and_32>;
+ def ATOMIC_LOAD_NILL : AtomicLoadBinaryImm32<atomic_load_and_32,
+ imm32ll16c>;
+ def ATOMIC_LOAD_NILH : AtomicLoadBinaryImm32<atomic_load_and_32,
+ imm32lh16c>;
+ def ATOMIC_LOAD_NILF : AtomicLoadBinaryImm32<atomic_load_and_32, uimm32>;
+ def ATOMIC_LOAD_NGR : AtomicLoadBinaryReg64<atomic_load_and_64>;
+ def ATOMIC_LOAD_NILL64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+ imm64ll16c>;
+ def ATOMIC_LOAD_NILH64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+ imm64lh16c>;
+ def ATOMIC_LOAD_NIHL64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+ imm64hl16c>;
+ def ATOMIC_LOAD_NIHH64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+ imm64hh16c>;
+ def ATOMIC_LOAD_NILF64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+ imm64lf32c>;
+ def ATOMIC_LOAD_NIHF64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+ imm64hf32c>;
+}
def ATOMIC_LOADW_OR : AtomicLoadWBinaryReg<z_atomic_loadw_or>;
def ATOMIC_LOADW_OILH : AtomicLoadWBinaryImm<z_atomic_loadw_or, imm32lh16>;
-def ATOMIC_LOAD_OR : AtomicLoadBinaryReg32<atomic_load_or_32>;
-def ATOMIC_LOAD_OILL : AtomicLoadBinaryImm32<atomic_load_or_32, imm32ll16>;
-def ATOMIC_LOAD_OILH : AtomicLoadBinaryImm32<atomic_load_or_32, imm32lh16>;
-def ATOMIC_LOAD_OILF : AtomicLoadBinaryImm32<atomic_load_or_32, uimm32>;
-def ATOMIC_LOAD_OGR : AtomicLoadBinaryReg64<atomic_load_or_64>;
-def ATOMIC_LOAD_OILL64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64ll16>;
-def ATOMIC_LOAD_OILH64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lh16>;
-def ATOMIC_LOAD_OIHL64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hl16>;
-def ATOMIC_LOAD_OIHH64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hh16>;
-def ATOMIC_LOAD_OILF64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lf32>;
-def ATOMIC_LOAD_OIHF64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hf32>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+ def ATOMIC_LOAD_OR : AtomicLoadBinaryReg32<atomic_load_or_32>;
+ def ATOMIC_LOAD_OILL : AtomicLoadBinaryImm32<atomic_load_or_32, imm32ll16>;
+ def ATOMIC_LOAD_OILH : AtomicLoadBinaryImm32<atomic_load_or_32, imm32lh16>;
+ def ATOMIC_LOAD_OILF : AtomicLoadBinaryImm32<atomic_load_or_32, uimm32>;
+ def ATOMIC_LOAD_OGR : AtomicLoadBinaryReg64<atomic_load_or_64>;
+ def ATOMIC_LOAD_OILL64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64ll16>;
+ def ATOMIC_LOAD_OILH64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lh16>;
+ def ATOMIC_LOAD_OIHL64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hl16>;
+ def ATOMIC_LOAD_OIHH64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hh16>;
+ def ATOMIC_LOAD_OILF64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lf32>;
+ def ATOMIC_LOAD_OIHF64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hf32>;
+}
def ATOMIC_LOADW_XR : AtomicLoadWBinaryReg<z_atomic_loadw_xor>;
def ATOMIC_LOADW_XILF : AtomicLoadWBinaryImm<z_atomic_loadw_xor, uimm32>;
-def ATOMIC_LOAD_XR : AtomicLoadBinaryReg32<atomic_load_xor_32>;
-def ATOMIC_LOAD_XILF : AtomicLoadBinaryImm32<atomic_load_xor_32, uimm32>;
-def ATOMIC_LOAD_XGR : AtomicLoadBinaryReg64<atomic_load_xor_64>;
-def ATOMIC_LOAD_XILF64 : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64lf32>;
-def ATOMIC_LOAD_XIHF64 : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64hf32>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+ def ATOMIC_LOAD_XR : AtomicLoadBinaryReg32<atomic_load_xor_32>;
+ def ATOMIC_LOAD_XILF : AtomicLoadBinaryImm32<atomic_load_xor_32, uimm32>;
+ def ATOMIC_LOAD_XGR : AtomicLoadBinaryReg64<atomic_load_xor_64>;
+ def ATOMIC_LOAD_XILF64 : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64lf32>;
+ def ATOMIC_LOAD_XIHF64 : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64hf32>;
+}
def ATOMIC_LOADW_NRi : AtomicLoadWBinaryReg<z_atomic_loadw_nand>;
def ATOMIC_LOADW_NILHi : AtomicLoadWBinaryImm<z_atomic_loadw_nand,
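
Two related changes meet in this hunk: the new LAA/LAN/LAO/LAX definitions are only selected when interlocked-access facility 1 is present, while the old compare-and-swap-based ATOMIC_LOAD_* pseudos are kept behind FeatureNoInterlockedAccess1; the Serialize alias gives the new z_serialize node an instruction to lower to, which the fast-serialization feature bit is presumably meant to make cheaper. A small, hedged illustration of the source-level operations involved (standard C++, invented names, not backend code):

#include <atomic>

// On a CPU with interlocked-access facility 1 this fetch_and can use LAN;
// without it, the backend falls back to the ATOMIC_LOAD_N* pseudo
// expansion (a compare-and-swap loop).
unsigned clear_flags(std::atomic<unsigned> &Word, unsigned Mask) {
  return Word.fetch_and(~Mask, std::memory_order_seq_cst);
}

// A full fence is the kind of operation the new z_serialize node models.
void fence_example() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
}
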
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
index ba027d4..1b88d06 100644
--- a/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -71,99 +71,99 @@ using namespace llvm;
STATISTIC(LongBranches, "Number of long branches.");
namespace {
- // Represents positional information about a basic block.
- struct MBBInfo {
- // The address that we currently assume the block has.
- uint64_t Address;
-
- // The size of the block in bytes, excluding terminators.
- // This value never changes.
- uint64_t Size;
-
- // The minimum alignment of the block, as a log2 value.
- // This value never changes.
- unsigned Alignment;
-
- // The number of terminators in this block. This value never changes.
- unsigned NumTerminators;
-
- MBBInfo()
- : Address(0), Size(0), Alignment(0), NumTerminators(0) {}
- };
-
- // Represents the state of a block terminator.
- struct TerminatorInfo {
- // If this terminator is a relaxable branch, this points to the branch
- // instruction, otherwise it is null.
- MachineInstr *Branch;
-
- // The address that we currently assume the terminator has.
- uint64_t Address;
-
- // The current size of the terminator in bytes.
- uint64_t Size;
-
- // If Branch is nonnull, this is the number of the target block,
- // otherwise it is unused.
- unsigned TargetBlock;
-
- // If Branch is nonnull, this is the length of the longest relaxed form,
- // otherwise it is zero.
- unsigned ExtraRelaxSize;
-
- TerminatorInfo() : Branch(0), Size(0), TargetBlock(0), ExtraRelaxSize(0) {}
- };
-
- // Used to keep track of the current position while iterating over the blocks.
- struct BlockPosition {
- // The address that we assume this position has.
- uint64_t Address;
-
- // The number of low bits in Address that are known to be the same
- // as the runtime address.
- unsigned KnownBits;
-
- BlockPosition(unsigned InitialAlignment)
- : Address(0), KnownBits(InitialAlignment) {}
- };
-
- class SystemZLongBranch : public MachineFunctionPass {
- public:
- static char ID;
- SystemZLongBranch(const SystemZTargetMachine &tm)
- : MachineFunctionPass(ID), TII(0) {}
-
- virtual const char *getPassName() const {
- return "SystemZ Long Branch";
- }
+// Represents positional information about a basic block.
+struct MBBInfo {
+ // The address that we currently assume the block has.
+ uint64_t Address;
+
+ // The size of the block in bytes, excluding terminators.
+ // This value never changes.
+ uint64_t Size;
+
+ // The minimum alignment of the block, as a log2 value.
+ // This value never changes.
+ unsigned Alignment;
+
+ // The number of terminators in this block. This value never changes.
+ unsigned NumTerminators;
+
+ MBBInfo()
+ : Address(0), Size(0), Alignment(0), NumTerminators(0) {}
+};
+
+// Represents the state of a block terminator.
+struct TerminatorInfo {
+ // If this terminator is a relaxable branch, this points to the branch
+ // instruction, otherwise it is null.
+ MachineInstr *Branch;
+
+ // The address that we currently assume the terminator has.
+ uint64_t Address;
+
+ // The current size of the terminator in bytes.
+ uint64_t Size;
+
+ // If Branch is nonnull, this is the number of the target block,
+ // otherwise it is unused.
+ unsigned TargetBlock;
+
+ // If Branch is nonnull, this is the length of the longest relaxed form,
+ // otherwise it is zero.
+ unsigned ExtraRelaxSize;
+
+ TerminatorInfo() : Branch(0), Size(0), TargetBlock(0), ExtraRelaxSize(0) {}
+};
+
+// Used to keep track of the current position while iterating over the blocks.
+struct BlockPosition {
+ // The address that we assume this position has.
+ uint64_t Address;
+
+ // The number of low bits in Address that are known to be the same
+ // as the runtime address.
+ unsigned KnownBits;
+
+ BlockPosition(unsigned InitialAlignment)
+ : Address(0), KnownBits(InitialAlignment) {}
+};
+
+class SystemZLongBranch : public MachineFunctionPass {
+public:
+ static char ID;
+ SystemZLongBranch(const SystemZTargetMachine &tm)
+ : MachineFunctionPass(ID), TII(0) {}
+
+ const char *getPassName() const override {
+ return "SystemZ Long Branch";
+ }
- bool runOnMachineFunction(MachineFunction &F);
-
- private:
- void skipNonTerminators(BlockPosition &Position, MBBInfo &Block);
- void skipTerminator(BlockPosition &Position, TerminatorInfo &Terminator,
- bool AssumeRelaxed);
- TerminatorInfo describeTerminator(MachineInstr *MI);
- uint64_t initMBBInfo();
- bool mustRelaxBranch(const TerminatorInfo &Terminator, uint64_t Address);
- bool mustRelaxABranch();
- void setWorstCaseAddresses();
- void splitBranchOnCount(MachineInstr *MI, unsigned AddOpcode);
- void splitCompareBranch(MachineInstr *MI, unsigned CompareOpcode);
- void relaxBranch(TerminatorInfo &Terminator);
- void relaxBranches();
-
- const SystemZInstrInfo *TII;
- MachineFunction *MF;
- SmallVector<MBBInfo, 16> MBBs;
- SmallVector<TerminatorInfo, 16> Terminators;
- };
-
- char SystemZLongBranch::ID = 0;
-
- const uint64_t MaxBackwardRange = 0x10000;
- const uint64_t MaxForwardRange = 0xfffe;
-} // end of anonymous namespace
+ bool runOnMachineFunction(MachineFunction &F);
+
+private:
+ void skipNonTerminators(BlockPosition &Position, MBBInfo &Block);
+ void skipTerminator(BlockPosition &Position, TerminatorInfo &Terminator,
+ bool AssumeRelaxed);
+ TerminatorInfo describeTerminator(MachineInstr *MI);
+ uint64_t initMBBInfo();
+ bool mustRelaxBranch(const TerminatorInfo &Terminator, uint64_t Address);
+ bool mustRelaxABranch();
+ void setWorstCaseAddresses();
+ void splitBranchOnCount(MachineInstr *MI, unsigned AddOpcode);
+ void splitCompareBranch(MachineInstr *MI, unsigned CompareOpcode);
+ void relaxBranch(TerminatorInfo &Terminator);
+ void relaxBranches();
+
+ const SystemZInstrInfo *TII;
+ MachineFunction *MF;
+ SmallVector<MBBInfo, 16> MBBs;
+ SmallVector<TerminatorInfo, 16> Terminators;
+};
+
+char SystemZLongBranch::ID = 0;
+
+const uint64_t MaxBackwardRange = 0x10000;
+const uint64_t MaxForwardRange = 0xfffe;
+} // end anonymous namespace
FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) {
return new SystemZLongBranch(TM);
@@ -321,9 +321,8 @@ bool SystemZLongBranch::mustRelaxBranch(const TerminatorInfo &Terminator,
// Return true if, under current assumptions, any terminator needs
// to be relaxed.
bool SystemZLongBranch::mustRelaxABranch() {
- for (SmallVectorImpl<TerminatorInfo>::iterator TI = Terminators.begin(),
- TE = Terminators.end(); TI != TE; ++TI)
- if (mustRelaxBranch(*TI, TI->Address))
+ for (auto &Terminator : Terminators)
+ if (mustRelaxBranch(Terminator, Terminator.Address))
return true;
return false;
}
@@ -333,10 +332,9 @@ bool SystemZLongBranch::mustRelaxABranch() {
void SystemZLongBranch::setWorstCaseAddresses() {
SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
BlockPosition Position(MF->getAlignment());
- for (SmallVectorImpl<MBBInfo>::iterator BI = MBBs.begin(), BE = MBBs.end();
- BI != BE; ++BI) {
- skipNonTerminators(Position, *BI);
- for (unsigned BTI = 0, BTE = BI->NumTerminators; BTI != BTE; ++BTI) {
+ for (auto &Block : MBBs) {
+ skipNonTerminators(Position, Block);
+ for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) {
skipTerminator(Position, *TI, true);
++TI;
}
@@ -435,10 +433,9 @@ void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) {
void SystemZLongBranch::relaxBranches() {
SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
BlockPosition Position(MF->getAlignment());
- for (SmallVectorImpl<MBBInfo>::iterator BI = MBBs.begin(), BE = MBBs.end();
- BI != BE; ++BI) {
- skipNonTerminators(Position, *BI);
- for (unsigned BTI = 0, BTE = BI->NumTerminators; BTI != BTE; ++BTI) {
+ for (auto &Block : MBBs) {
+ skipNonTerminators(Position, Block);
+ for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) {
assert(Position.Address <= TI->Address &&
"Addresses shouldn't go forwards");
if (mustRelaxBranch(*TI, Position.Address))
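
The loop rewrites in this pass (and in several files below) follow the same pattern: explicit iterator-pair loops over SmallVector become C++11 range-based for loops. A self-contained sketch of the before/after shape, with invented types:

#include <vector>

struct TerminatorLike { unsigned long Address = 0; };

bool anyAtAddress(const std::vector<TerminatorLike> &Terminators,
                  unsigned long Address) {
  // Was: for (std::vector<...>::const_iterator TI = Terminators.begin(),
  //           TE = Terminators.end(); TI != TE; ++TI) ...
  for (const auto &T : Terminators)
    if (T.Address == Address)
      return true;
  return false;
}
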
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.cpp b/lib/Target/SystemZ/SystemZMCInstLower.cpp
index ff9a6c0..df561e2 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.cpp
+++ b/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -9,9 +9,9 @@
#include "SystemZMCInstLower.h"
#include "SystemZAsmPrinter.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Target/Mangler.h"
using namespace llvm;
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.h b/lib/Target/SystemZ/SystemZMCInstLower.h
index f6d5ac8..90447ff 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.h
+++ b/lib/Target/SystemZ/SystemZMCInstLower.h
@@ -11,8 +11,8 @@
#define LLVM_SYSTEMZMCINSTLOWER_H
#include "llvm/MC/MCExpr.h"
-#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
namespace llvm {
class MCInst;
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
index 845291f..50865f1 100644
--- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -63,6 +63,6 @@ public:
void setManipulatesSP(bool MSP) { ManipulatesSP = MSP; }
};
-} // end llvm namespace
+} // end namespace llvm
#endif
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 31cabaa..a391961 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -103,6 +103,7 @@ def z_sibcall : SDNode<"SystemZISD::SIBCALL", SDT_ZCall,
def z_pcrel_wrapper : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
def z_pcrel_offset : SDNode<"SystemZISD::PCREL_OFFSET",
SDT_ZWrapOffset, []>;
+def z_iabs : SDNode<"SystemZISD::IABS", SDTIntUnaryOp, []>;
def z_icmp : SDNode<"SystemZISD::ICMP", SDT_ZICmp, [SDNPOutGlue]>;
def z_fcmp : SDNode<"SystemZISD::FCMP", SDT_ZCmp, [SDNPOutGlue]>;
def z_tm : SDNode<"SystemZISD::TM", SDT_ZICmp, [SDNPOutGlue]>;
@@ -119,6 +120,9 @@ def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>;
def z_udivrem32 : SDNode<"SystemZISD::UDIVREM32", SDT_ZGR128Binary32>;
def z_udivrem64 : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>;
+def z_serialize : SDNode<"SystemZISD::SERIALIZE", SDTNone,
+ [SDNPHasChain, SDNPMayStore]>;
+
class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
: SDNode<"SystemZISD::"##name, profile,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
@@ -247,7 +251,7 @@ def anyextloadi32 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
// Aligned loads.
class AlignedLoad<SDPatternOperator load>
: PatFrag<(ops node:$addr), (load node:$addr), [{
- LoadSDNode *Load = cast<LoadSDNode>(N);
+ auto *Load = cast<LoadSDNode>(N);
return Load->getAlignment() >= Load->getMemoryVT().getStoreSize();
}]>;
def aligned_load : AlignedLoad<load>;
@@ -259,7 +263,7 @@ def aligned_azextloadi32 : AlignedLoad<azextloadi32>;
// Aligned stores.
class AlignedStore<SDPatternOperator store>
: PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
- StoreSDNode *Store = cast<StoreSDNode>(N);
+ auto *Store = cast<StoreSDNode>(N);
return Store->getAlignment() >= Store->getMemoryVT().getStoreSize();
}]>;
def aligned_store : AlignedStore<store>;
@@ -270,7 +274,7 @@ def aligned_truncstorei32 : AlignedStore<truncstorei32>;
// location multiple times.
class NonvolatileLoad<SDPatternOperator load>
: PatFrag<(ops node:$addr), (load node:$addr), [{
- LoadSDNode *Load = cast<LoadSDNode>(N);
+ auto *Load = cast<LoadSDNode>(N);
return !Load->isVolatile();
}]>;
def nonvolatile_load : NonvolatileLoad<load>;
@@ -281,7 +285,7 @@ def nonvolatile_anyextloadi32 : NonvolatileLoad<anyextloadi32>;
// Non-volatile stores.
class NonvolatileStore<SDPatternOperator store>
: PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
- StoreSDNode *Store = cast<StoreSDNode>(N);
+ auto *Store = cast<StoreSDNode>(N);
return !Store->isVolatile();
}]>;
def nonvolatile_store : NonvolatileStore<store>;
@@ -346,6 +350,9 @@ def or_as_revinserti8 : PatFrag<(ops node:$src1, node:$src2),
APInt::getLowBitsSet(BitWidth, 8));
}]>;
+// Negative integer absolute.
+def z_inegabs : PatFrag<(ops node:$src), (ineg (z_iabs node:$src))>;
+
// Integer absolute, matching the canonical form generated by DAGCombiner.
def z_iabs32 : PatFrag<(ops node:$src),
(xor (add node:$src, (sra node:$src, (i32 31))),
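
The z_iabs32 fragment above matches the canonical branchless absolute-value form that DAGCombiner produces, and the new z_inegabs simply wraps z_iabs in a negation so LNR/LNGR can be selected. A hedged, standalone restatement of that canonical form in plain C++:

#include <cstdint>

// |x| written the way DAGCombiner canonicalizes it:
//   sign = x >> 31   (arithmetic shift: all ones when x is negative)
//   |x|  = (x + sign) ^ sign
// The add is done in unsigned arithmetic so the wrap-around stays defined.
int32_t abs32_canonical(int32_t x) {
  int32_t sign = x >> 31;
  uint32_t t = static_cast<uint32_t>(x) + static_cast<uint32_t>(sign);
  return static_cast<int32_t>(t ^ static_cast<uint32_t>(sign));
}
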
diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td
index 7706351..c0f94ec 100644
--- a/lib/Target/SystemZ/SystemZPatterns.td
+++ b/lib/Target/SystemZ/SystemZPatterns.td
@@ -148,5 +148,8 @@ multiclass BlockLoadStore<SDPatternOperator load, ValueType vt,
// Record that INSN is a LOAD AND TEST that can be used to compare
// registers in CLS against zero. The instruction has separate R1 and R2
// operands, but they must be the same when the instruction is used like this.
-class CompareZeroFP<Instruction insn, RegisterOperand cls>
- : Pat<(z_fcmp cls:$reg, (fpimm0)), (insn cls:$reg, cls:$reg)>;
+multiclass CompareZeroFP<Instruction insn, RegisterOperand cls> {
+ def : Pat<(z_fcmp cls:$reg, (fpimm0)), (insn cls:$reg, cls:$reg)>;
+ // The sign of the zero makes no difference.
+ def : Pat<(z_fcmp cls:$reg, (fpimmneg0)), (insn cls:$reg, cls:$reg)>;
+}
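
Turning CompareZeroFP into a multiclass adds a second pattern for negative zero: an ordered comparison against 0.0 and against -0.0 gives the same result for every input, so both constants can use the same load-and-test instruction. A tiny standalone check of that property:

#include <cassert>

bool isZeroPos(double X) { return X == 0.0; }
bool isZeroNeg(double X) { return X == -0.0; }  // identical results: +0.0 == -0.0

int main() {
  assert(isZeroPos(-0.0) && isZeroNeg(0.0));
  assert(!isZeroPos(1.0) && !isZeroNeg(-1.0));
  return 0;
}
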
diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td
index f241fb0..e6b58f1 100644
--- a/lib/Target/SystemZ/SystemZProcessors.td
+++ b/lib/Target/SystemZ/SystemZProcessors.td
@@ -16,6 +16,9 @@ class SystemZFeature<string extname, string intname, string desc>
AssemblerPredicate<"Feature"##intname, extname>,
SubtargetFeature<extname, "Has"##intname, "true", desc>;
+class SystemZMissingFeature<string intname>
+ : Predicate<"!Subtarget.has"##intname##"()">;
+
def FeatureDistinctOps : SystemZFeature<
"distinct-ops", "DistinctOps",
"Assume that the distinct-operands facility is installed"
@@ -36,11 +39,24 @@ def FeatureFPExtension : SystemZFeature<
"Assume that the floating-point extension facility is installed"
>;
+def FeatureFastSerialization : SystemZFeature<
+ "fast-serialization", "FastSerialization",
+ "Assume that the fast-serialization facility is installed"
+>;
+
+def FeatureInterlockedAccess1 : SystemZFeature<
+ "interlocked-access1", "InterlockedAccess1",
+ "Assume that interlocked-access facility 1 is installed"
+>;
+def FeatureNoInterlockedAccess1 : SystemZMissingFeature<"InterlockedAccess1">;
+
def : Processor<"generic", NoItineraries, []>;
def : Processor<"z10", NoItineraries, []>;
def : Processor<"z196", NoItineraries,
[FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
- FeatureFPExtension]>;
+ FeatureFPExtension, FeatureFastSerialization,
+ FeatureInterlockedAccess1]>;
def : Processor<"zEC12", NoItineraries,
[FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
- FeatureFPExtension]>;
+ FeatureFPExtension, FeatureFastSerialization,
+ FeatureInterlockedAccess1]>;
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index b61ae88..1ac4e32 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -63,8 +63,7 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MachineBasicBlock &MBB = *MI->getParent();
MachineFunction &MF = *MBB.getParent();
- const SystemZInstrInfo &TII =
- *static_cast<const SystemZInstrInfo*>(TM.getInstrInfo());
+ auto *TII = static_cast<const SystemZInstrInfo*>(TM.getInstrInfo());
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
DebugLoc DL = MI->getDebugLoc();
@@ -84,7 +83,7 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// See if the offset is in range, or if an equivalent instruction that
// accepts the offset exists.
unsigned Opcode = MI->getOpcode();
- unsigned OpcodeForOffset = TII.getOpcodeForOffset(Opcode, Offset);
+ unsigned OpcodeForOffset = TII->getOpcodeForOffset(Opcode, Offset);
if (OpcodeForOffset)
MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
else {
@@ -94,7 +93,7 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int64_t Mask = 0xffff;
do {
Offset = OldOffset & Mask;
- OpcodeForOffset = TII.getOpcodeForOffset(Opcode, Offset);
+ OpcodeForOffset = TII->getOpcodeForOffset(Opcode, Offset);
Mask >>= 1;
assert(Mask && "One offset must be OK");
} while (!OpcodeForOffset);
@@ -107,21 +106,21 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
&& MI->getOperand(FIOperandNum + 2).getReg() == 0) {
// Load the offset into the scratch register and use it as an index.
// The scratch register then dies here.
- TII.loadImmediate(MBB, MI, ScratchReg, HighOffset);
+ TII->loadImmediate(MBB, MI, ScratchReg, HighOffset);
MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
MI->getOperand(FIOperandNum + 2).ChangeToRegister(ScratchReg,
false, false, true);
} else {
// Load the anchor address into a scratch register.
- unsigned LAOpcode = TII.getOpcodeForOffset(SystemZ::LA, HighOffset);
+ unsigned LAOpcode = TII->getOpcodeForOffset(SystemZ::LA, HighOffset);
if (LAOpcode)
- BuildMI(MBB, MI, DL, TII.get(LAOpcode),ScratchReg)
+ BuildMI(MBB, MI, DL, TII->get(LAOpcode),ScratchReg)
.addReg(BasePtr).addImm(HighOffset).addReg(0);
else {
// Load the high offset into the scratch register and use it as
// an index.
- TII.loadImmediate(MBB, MI, ScratchReg, HighOffset);
- BuildMI(MBB, MI, DL, TII.get(SystemZ::AGR),ScratchReg)
+ TII->loadImmediate(MBB, MI, ScratchReg, HighOffset);
+ BuildMI(MBB, MI, DL, TII->get(SystemZ::AGR),ScratchReg)
.addReg(ScratchReg, RegState::Kill).addReg(BasePtr);
}
@@ -130,7 +129,7 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
false, false, true);
}
}
- MI->setDesc(TII.get(OpcodeForOffset));
+ MI->setDesc(TII->get(OpcodeForOffset));
MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
}
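
When a frame-index displacement does not fit any form of the instruction, the code above splits it: a shrinking mask finds the largest low part that some opcode accepts, and the remaining high part is materialized through a scratch register (via LA/LAY or a load-immediate plus AGR). A standalone, hedged restatement of just the splitting step, with an invented stand-in for getOpcodeForOffset:

#include <cassert>
#include <cstdint>

// Stand-in for TII->getOpcodeForOffset(): pretend every instruction form
// accepts an unsigned 12-bit displacement only.
static bool offsetIsEncodable(int64_t Offset) {
  return Offset >= 0 && Offset < 4096;
}

// Split OldOffset into an encodable low part and a remainder for the
// scratch register, mirroring the mask loop in eliminateFrameIndex above.
void splitOffset(int64_t OldOffset, int64_t &Low, int64_t &High) {
  int64_t Mask = 0xffff;
  do {
    Low = OldOffset & Mask;
    Mask >>= 1;
    assert(Mask && "One offset must be OK");
  } while (!offsetIsEncodable(Low));
  High = OldOffset - Low;
}
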
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index 13f45fa..4ad8048 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -19,15 +19,15 @@
namespace llvm {
namespace SystemZ {
- // Return the subreg to use for referring to the even and odd registers
- // in a GR128 pair. Is32Bit says whether we want a GR32 or GR64.
- inline unsigned even128(bool Is32bit) {
- return Is32bit ? subreg_hl32 : subreg_h64;
- }
- inline unsigned odd128(bool Is32bit) {
- return Is32bit ? subreg_l32 : subreg_l64;
- }
+// Return the subreg to use for referring to the even and odd registers
+// in a GR128 pair. Is32Bit says whether we want a GR32 or GR64.
+inline unsigned even128(bool Is32bit) {
+ return Is32bit ? subreg_hl32 : subreg_h64;
+}
+inline unsigned odd128(bool Is32bit) {
+ return Is32bit ? subreg_l32 : subreg_l64;
}
+} // end namespace SystemZ
class SystemZSubtarget;
class SystemZInstrInfo;
@@ -40,27 +40,22 @@ public:
SystemZRegisterInfo(SystemZTargetMachine &tm);
// Override TargetRegisterInfo.h.
- virtual bool requiresRegisterScavenging(const MachineFunction &MF) const
- LLVM_OVERRIDE {
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override {
return true;
}
- virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const
- LLVM_OVERRIDE {
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
return true;
}
- virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const
- LLVM_OVERRIDE {
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
return true;
}
- virtual const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0)
- const LLVM_OVERRIDE;
- virtual BitVector getReservedRegs(const MachineFunction &MF)
- const LLVM_OVERRIDE;
- virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI,
- int SPAdj, unsigned FIOperandNum,
- RegScavenger *RS) const LLVM_OVERRIDE;
- virtual unsigned getFrameRegister(const MachineFunction &MF) const
- LLVM_OVERRIDE;
+ const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const
+ override;
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const override;
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index c7ebb5d..7635bdc 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -62,7 +62,7 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
if (IsVolatile)
return SDValue();
- if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size))
+ if (auto *CSize = dyn_cast<ConstantSDNode>(Size))
return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
Chain, Dst, Src, CSize->getZExtValue());
return SDValue();
@@ -93,11 +93,11 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
if (IsVolatile)
return SDValue();
- if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
+ if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
uint64_t Bytes = CSize->getZExtValue();
if (Bytes == 0)
return SDValue();
- if (ConstantSDNode *CByte = dyn_cast<ConstantSDNode>(Byte)) {
+ if (auto *CByte = dyn_cast<ConstantSDNode>(Byte)) {
// Handle cases that can be done using at most two of
// MVI, MVHI, MVHHI and MVGHI. The latter two can only be
      // used if ByteVal is all zeros or all ones; in other cases,
@@ -137,7 +137,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
// Handle the special case of a memset of 0, which can use XC.
- ConstantSDNode *CByte = dyn_cast<ConstantSDNode>(Byte);
+ auto *CByte = dyn_cast<ConstantSDNode>(Byte);
if (CByte && CByte->getZExtValue() == 0)
return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
Chain, Dst, Dst, Bytes);
@@ -194,7 +194,7 @@ EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
SDValue Src1, SDValue Src2, SDValue Size,
MachinePointerInfo Op1PtrInfo,
MachinePointerInfo Op2PtrInfo) const {
- if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
+ if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
uint64_t Bytes = CSize->getZExtValue();
assert(Bytes > 0 && "Caller should have handled 0-size case");
Chain = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index 281d1e2..79e7fab 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -25,56 +25,53 @@ public:
explicit SystemZSelectionDAGInfo(const SystemZTargetMachine &TM);
~SystemZSelectionDAGInfo();
- virtual
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
bool IsVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const
- LLVM_OVERRIDE;
+ MachinePointerInfo SrcPtrInfo) const override;
- virtual SDValue
- EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL,
- SDValue Chain, SDValue Dst, SDValue Byte,
- SDValue Size, unsigned Align, bool IsVolatile,
- MachinePointerInfo DstPtrInfo) const LLVM_OVERRIDE;
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL,
+ SDValue Chain, SDValue Dst, SDValue Byte,
+ SDValue Size, unsigned Align, bool IsVolatile,
+ MachinePointerInfo DstPtrInfo) const override;
- virtual std::pair<SDValue, SDValue>
+ std::pair<SDValue, SDValue>
EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
SDValue Src1, SDValue Src2, SDValue Size,
MachinePointerInfo Op1PtrInfo,
- MachinePointerInfo Op2PtrInfo) const LLVM_OVERRIDE;
+ MachinePointerInfo Op2PtrInfo) const override;
- virtual std::pair<SDValue, SDValue>
+ std::pair<SDValue, SDValue>
EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
SDValue Src, SDValue Char, SDValue Length,
- MachinePointerInfo SrcPtrInfo) const LLVM_OVERRIDE;
+ MachinePointerInfo SrcPtrInfo) const override;
- virtual std::pair<SDValue, SDValue>
+ std::pair<SDValue, SDValue>
EmitTargetCodeForStrcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
SDValue Dest, SDValue Src,
MachinePointerInfo DestPtrInfo,
MachinePointerInfo SrcPtrInfo,
- bool isStpcpy) const LLVM_OVERRIDE;
+ bool isStpcpy) const override;
- virtual std::pair<SDValue, SDValue>
+ std::pair<SDValue, SDValue>
EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
SDValue Src1, SDValue Src2,
MachinePointerInfo Op1PtrInfo,
- MachinePointerInfo Op2PtrInfo) const LLVM_OVERRIDE;
+ MachinePointerInfo Op2PtrInfo) const override;
- virtual std::pair<SDValue, SDValue>
+ std::pair<SDValue, SDValue>
EmitTargetCodeForStrlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
- SDValue Src, MachinePointerInfo SrcPtrInfo) const
- LLVM_OVERRIDE;
+ SDValue Src,
+ MachinePointerInfo SrcPtrInfo) const override;
- virtual std::pair<SDValue, SDValue>
+ std::pair<SDValue, SDValue>
EmitTargetCodeForStrnlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
SDValue Src, SDValue MaxLength,
- MachinePointerInfo SrcPtrInfo) const LLVM_OVERRIDE;
+ MachinePointerInfo SrcPtrInfo) const override;
};
-}
+} // end namespace llvm
#endif
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
index 537a545..9350779 100644
--- a/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -21,32 +21,32 @@
using namespace llvm;
namespace {
- class SystemZShortenInst : public MachineFunctionPass {
- public:
- static char ID;
- SystemZShortenInst(const SystemZTargetMachine &tm);
+class SystemZShortenInst : public MachineFunctionPass {
+public:
+ static char ID;
+ SystemZShortenInst(const SystemZTargetMachine &tm);
- virtual const char *getPassName() const {
- return "SystemZ Instruction Shortening";
- }
+ const char *getPassName() const override {
+ return "SystemZ Instruction Shortening";
+ }
- bool processBlock(MachineBasicBlock *MBB);
- bool runOnMachineFunction(MachineFunction &F);
+ bool processBlock(MachineBasicBlock &MBB);
+ bool runOnMachineFunction(MachineFunction &F);
- private:
- bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther,
- unsigned LLIxL, unsigned LLIxH);
+private:
+ bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther,
+ unsigned LLIxL, unsigned LLIxH);
- const SystemZInstrInfo *TII;
+ const SystemZInstrInfo *TII;
- // LowGPRs[I] has bit N set if LLVM register I includes the low
- // word of GPR N. HighGPRs is the same for the high word.
- unsigned LowGPRs[SystemZ::NUM_TARGET_REGS];
- unsigned HighGPRs[SystemZ::NUM_TARGET_REGS];
- };
+ // LowGPRs[I] has bit N set if LLVM register I includes the low
+ // word of GPR N. HighGPRs is the same for the high word.
+ unsigned LowGPRs[SystemZ::NUM_TARGET_REGS];
+ unsigned HighGPRs[SystemZ::NUM_TARGET_REGS];
+};
- char SystemZShortenInst::ID = 0;
-} // end of anonymous namespace
+char SystemZShortenInst::ID = 0;
+} // end anonymous namespace
FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) {
return new SystemZShortenInst(TM);
@@ -98,16 +98,15 @@ bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned *GPRMap,
}
// Process all instructions in MBB. Return true if something changed.
-bool SystemZShortenInst::processBlock(MachineBasicBlock *MBB) {
+bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
// Work out which words are live on exit from the block.
unsigned LiveLow = 0;
unsigned LiveHigh = 0;
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end(); SI != SE; ++SI) {
- for (MachineBasicBlock::livein_iterator LI = (*SI)->livein_begin(),
- LE = (*SI)->livein_end(); LI != LE; ++LI) {
+ for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) {
+ for (auto LI = (*SI)->livein_begin(), LE = (*SI)->livein_end();
+ LI != LE; ++LI) {
unsigned Reg = *LI;
assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
LiveLow |= LowGPRs[Reg];
@@ -116,8 +115,7 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock *MBB) {
}
// Iterate backwards through the block looking for instructions to change.
- for (MachineBasicBlock::reverse_iterator MBBI = MBB->rbegin(),
- MBBE = MBB->rend(); MBBI != MBBE; ++MBBI) {
+ for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
if (Opcode == SystemZ::IILF)
@@ -128,8 +126,8 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock *MBB) {
SystemZ::LLIHH);
unsigned UsedLow = 0;
unsigned UsedHigh = 0;
- for (MachineInstr::mop_iterator MOI = MI.operands_begin(),
- MOE = MI.operands_end(); MOI != MOE; ++MOI) {
+ for (auto MOI = MI.operands_begin(), MOE = MI.operands_end();
+ MOI != MOE; ++MOI) {
MachineOperand &MO = *MOI;
if (MO.isReg()) {
if (unsigned Reg = MO.getReg()) {
@@ -155,9 +153,8 @@ bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) {
TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
bool Changed = false;
- for (MachineFunction::iterator MFI = F.begin(), MFE = F.end();
- MFI != MFE; ++MFI)
- Changed |= processBlock(MFI);
+ for (auto &MBB : F)
+ Changed |= processBlock(MBB);
return Changed;
}
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index 3971d5e..33d7e06 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -8,9 +8,9 @@
//===----------------------------------------------------------------------===//
#include "SystemZSubtarget.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/Host.h"
-#include "MCTargetDesc/SystemZMCTargetDesc.h"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
@@ -26,6 +26,7 @@ SystemZSubtarget::SystemZSubtarget(const std::string &TT,
const std::string &FS)
: SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
+ HasFastSerialization(false), HasInterlockedAccess1(false),
TargetTriple(TT) {
std::string CPUName = CPU;
if (CPUName.empty())
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index 5817491..ffca2d8 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -32,6 +32,8 @@ protected:
bool HasLoadStoreOnCond;
bool HasHighWord;
bool HasFPExtension;
+ bool HasFastSerialization;
+ bool HasInterlockedAccess1;
private:
Triple TargetTriple;
@@ -41,7 +43,7 @@ public:
const std::string &FS);
// This is important for reducing register pressure in vector code.
- virtual bool useAA() const LLVM_OVERRIDE { return true; }
+ bool useAA() const override { return true; }
// Automatically generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -58,6 +60,12 @@ public:
// Return true if the target has the floating-point extension facility.
bool hasFPExtension() const { return HasFPExtension; }
+ // Return true if the target has the fast-serialization facility.
+ bool hasFastSerialization() const { return HasFastSerialization; }
+
+ // Return true if the target has interlocked-access facility 1.
+ bool hasInterlockedAccess1() const { return HasInterlockedAccess1; }
+
// Return true if GV can be accessed using LARL for reloc model RM
// and code model CM.
bool isPC32DBLSymbol(const GlobalValue *GV, Reloc::Model RM,
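
The two new accessors mirror the existing facility flags, so lowering code can ask the subtarget which sequence to emit instead of re-checking the CPU model. A hedged usage sketch with a stand-in type (the real class is the SystemZSubtarget modified above):

struct SubtargetFlags {
  bool HasInterlockedAccess1 = false;
  bool HasFastSerialization = false;
  bool hasInterlockedAccess1() const { return HasInterlockedAccess1; }
  bool hasFastSerialization() const { return HasFastSerialization; }
};

const char *atomicAddStrategy(const SubtargetFlags &ST) {
  // With interlocked-access facility 1 a single load-and-add suffices;
  // otherwise an explicit compare-and-swap loop is needed.
  return ST.hasInterlockedAccess1() ? "laa" : "cs loop";
}
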
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index dee92e9..4c9ce29 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -30,8 +30,7 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, StringRef TT,
// Make sure that global data has at least 16 bits of alignment by default,
// so that we can refer to it using LARL. We don't have any special
// requirements for stack variables though.
- DL("E-p:64:64:64-i1:8:16-i8:8:16-i16:16-i32:32-i64:64"
- "-f32:32-f64:64-f128:64-a0:8:16-n32:64"),
+ DL("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"),
InstrInfo(*this), TLInfo(*this), TSInfo(*this),
FrameLowering(*this, Subtarget) {
initAsmInfo();
@@ -48,10 +47,10 @@ public:
return getTM<SystemZTargetMachine>();
}
- virtual void addIRPasses() LLVM_OVERRIDE;
- virtual bool addInstSelector() LLVM_OVERRIDE;
- virtual bool addPreSched2() LLVM_OVERRIDE;
- virtual bool addPreEmitPass() LLVM_OVERRIDE;
+ void addIRPasses() override;
+ bool addInstSelector() override;
+ bool addPreSched2() override;
+ bool addPreEmitPass() override;
};
} // end anonymous namespace
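
The module DataLayout string above is rewritten in the newer, more compact syntax: defaults are dropped and an explicit mangling component is added. Broken out field by field (the string itself is copied from the hunk; the comments restate documented datalayout syntax):

#include <string>

const std::string SystemZDataLayout =
    "E"          // big-endian
    "-m:e"       // ELF-style symbol mangling
    "-i1:8:16"   // i1: 8-bit ABI alignment, 16-bit preferred
    "-i8:8:16"   // i8: likewise
    "-i64:64"    // i64 is naturally aligned
    "-f128:64"   // fp128 (long double) only requires 8-byte alignment
    "-a:8:16"    // aggregates: 8-bit ABI, 16-bit preferred alignment
    "-n32:64";   // native integer widths are 32 and 64 bits
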
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index a99a98e..1db717b 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -19,8 +19,8 @@
#include "SystemZISelLowering.h"
#include "SystemZInstrInfo.h"
#include "SystemZRegisterInfo.h"
-#include "SystemZSubtarget.h"
#include "SystemZSelectionDAGInfo.h"
+#include "SystemZSubtarget.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -42,31 +42,30 @@ public:
CodeGenOpt::Level OL);
// Override TargetMachine.
- virtual const TargetFrameLowering *getFrameLowering() const LLVM_OVERRIDE {
+ const TargetFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
- virtual const SystemZInstrInfo *getInstrInfo() const LLVM_OVERRIDE {
+ const SystemZInstrInfo *getInstrInfo() const override {
return &InstrInfo;
}
- virtual const SystemZSubtarget *getSubtargetImpl() const LLVM_OVERRIDE {
+ const SystemZSubtarget *getSubtargetImpl() const override {
return &Subtarget;
}
- virtual const DataLayout *getDataLayout() const LLVM_OVERRIDE {
+ const DataLayout *getDataLayout() const override {
return &DL;
}
- virtual const SystemZRegisterInfo *getRegisterInfo() const LLVM_OVERRIDE {
+ const SystemZRegisterInfo *getRegisterInfo() const override {
return &InstrInfo.getRegisterInfo();
}
- virtual const SystemZTargetLowering *getTargetLowering() const LLVM_OVERRIDE {
+ const SystemZTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const
- LLVM_OVERRIDE {
+ const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
// Override LLVMTargetMachine
- virtual TargetPassConfig *createPassConfig(PassManagerBase &PM) LLVM_OVERRIDE;
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/TargetInfo/CMakeLists.txt b/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
index b6051d3..1ebc669 100644
--- a/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
+++ b/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMSystemZInfo
SystemZTargetInfo.cpp
)
-
-add_dependencies(LLVMSystemZInfo SystemZCommonTableGen)
diff --git a/lib/Target/SystemZ/TargetInfo/LLVMBuild.txt b/lib/Target/SystemZ/TargetInfo/LLVMBuild.txt
index ea43736..a5547e6 100644
--- a/lib/Target/SystemZ/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/SystemZ/TargetInfo/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = SystemZInfo
parent = SystemZ
-required_libraries = MC Support Target
+required_libraries = Support
add_to_library_groups = SystemZ
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index 2190198..627786d 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp
@@ -42,7 +42,7 @@ inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfo *P) {
}
void llvm::initializeTarget(PassRegistry &Registry) {
- initializeDataLayoutPass(Registry);
+ initializeDataLayoutPassPass(Registry);
initializeTargetLibraryInfoPass(Registry);
}
@@ -55,7 +55,9 @@ LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep) {
}
void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) {
- unwrap(PM)->add(new DataLayout(*unwrap(TD)));
+  // The DataLayoutPass must now be in sync with the module. Unfortunately we
+  // cannot enforce that from the C API.
+ unwrap(PM)->add(new DataLayoutPass(*unwrap(TD)));
}
void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,
@@ -113,7 +115,7 @@ unsigned LLVMABIAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
}
unsigned LLVMCallFrameAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
- return unwrap(TD)->getCallFrameTypeAlignment(unwrap(Ty));
+ return unwrap(TD)->getABITypeAlignment(unwrap(Ty));
}
unsigned LLVMPreferredAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
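The comment added to LLVMAddTargetData spells out the new invariant: the module carries the authoritative DataLayout, and DataLayoutPass is expected to agree with it. A hedged sketch of the pattern a C++ caller follows to keep the two in sync; addLayout is an illustrative name, not an LLVM API:

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Module.h"
    #include "llvm/PassManager.h"
    using namespace llvm;

    // Illustrative helper: set the layout on the module first, then hand the
    // pass manager a DataLayoutPass describing the same layout, which is the
    // invariant the C binding cannot enforce on its own.
    static void addLayout(Module &M, PassManagerBase &PM, const DataLayout &DL) {
      M.setDataLayout(&DL);
      PM.add(new DataLayoutPass(DL));
    }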
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index 3e68fe1..6ec0b1f 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -48,7 +48,7 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"__isoc99_sscanf",
"__memcpy_chk",
"__sincospi_stret",
- "__sincospi_stretf",
+ "__sincospif_stret",
"__sinpi",
"__sinpif",
"__sqrt_finite",
@@ -140,6 +140,12 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"floor",
"floorf",
"floorl",
+ "fmax",
+ "fmaxf",
+ "fmaxl",
+ "fmin",
+ "fminf",
+ "fminl",
"fmod",
"fmodf",
"fmodl",
@@ -184,6 +190,9 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"isdigit",
"labs",
"lchown",
+ "ldexp",
+ "ldexpf",
+ "ldexpl",
"llabs",
"log",
"log10",
@@ -369,8 +378,17 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
llvm_unreachable("TargetLibraryInfo function names must be sorted");
}
#endif // !NDEBUG
-
- // memset_pattern16 is only available on iOS 3.0 and Mac OS/X 10.5 and later.
+
+  // There are no library implementations of memcpy and memset for r600, and
+  // these can be difficult to lower in the backend.
+ if (T.getArch() == Triple::r600) {
+ TLI.setUnavailable(LibFunc::memcpy);
+ TLI.setUnavailable(LibFunc::memset);
+ TLI.setUnavailable(LibFunc::memset_pattern16);
+ return;
+ }
+
+ // memset_pattern16 is only available on iOS 3.0 and Mac OS X 10.5 and later.
if (T.isMacOSX()) {
if (T.isMacOSXVersionLT(10, 5))
TLI.setUnavailable(LibFunc::memset_pattern16);
@@ -387,7 +405,7 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::cospi);
TLI.setUnavailable(LibFunc::cospif);
TLI.setUnavailable(LibFunc::sincospi_stret);
- TLI.setUnavailable(LibFunc::sincospi_stretf);
+ TLI.setUnavailable(LibFunc::sincospif_stret);
}
if (T.isMacOSX() && T.getArch() == Triple::x86 &&
@@ -408,7 +426,7 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::fiprintf);
}
- if (T.getOS() == Triple::Win32) {
+ if (T.isKnownWindowsMSVCEnvironment()) {
// Win32 does not support long double
TLI.setUnavailable(LibFunc::acosl);
TLI.setUnavailable(LibFunc::asinl);
@@ -422,8 +440,12 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::fabsf); // Win32 and Win64 both lack fabsf
TLI.setUnavailable(LibFunc::fabsl);
TLI.setUnavailable(LibFunc::floorl);
+ TLI.setUnavailable(LibFunc::fmaxl);
+ TLI.setUnavailable(LibFunc::fminl);
TLI.setUnavailable(LibFunc::fmodl);
TLI.setUnavailable(LibFunc::frexpl);
+ TLI.setUnavailable(LibFunc::ldexpf);
+ TLI.setUnavailable(LibFunc::ldexpl);
TLI.setUnavailable(LibFunc::logl);
TLI.setUnavailable(LibFunc::modfl);
TLI.setUnavailable(LibFunc::powl);
@@ -446,9 +468,6 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::cbrt);
TLI.setUnavailable(LibFunc::cbrtf);
TLI.setUnavailable(LibFunc::cbrtl);
- TLI.setUnavailable(LibFunc::exp10);
- TLI.setUnavailable(LibFunc::exp10f);
- TLI.setUnavailable(LibFunc::exp10l);
TLI.setUnavailable(LibFunc::exp2);
TLI.setUnavailable(LibFunc::exp2f);
TLI.setUnavailable(LibFunc::exp2l);
@@ -492,6 +511,8 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::coshf);
TLI.setUnavailable(LibFunc::expf);
TLI.setUnavailable(LibFunc::floorf);
+ TLI.setUnavailable(LibFunc::fminf);
+ TLI.setUnavailable(LibFunc::fmaxf);
TLI.setUnavailable(LibFunc::fmodf);
TLI.setUnavailable(LibFunc::logf);
TLI.setUnavailable(LibFunc::powf);
@@ -567,6 +588,43 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::llabs);
}
+ switch (T.getOS()) {
+ case Triple::MacOSX:
+ // exp10 and exp10f are not available on OS X until 10.9 and iOS until 7.0
+ // and their names are __exp10 and __exp10f. exp10l is not available on
+ // OS X or iOS.
+ TLI.setUnavailable(LibFunc::exp10l);
+ if (T.isMacOSXVersionLT(10, 9)) {
+ TLI.setUnavailable(LibFunc::exp10);
+ TLI.setUnavailable(LibFunc::exp10f);
+ } else {
+ TLI.setAvailableWithName(LibFunc::exp10, "__exp10");
+ TLI.setAvailableWithName(LibFunc::exp10f, "__exp10f");
+ }
+ break;
+ case Triple::IOS:
+ TLI.setUnavailable(LibFunc::exp10l);
+ if (T.isOSVersionLT(7, 0)) {
+ TLI.setUnavailable(LibFunc::exp10);
+ TLI.setUnavailable(LibFunc::exp10f);
+ } else {
+ TLI.setAvailableWithName(LibFunc::exp10, "__exp10");
+ TLI.setAvailableWithName(LibFunc::exp10f, "__exp10f");
+ }
+ break;
+ case Triple::Linux:
+    // exp10, exp10f and exp10l are available on Linux (GLIBC) but were
+    // extremely buggy prior to glibc version 2.18. Until that version is
+    // widely deployed or we have a reasonable detection strategy, we cannot
+    // use exp10 reliably on Linux.
+ //
+ // Fall through to disable all of them.
+ default:
+ TLI.setUnavailable(LibFunc::exp10);
+ TLI.setUnavailable(LibFunc::exp10f);
+ TLI.setUnavailable(LibFunc::exp10l);
+ }
+
// ffsl is available on at least Darwin, Mac OS X, iOS, FreeBSD, and
// Linux (GLIBC):
// http://developer.apple.com/library/mac/#documentation/Darwin/Reference/ManPages/man3/ffsl.3.html
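With the new per-OS switch, exp10 is not just conditionally available, it may also be renamed (__exp10/__exp10f on newer Darwin). A small sketch, assuming only TargetLibraryInfo's existing has()/getName() queries, of how a transform would consume that information before rewriting pow(10, x); canUseExp10 is an illustrative name:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/Triple.h"
    #include "llvm/Target/TargetLibraryInfo.h"
    using namespace llvm;

    // Sketch: decide whether an exp10 call is legal for this triple and, if
    // so, which symbol name to emit (e.g. "__exp10" on OS X 10.9+).
    static bool canUseExp10(const Triple &T, StringRef &CalleeName) {
      TargetLibraryInfo TLI(T);           // runs the initialize() logic above
      if (!TLI.has(LibFunc::exp10))
        return false;                     // pre-10.9 OS X, iOS < 7.0, Linux...
      CalleeName = TLI.getName(LibFunc::exp10);
      return true;
    }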
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 7b8d110..50b1e31 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -18,6 +18,8 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
@@ -25,7 +27,7 @@
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
@@ -40,6 +42,7 @@ using namespace llvm;
void TargetLoweringObjectFile::Initialize(MCContext &ctx,
const TargetMachine &TM) {
Ctx = &ctx;
+ DL = TM.getDataLayout();
InitMCObjectFileInfo(TM.getTargetTriple(),
TM.getRelocationModel(), TM.getCodeModel(), *Ctx);
}
@@ -97,20 +100,22 @@ static bool IsNullTerminatedString(const Constant *C) {
return false;
}
-/// Return the MCSymbol for the specified global value. This
-/// symbol is the main label that is the address of the global.
-MCSymbol *TargetLoweringObjectFile::getSymbol(Mangler &M,
- const GlobalValue *GV) const {
+MCSymbol *TargetLoweringObjectFile::getSymbolWithGlobalValueBase(
+ const GlobalValue *GV, StringRef Suffix, Mangler &Mang,
+ const TargetMachine &TM) const {
+ assert(!Suffix.empty());
+
SmallString<60> NameStr;
- M.getNameWithPrefix(NameStr, GV, false);
+ NameStr += DL->getPrivateGlobalPrefix();
+ TM.getNameWithPrefix(NameStr, GV, Mang);
+ NameStr.append(Suffix.begin(), Suffix.end());
return Ctx->GetOrCreateSymbol(NameStr.str());
}
-
-MCSymbol *TargetLoweringObjectFile::
-getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
- MachineModuleInfo *MMI) const {
- return getSymbol(*Mang, GV);
+MCSymbol *TargetLoweringObjectFile::getCFIPersonalitySymbol(
+ const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
+ MachineModuleInfo *MMI) const {
+ return TM.getSymbol(GV, Mang);
}
void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
@@ -252,7 +257,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
/// the specified global variable or function definition. This should not
/// be passed external (or available externally) globals.
const MCSection *TargetLoweringObjectFile::
-SectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler *Mang,
+SectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
const TargetMachine &TM) const {
// Select section name.
if (GV->hasSection())
@@ -263,12 +268,16 @@ SectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler *Mang,
return SelectSectionForGlobal(GV, Kind, Mang, TM);
}
+bool TargetLoweringObjectFile::isSectionAtomizableBySymbols(
+ const MCSection &Section) const {
+ return false;
+}
// Lame default implementation. Calculate the section name for global.
const MCSection *
TargetLoweringObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
SectionKind Kind,
- Mangler *Mang,
+ Mangler &Mang,
const TargetMachine &TM) const{
assert(!Kind.isThreadLocal() && "Doesn't support TLS");
@@ -298,12 +307,12 @@ TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind) const {
/// getTTypeGlobalReference - Return an MCExpr to use for a
/// reference to the specified global variable from exception
/// handling information.
-const MCExpr *TargetLoweringObjectFile::
-getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
- MachineModuleInfo *MMI, unsigned Encoding,
- MCStreamer &Streamer) const {
+const MCExpr *TargetLoweringObjectFile::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+ const TargetMachine &TM, MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const {
const MCSymbolRefExpr *Ref =
- MCSymbolRefExpr::Create(getSymbol(*Mang, GV), getContext());
+ MCSymbolRefExpr::Create(TM.getSymbol(GV, Mang), getContext());
return getTTypeReference(Ref, Encoding, Streamer);
}
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index cb42e83..fe3c870 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -17,9 +17,14 @@
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/SectionKind.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
//---------------------------------------------------------------------------
@@ -52,9 +57,9 @@ TargetMachine::TargetMachine(const Target &T,
MCRelaxAll(false),
MCNoExecStack(false),
MCSaveTempLabels(false),
- MCUseLoc(true),
MCUseCFI(true),
MCUseDwarfDirectory(false),
+ RequireStructuredCFG(false),
Options(Options) {
}
@@ -67,7 +72,7 @@ TargetMachine::~TargetMachine() {
void TargetMachine::resetTargetOptions(const MachineFunction *MF) const {
const Function *F = MF->getFunction();
TargetOptions &TO = MF->getTarget().Options;
-
+
#define RESET_OPTION(X, Y) \
do { \
if (F->hasFnAttribute(Y)) \
@@ -124,7 +129,7 @@ TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
// If GV is an alias then use the aliasee for determining
// thread-localness.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GV = GA->resolveAliasedGlobal(false);
+ GV = GA->getAliasedGlobal();
const GlobalVariable *Var = cast<GlobalVariable>(GV);
bool isLocal = Var->hasLocalLinkage();
@@ -192,3 +197,28 @@ void TargetMachine::setFunctionSections(bool V) {
void TargetMachine::setDataSections(bool V) {
DataSections = V;
}
+
+void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name,
+ const GlobalValue *GV, Mangler &Mang,
+ bool MayAlwaysUsePrivate) const {
+ if (MayAlwaysUsePrivate || !GV->hasPrivateLinkage()) {
+ // Simple case: If GV is not private, it is not important to find out if
+    // Simple case: if GV is not private, it does not matter whether private
+    // labels are legal in this case or not.
+ return;
+ }
+ SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, *this);
+ const TargetLoweringObjectFile &TLOF =
+ getTargetLowering()->getObjFileLowering();
+ const MCSection *TheSection = TLOF.SectionForGlobal(GV, GVKind, Mang, *this);
+ bool CannotUsePrivateLabel = TLOF.isSectionAtomizableBySymbols(*TheSection);
+ Mang.getNameWithPrefix(Name, GV, CannotUsePrivateLabel);
+}
+
+MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV, Mangler &Mang) const {
+ SmallString<60> NameStr;
+ getNameWithPrefix(NameStr, GV, Mang);
+ const TargetLoweringObjectFile &TLOF =
+ getTargetLowering()->getObjFileLowering();
+ return TLOF.getContext().GetOrCreateSymbol(NameStr.str());
+}
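getNameWithPrefix and getSymbol move the "may this use a private label?" decision into TargetMachine, where the target's object-file lowering (via isSectionAtomizableBySymbols) can be consulted. A sketch of the call an AsmPrinter-style client now makes, assuming it already holds a Mangler; symbolFor is an illustrative name:

    #include "llvm/IR/GlobalValue.h"
    #include "llvm/IR/Mangler.h"
    #include "llvm/MC/MCSymbol.h"
    #include "llvm/Target/TargetMachine.h"
    using namespace llvm;

    // Sketch: resolve the MCSymbol for a global, letting the TargetMachine
    // decide whether a private-linkage name may keep its private prefix.
    static MCSymbol *symbolFor(const TargetMachine &TM, const GlobalValue *GV,
                               Mangler &Mang) {
      return TM.getSymbol(GV, Mang); // getNameWithPrefix + GetOrCreateSymbol
    }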
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index 3d5f827..a2829d4 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -19,9 +19,9 @@
#include "llvm/PassManager.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Host.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdlib>
@@ -212,7 +212,8 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
*ErrorMessage = strdup(error.c_str());
return true;
}
- pass.add(new DataLayout(*td));
+ Mod->setDataLayout(td);
+ pass.add(new DataLayoutPass(Mod));
TargetMachine::CodeGenFileType ft;
switch (codegen) {
@@ -238,7 +239,7 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) {
std::string error;
- raw_fd_ostream dest(Filename, error, sys::fs::F_Binary);
+ raw_fd_ostream dest(Filename, error, sys::fs::F_None);
if (!error.empty()) {
*ErrorMessage = strdup(error.c_str());
return true;
@@ -267,3 +268,7 @@ LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T,
char *LLVMGetDefaultTargetTriple(void) {
return strdup(sys::getDefaultTargetTriple().c_str());
}
+
+void LLVMAddAnalysisPasses(LLVMTargetMachineRef T, LLVMPassManagerRef PM) {
+ unwrap(T)->addAnalysisPasses(*unwrap(PM));
+}
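LLVMAddAnalysisPasses is the new C API entry point for registering a target machine's analyses (its cost-model/TargetTransformInfo passes) with a pass manager. A minimal sketch of a caller, assuming the matching declaration was added to llvm-c/TargetMachine.h in the same change; addTargetAnalyses is an illustrative name:

    #include "llvm-c/Core.h"
    #include "llvm-c/TargetMachine.h"

    // Sketch: make TM's analyses visible to PM before running optimization
    // passes over a module that was built through the C API.
    static void addTargetAnalyses(LLVMTargetMachineRef TM, LLVMPassManagerRef PM) {
      LLVMAddAnalysisPasses(TM, PM); // forwards to TargetMachine::addAnalysisPasses
    }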
diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp
index 10e8db5..df8948f 100644
--- a/lib/Target/TargetSubtargetInfo.cpp
+++ b/lib/Target/TargetSubtargetInfo.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
//---------------------------------------------------------------------------
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk
index c9840c3..73031de 100644
--- a/lib/Target/X86/Android.mk
+++ b/lib/Target/X86/Android.mk
@@ -53,6 +53,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device only
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -68,3 +69,4 @@ include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(LLVM_GEN_INTRINSICS_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/X86/AsmParser/Android.mk b/lib/Target/X86/AsmParser/Android.mk
index d6de437..4235cb1 100644
--- a/lib/Target/X86/AsmParser/Android.mk
+++ b/lib/Target/X86/AsmParser/Android.mk
@@ -6,6 +6,7 @@ include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
x86_asm_parser_SRC_FILES := \
+ X86AsmInstrumentation.cpp \
X86AsmParser.cpp
x86_asm_parser_TBLGEN_TABLES := \
@@ -41,6 +42,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
#===---------------------------------------------------------------===
# libX86AsmParser (target)
#===---------------------------------------------------------------===
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -54,3 +56,4 @@ TBLGEN_TD_DIR := $(x86_asm_parser_TBLGEN_TD_DIR)
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt
index 54204d4..b022a41 100644
--- a/lib/Target/X86/AsmParser/CMakeLists.txt
+++ b/lib/Target/X86/AsmParser/CMakeLists.txt
@@ -1,7 +1,4 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMX86AsmParser
+ X86AsmInstrumentation.cpp
X86AsmParser.cpp
)
-
-add_dependencies(LLVMX86AsmParser X86CommonTableGen)
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
new file mode 100644
index 0000000..db29228
--- /dev/null
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -0,0 +1,236 @@
+//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86AsmInstrumentation.h"
+#include "X86Operand.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+
+namespace llvm {
+namespace {
+
+static cl::opt<bool> ClAsanInstrumentInlineAssembly(
+ "asan-instrument-inline-assembly", cl::desc("instrument inline assembly"),
+ cl::Hidden, cl::init(false));
+
+bool IsStackReg(unsigned Reg) {
+ return Reg == X86::RSP || Reg == X86::ESP || Reg == X86::SP;
+}
+
+std::string FuncName(unsigned AccessSize, bool IsWrite) {
+ return std::string("__sanitizer_sanitize_") + (IsWrite ? "store" : "load") +
+ (utostr(AccessSize));
+}
+
+class X86AddressSanitizer : public X86AsmInstrumentation {
+public:
+ X86AddressSanitizer(MCSubtargetInfo &sti) : STI(sti) {}
+ virtual ~X86AddressSanitizer() {}
+
+ // X86AsmInstrumentation implementation:
+ virtual void InstrumentInstruction(
+ const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ MCContext &Ctx, MCStreamer &Out) override {
+ InstrumentMOV(Inst, Operands, Ctx, Out);
+ }
+
+ // Should be implemented differently in x86_32 and x86_64 subclasses.
+ virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
+ bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) = 0;
+
+ void InstrumentMemOperand(MCParsedAsmOperand *Op, unsigned AccessSize,
+ bool IsWrite, MCContext &Ctx, MCStreamer &Out);
+ void InstrumentMOV(const MCInst &Inst,
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ MCContext &Ctx, MCStreamer &Out);
+ void EmitInstruction(MCStreamer &Out, const MCInst &Inst) {
+ Out.EmitInstruction(Inst, STI);
+ }
+
+protected:
+ MCSubtargetInfo &STI;
+};
+
+void X86AddressSanitizer::InstrumentMemOperand(
+ MCParsedAsmOperand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) {
+ assert(Op && Op->isMem() && "Op should be a memory operand.");
+ assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 &&
+         "AccessSize should be a power of two, less than or equal to 16.");
+
+ X86Operand *MemOp = static_cast<X86Operand *>(Op);
+ // FIXME: get rid of this limitation.
+ if (IsStackReg(MemOp->getMemBaseReg()) || IsStackReg(MemOp->getMemIndexReg()))
+ return;
+
+ InstrumentMemOperandImpl(MemOp, AccessSize, IsWrite, Ctx, Out);
+}
+
+void X86AddressSanitizer::InstrumentMOV(
+ const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ MCContext &Ctx, MCStreamer &Out) {
+ // Access size in bytes.
+ unsigned AccessSize = 0;
+ unsigned long OpIx = Operands.size();
+ switch (Inst.getOpcode()) {
+ case X86::MOV8mi:
+ case X86::MOV8mr:
+ AccessSize = 1;
+ OpIx = 2;
+ break;
+ case X86::MOV8rm:
+ AccessSize = 1;
+ OpIx = 1;
+ break;
+ case X86::MOV16mi:
+ case X86::MOV16mr:
+ AccessSize = 2;
+ OpIx = 2;
+ break;
+ case X86::MOV16rm:
+ AccessSize = 2;
+ OpIx = 1;
+ break;
+ case X86::MOV32mi:
+ case X86::MOV32mr:
+ AccessSize = 4;
+ OpIx = 2;
+ break;
+ case X86::MOV32rm:
+ AccessSize = 4;
+ OpIx = 1;
+ break;
+ case X86::MOV64mi32:
+ case X86::MOV64mr:
+ AccessSize = 8;
+ OpIx = 2;
+ break;
+ case X86::MOV64rm:
+ AccessSize = 8;
+ OpIx = 1;
+ break;
+ case X86::MOVAPDmr:
+ case X86::MOVAPSmr:
+ AccessSize = 16;
+ OpIx = 2;
+ break;
+ case X86::MOVAPDrm:
+ case X86::MOVAPSrm:
+ AccessSize = 16;
+ OpIx = 1;
+ break;
+ }
+ if (OpIx >= Operands.size())
+ return;
+
+ const bool IsWrite = (OpIx != 1);
+ InstrumentMemOperand(Operands[OpIx], AccessSize, IsWrite, Ctx, Out);
+}
+
+class X86AddressSanitizer32 : public X86AddressSanitizer {
+public:
+ X86AddressSanitizer32(MCSubtargetInfo &sti) : X86AddressSanitizer(sti) {}
+ virtual ~X86AddressSanitizer32() {}
+
+ virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
+ bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) override;
+};
+
+void X86AddressSanitizer32::InstrumentMemOperandImpl(
+ X86Operand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) {
+ // FIXME: emit .cfi directives for correct stack unwinding.
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX));
+ {
+ MCInst Inst;
+ Inst.setOpcode(X86::LEA32r);
+ Inst.addOperand(MCOperand::CreateReg(X86::EAX));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX));
+ {
+ const std::string Func = FuncName(AccessSize, IsWrite);
+ const MCSymbol *FuncSym = Ctx.GetOrCreateSymbol(StringRef(Func));
+ const MCSymbolRefExpr *FuncExpr =
+ MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FuncExpr));
+ }
+ EmitInstruction(Out, MCInstBuilder(X86::ADD32ri).addReg(X86::ESP)
+ .addReg(X86::ESP).addImm(4));
+ EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
+}
+
+class X86AddressSanitizer64 : public X86AddressSanitizer {
+public:
+ X86AddressSanitizer64(MCSubtargetInfo &sti) : X86AddressSanitizer(sti) {}
+ virtual ~X86AddressSanitizer64() {}
+
+ virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
+ bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) override;
+};
+
+void X86AddressSanitizer64::InstrumentMemOperandImpl(
+ X86Operand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) {
+ // FIXME: emit .cfi directives for correct stack unwinding.
+ // Set %rsp below current red zone (128 bytes wide)
+ EmitInstruction(Out, MCInstBuilder(X86::SUB64ri32).addReg(X86::RSP)
+ .addReg(X86::RSP).addImm(128));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI));
+ {
+ MCInst Inst;
+ Inst.setOpcode(X86::LEA64r);
+ Inst.addOperand(MCOperand::CreateReg(X86::RDI));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+ {
+ const std::string Func = FuncName(AccessSize, IsWrite);
+ const MCSymbol *FuncSym = Ctx.GetOrCreateSymbol(StringRef(Func));
+ const MCSymbolRefExpr *FuncExpr =
+ MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FuncExpr));
+ }
+ EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI));
+ EmitInstruction(Out, MCInstBuilder(X86::ADD64ri32).addReg(X86::RSP)
+ .addReg(X86::RSP).addImm(128));
+}
+
+} // End anonymous namespace
+
+X86AsmInstrumentation::X86AsmInstrumentation() {}
+X86AsmInstrumentation::~X86AsmInstrumentation() {}
+
+void X86AsmInstrumentation::InstrumentInstruction(
+ const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ MCContext &Ctx, MCStreamer &Out) {}
+
+X86AsmInstrumentation *CreateX86AsmInstrumentation(MCSubtargetInfo &STI) {
+ if (ClAsanInstrumentInlineAssembly) {
+ if ((STI.getFeatureBits() & X86::Mode32Bit) != 0)
+ return new X86AddressSanitizer32(STI);
+ if ((STI.getFeatureBits() & X86::Mode64Bit) != 0)
+ return new X86AddressSanitizer64(STI);
+ }
+ return new X86AsmInstrumentation();
+}
+
+} // End llvm namespace
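The factory above is wired into X86AsmParser later in this patch: the parser constructs one X86AsmInstrumentation (a plain pass-through unless -asan-instrument-inline-assembly is set) and routes every instruction through a small wrapper before emission. A plausible condensed sketch of that wrapper, based only on the interface shown here; the real parser keeps the instrumentation object as a member:

    #include "X86AsmInstrumentation.h"
    #include "llvm/MC/MCInst.h"
    #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
    #include "llvm/MC/MCStreamer.h"
    #include "llvm/MC/MCSubtargetInfo.h"
    using namespace llvm;

    // Sketch: run the sanitizer hook first (it may emit address checks for
    // MOV-like opcodes), then emit the original instruction unchanged.
    static void emitInstrumented(X86AsmInstrumentation &Instr, const MCInst &Inst,
                                 SmallVectorImpl<MCParsedAsmOperand *> &Operands,
                                 MCContext &Ctx, MCStreamer &Out,
                                 MCSubtargetInfo &STI) {
      Instr.InstrumentInstruction(Inst, Operands, Ctx, Out);
      Out.EmitInstruction(Inst, STI);
    }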
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
new file mode 100644
index 0000000..c783a78
--- /dev/null
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -0,0 +1,46 @@
+//===- X86AsmInstrumentation.h - Instrument X86 inline assembly *- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ASM_INSTRUMENTATION_H
+#define X86_ASM_INSTRUMENTATION_H
+
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+class MCContext;
+class MCInst;
+class MCParsedAsmOperand;
+class MCStreamer;
+class MCSubtargetInfo;
+
+class X86AsmInstrumentation;
+
+X86AsmInstrumentation *CreateX86AsmInstrumentation(MCSubtargetInfo &STI);
+
+class X86AsmInstrumentation {
+public:
+ virtual ~X86AsmInstrumentation();
+
+ // Instruments Inst. Should be called just before the original
+ // instruction is sent to Out.
+ virtual void InstrumentInstruction(
+ const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ MCContext &Ctx, MCStreamer &Out);
+
+protected:
+ friend X86AsmInstrumentation *
+ CreateX86AsmInstrumentation(MCSubtargetInfo &STI);
+
+ X86AsmInstrumentation();
+};
+
+} // End llvm namespace
+
+#endif // X86_ASM_INSTRUMENTATION_H
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index bc8f367..9eddc74 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -8,6 +8,9 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86AsmInstrumentation.h"
+#include "X86AsmParserCommon.h"
+#include "X86Operand.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
@@ -28,19 +31,23 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include <memory>
using namespace llvm;
namespace {
-struct X86Operand;
static const char OpPrecedence[] = {
- 0, // IC_PLUS
- 0, // IC_MINUS
- 1, // IC_MULTIPLY
- 1, // IC_DIVIDE
- 2, // IC_RPAREN
- 3, // IC_LPAREN
+ 0, // IC_OR
+ 1, // IC_AND
+ 2, // IC_LSHIFT
+ 2, // IC_RSHIFT
+ 3, // IC_PLUS
+ 3, // IC_MINUS
+ 4, // IC_MULTIPLY
+ 4, // IC_DIVIDE
+ 5, // IC_RPAREN
+ 6, // IC_LPAREN
0, // IC_IMM
0 // IC_REGISTER
};
@@ -49,9 +56,20 @@ class X86AsmParser : public MCTargetAsmParser {
MCSubtargetInfo &STI;
MCAsmParser &Parser;
ParseInstructionInfo *InstInfo;
+ std::unique_ptr<X86AsmInstrumentation> Instrumentation;
private:
+ SMLoc consumeToken() {
+ SMLoc Result = Parser.getTok().getLoc();
+ Parser.Lex();
+ return Result;
+ }
+
enum InfixCalculatorTok {
- IC_PLUS = 0,
+ IC_OR = 0,
+ IC_AND,
+ IC_LSHIFT,
+ IC_RSHIFT,
+ IC_PLUS,
IC_MINUS,
IC_MULTIPLY,
IC_DIVIDE,
@@ -176,6 +194,30 @@ private:
Val = Op1.second / Op2.second;
OperandStack.push_back(std::make_pair(IC_IMM, Val));
break;
+ case IC_OR:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Or operation with an immediate and a register!");
+ Val = Op1.second | Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_AND:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "And operation with an immediate and a register!");
+ Val = Op1.second & Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_LSHIFT:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Left shift operation with an immediate and a register!");
+ Val = Op1.second << Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_RSHIFT:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Right shift operation with an immediate and a register!");
+ Val = Op1.second >> Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
}
}
}
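The relative precedences added here (| lowest, then &, then the shifts, then +/-, then * and /) happen to match C's grouping for the same operators, so a plain C++ program can illustrate how the calculator will group an Intel-syntax constant; the values below are illustrative only:

    #include <cassert>

    int main() {
      // PLUS (3) binds tighter than LSHIFT (2), so this groups as 1 << (2 + 3).
      assert((1 << 2 + 3) == 32);
      // AND (1) binds tighter than OR (0), so this groups as 0xF0 | (0x0F & 0x3C).
      assert((0xF0 | 0x0F & 0x3C) == 0xFC);
      return 0;
    }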
@@ -185,6 +227,10 @@ private:
};
enum IntelExprState {
+ IES_OR,
+ IES_AND,
+ IES_LSHIFT,
+ IES_RSHIFT,
IES_PLUS,
IES_MINUS,
IES_MULTIPLY,
@@ -231,6 +277,66 @@ private:
return Info;
}
+ void onOr() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_OR;
+ IC.pushOperator(IC_OR);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onAnd() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_AND;
+ IC.pushOperator(IC_AND);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onLShift() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_LSHIFT;
+ IC.pushOperator(IC_LSHIFT);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onRShift() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_RSHIFT;
+ IC.pushOperator(IC_RSHIFT);
+ break;
+ }
+ PrevState = CurrState;
+ }
void onPlus() {
IntelExprState CurrState = State;
switch (State) {
@@ -337,7 +443,7 @@ private:
break;
}
}
- void onInteger(int64_t TmpInt) {
+ bool onInteger(int64_t TmpInt, StringRef &ErrMsg) {
IntelExprState CurrState = State;
switch (State) {
default:
@@ -345,6 +451,10 @@ private:
break;
case IES_PLUS:
case IES_MINUS:
+ case IES_OR:
+ case IES_AND:
+ case IES_LSHIFT:
+ case IES_RSHIFT:
case IES_DIVIDE:
case IES_MULTIPLY:
case IES_LPAREN:
@@ -354,9 +464,15 @@ private:
assert (!IndexReg && "IndexReg already set!");
IndexReg = TmpReg;
Scale = TmpInt;
+ if(Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) {
+ ErrMsg = "scale factor in address must be 1, 2, 4 or 8";
+ return true;
+ }
// Get the scale and replace the 'Register * Scale' with '0'.
IC.popOperator();
} else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+ PrevState == IES_OR || PrevState == IES_AND ||
+ PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
PrevState == IES_LPAREN || PrevState == IES_LBRAC) &&
CurrState == IES_MINUS) {
@@ -369,6 +485,7 @@ private:
break;
}
PrevState = CurrState;
+ return false;
}
void onStar() {
PrevState = State;
@@ -442,11 +559,17 @@ private:
break;
case IES_PLUS:
case IES_MINUS:
+ case IES_OR:
+ case IES_AND:
+ case IES_LSHIFT:
+ case IES_RSHIFT:
case IES_MULTIPLY:
case IES_DIVIDE:
case IES_LPAREN:
// FIXME: We don't handle this type of unary minus, yet.
if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+ PrevState == IES_OR || PrevState == IES_AND ||
+ PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
PrevState == IES_LPAREN || PrevState == IES_LBRAC) &&
CurrState == IES_MINUS) {
@@ -486,11 +609,20 @@ private:
return Parser.Error(L, Msg, Ranges);
}
+ bool ErrorAndEatStatement(SMLoc L, const Twine &Msg,
+ ArrayRef<SMRange> Ranges = None,
+ bool MatchingInlineAsm = false) {
+ Parser.eatToEndOfStatement();
+ return Error(L, Msg, Ranges, MatchingInlineAsm);
+ }
+
X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) {
Error(Loc, Msg);
return 0;
}
+ X86Operand *DefaultMemSIOperand(SMLoc Loc);
+ X86Operand *DefaultMemDIOperand(SMLoc Loc);
X86Operand *ParseOperand();
X86Operand *ParseATTOperand();
X86Operand *ParseIntelOperand();
@@ -521,26 +653,47 @@ private:
bool processInstruction(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+ /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
+ /// instrumentation around Inst.
+ void EmitInstruction(MCInst &Inst,
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ MCStreamer &Out);
+
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm);
+ bool MatchingInlineAsm) override;
- /// isSrcOp - Returns true if operand is either (%rsi) or %ds:%(rsi)
- /// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode.
- bool isSrcOp(X86Operand &Op);
+  /// doSrcDstMatch - Returns true if the operands match in their word size
+  /// (%si and %di, %esi and %edi, etc.). Order depends on the parsing mode
+  /// (Intel vs. AT&T).
+ bool doSrcDstMatch(X86Operand &Op1, X86Operand &Op2);
- /// isDstOp - Returns true if operand is either (%rdi) or %es:(%rdi)
- /// in 64bit mode or (%edi) or %es:(%edi) in 32bit mode.
- bool isDstOp(X86Operand &Op);
+  /// Parses AVX512-specific operand primitives: masked registers
+  /// ({%k<NUM>}, {z}) and memory broadcasting ({1to<NUM>}), updating the
+  /// Operands vector if required.
+ /// \return \c true if no parsing errors occurred, \c false otherwise.
+ bool HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ const MCParsedAsmOperand &Op);
bool is64BitMode() const {
// FIXME: Can tablegen auto-generate this?
return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
}
- void SwitchMode() {
- unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(X86::Mode64Bit));
+ bool is32BitMode() const {
+ // FIXME: Can tablegen auto-generate this?
+ return (STI.getFeatureBits() & X86::Mode32Bit) != 0;
+ }
+ bool is16BitMode() const {
+ // FIXME: Can tablegen auto-generate this?
+ return (STI.getFeatureBits() & X86::Mode16Bit) != 0;
+ }
+ void SwitchMode(uint64_t mode) {
+ uint64_t oldMode = STI.getFeatureBits() &
+ (X86::Mode64Bit | X86::Mode32Bit | X86::Mode16Bit);
+ unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(oldMode | mode));
setAvailableFeatures(FB);
+ assert(mode == (STI.getFeatureBits() &
+ (X86::Mode64Bit | X86::Mode32Bit | X86::Mode16Bit)));
}
bool isParsingIntelSyntax() {
@@ -562,14 +715,15 @@ public:
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ Instrumentation.reset(CreateX86AsmInstrumentation(STI));
}
- virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
- virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
- SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ bool
+ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands) override;
- virtual bool ParseDirective(AsmToken DirectiveID);
+ bool ParseDirective(AsmToken DirectiveID) override;
};
} // end anonymous namespace
@@ -580,470 +734,63 @@ static unsigned MatchRegisterName(StringRef Name);
/// }
-static bool isImmSExti16i8Value(uint64_t Value) {
- return (( Value <= 0x000000000000007FULL)||
- (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)||
- (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
-}
-
-static bool isImmSExti32i8Value(uint64_t Value) {
- return (( Value <= 0x000000000000007FULL)||
- (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)||
- (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
-}
-
-static bool isImmZExtu32u8Value(uint64_t Value) {
- return (Value <= 0x00000000000000FFULL);
-}
-
-static bool isImmSExti64i8Value(uint64_t Value) {
- return (( Value <= 0x000000000000007FULL)||
- (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
-}
-
-static bool isImmSExti64i32Value(uint64_t Value) {
- return (( Value <= 0x000000007FFFFFFFULL)||
- (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
-}
-namespace {
-
-/// X86Operand - Instances of this class represent a parsed X86 machine
-/// instruction.
-struct X86Operand : public MCParsedAsmOperand {
- enum KindTy {
- Token,
- Register,
- Immediate,
- Memory
- } Kind;
-
- SMLoc StartLoc, EndLoc;
- SMLoc OffsetOfLoc;
- StringRef SymName;
- void *OpDecl;
- bool AddressOf;
-
- struct TokOp {
- const char *Data;
- unsigned Length;
- };
-
- struct RegOp {
- unsigned RegNo;
- };
-
- struct ImmOp {
- const MCExpr *Val;
- };
-
- struct MemOp {
- unsigned SegReg;
- const MCExpr *Disp;
- unsigned BaseReg;
- unsigned IndexReg;
- unsigned Scale;
- unsigned Size;
- };
-
- union {
- struct TokOp Tok;
- struct RegOp Reg;
- struct ImmOp Imm;
- struct MemOp Mem;
- };
-
- X86Operand(KindTy K, SMLoc Start, SMLoc End)
- : Kind(K), StartLoc(Start), EndLoc(End) {}
-
- StringRef getSymName() { return SymName; }
- void *getOpDecl() { return OpDecl; }
-
- /// getStartLoc - Get the location of the first token of this operand.
- SMLoc getStartLoc() const { return StartLoc; }
- /// getEndLoc - Get the location of the last token of this operand.
- SMLoc getEndLoc() const { return EndLoc; }
- /// getLocRange - Get the range between the first and last token of this
- /// operand.
- SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
- /// getOffsetOfLoc - Get the location of the offset operator.
- SMLoc getOffsetOfLoc() const { return OffsetOfLoc; }
-
- virtual void print(raw_ostream &OS) const {}
-
- StringRef getToken() const {
- assert(Kind == Token && "Invalid access!");
- return StringRef(Tok.Data, Tok.Length);
- }
- void setTokenValue(StringRef Value) {
- assert(Kind == Token && "Invalid access!");
- Tok.Data = Value.data();
- Tok.Length = Value.size();
- }
-
- unsigned getReg() const {
- assert(Kind == Register && "Invalid access!");
- return Reg.RegNo;
- }
-
- const MCExpr *getImm() const {
- assert(Kind == Immediate && "Invalid access!");
- return Imm.Val;
- }
-
- const MCExpr *getMemDisp() const {
- assert(Kind == Memory && "Invalid access!");
- return Mem.Disp;
- }
- unsigned getMemSegReg() const {
- assert(Kind == Memory && "Invalid access!");
- return Mem.SegReg;
- }
- unsigned getMemBaseReg() const {
- assert(Kind == Memory && "Invalid access!");
- return Mem.BaseReg;
- }
- unsigned getMemIndexReg() const {
- assert(Kind == Memory && "Invalid access!");
- return Mem.IndexReg;
- }
- unsigned getMemScale() const {
- assert(Kind == Memory && "Invalid access!");
- return Mem.Scale;
- }
-
- bool isToken() const {return Kind == Token; }
-
- bool isImm() const { return Kind == Immediate; }
-
- bool isImmSExti16i8() const {
- if (!isImm())
- return false;
-
- // If this isn't a constant expr, just assume it fits and let relaxation
- // handle it.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE)
- return true;
-
- // Otherwise, check the value is in a range that makes sense for this
- // extension.
- return isImmSExti16i8Value(CE->getValue());
- }
- bool isImmSExti32i8() const {
- if (!isImm())
- return false;
-
- // If this isn't a constant expr, just assume it fits and let relaxation
- // handle it.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE)
- return true;
-
- // Otherwise, check the value is in a range that makes sense for this
- // extension.
- return isImmSExti32i8Value(CE->getValue());
- }
- bool isImmZExtu32u8() const {
- if (!isImm())
- return false;
-
- // If this isn't a constant expr, just assume it fits and let relaxation
- // handle it.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE)
- return true;
-
- // Otherwise, check the value is in a range that makes sense for this
- // extension.
- return isImmZExtu32u8Value(CE->getValue());
- }
- bool isImmSExti64i8() const {
- if (!isImm())
- return false;
-
- // If this isn't a constant expr, just assume it fits and let relaxation
- // handle it.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE)
+static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg,
+ StringRef &ErrMsg) {
+ // If we have both a base register and an index register make sure they are
+ // both 64-bit or 32-bit registers.
+ // To support VSIB, IndexReg can be 128-bit or 256-bit registers.
+ if (BaseReg != 0 && IndexReg != 0) {
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
+ IndexReg != X86::RIZ) {
+ ErrMsg = "base register is 64-bit, but index register is not";
return true;
-
- // Otherwise, check the value is in a range that makes sense for this
- // extension.
- return isImmSExti64i8Value(CE->getValue());
- }
- bool isImmSExti64i32() const {
- if (!isImm())
- return false;
-
- // If this isn't a constant expr, just assume it fits and let relaxation
- // handle it.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE)
+ }
+ if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
+ IndexReg != X86::EIZ){
+ ErrMsg = "base register is 32-bit, but index register is not";
return true;
-
- // Otherwise, check the value is in a range that makes sense for this
- // extension.
- return isImmSExti64i32Value(CE->getValue());
- }
-
- bool isOffsetOf() const {
- return OffsetOfLoc.getPointer();
- }
-
- bool needAddressOf() const {
- return AddressOf;
- }
-
- bool isMem() const { return Kind == Memory; }
- bool isMem8() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 8);
- }
- bool isMem16() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 16);
- }
- bool isMem32() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 32);
- }
- bool isMem64() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 64);
- }
- bool isMem80() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 80);
- }
- bool isMem128() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 128);
- }
- bool isMem256() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 256);
- }
- bool isMem512() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 512);
- }
-
- bool isMemVX32() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
- getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
- }
- bool isMemVY32() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
- getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
- }
- bool isMemVX64() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
- getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
- }
- bool isMemVY64() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
- getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
- }
- bool isMemVZ32() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
- getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
- }
- bool isMemVZ64() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
- getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
- }
-
- bool isAbsMem() const {
- return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
- !getMemIndexReg() && getMemScale() == 1;
- }
-
- bool isMemOffs8() const {
- return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
- !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 8);
- }
- bool isMemOffs16() const {
- return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
- !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 16);
- }
- bool isMemOffs32() const {
- return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
- !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 32);
- }
- bool isMemOffs64() const {
- return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
- !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 64);
- }
-
- bool isReg() const { return Kind == Register; }
-
- bool isGR32orGR64() const {
- return Kind == Register &&
- (X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) ||
- X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
- }
-
- void addExpr(MCInst &Inst, const MCExpr *Expr) const {
- // Add as immediates when possible.
- if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
- Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
- else
- Inst.addOperand(MCOperand::CreateExpr(Expr));
- }
-
- void addRegOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::CreateReg(getReg()));
- }
-
- static unsigned getGR32FromGR64(unsigned RegNo) {
- switch (RegNo) {
- default: llvm_unreachable("Unexpected register");
- case X86::RAX: return X86::EAX;
- case X86::RCX: return X86::ECX;
- case X86::RDX: return X86::EDX;
- case X86::RBX: return X86::EBX;
- case X86::RBP: return X86::EBP;
- case X86::RSP: return X86::ESP;
- case X86::RSI: return X86::ESI;
- case X86::RDI: return X86::EDI;
- case X86::R8: return X86::R8D;
- case X86::R9: return X86::R9D;
- case X86::R10: return X86::R10D;
- case X86::R11: return X86::R11D;
- case X86::R12: return X86::R12D;
- case X86::R13: return X86::R13D;
- case X86::R14: return X86::R14D;
- case X86::R15: return X86::R15D;
- case X86::RIP: return X86::EIP;
- }
- }
-
- void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- unsigned RegNo = getReg();
- if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
- RegNo = getGR32FromGR64(RegNo);
- Inst.addOperand(MCOperand::CreateReg(RegNo));
- }
-
- void addImmOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- addExpr(Inst, getImm());
- }
-
- void addMemOperands(MCInst &Inst, unsigned N) const {
- assert((N == 5) && "Invalid number of operands!");
- Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
- Inst.addOperand(MCOperand::CreateImm(getMemScale()));
- Inst.addOperand(MCOperand::CreateReg(getMemIndexReg()));
- addExpr(Inst, getMemDisp());
- Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
- }
-
- void addAbsMemOperands(MCInst &Inst, unsigned N) const {
- assert((N == 1) && "Invalid number of operands!");
- // Add as immediates when possible.
- if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
- Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
- else
- Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
- }
-
- void addMemOffsOperands(MCInst &Inst, unsigned N) const {
- assert((N == 1) && "Invalid number of operands!");
- // Add as immediates when possible.
- if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
- Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
- else
- Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
- }
-
- static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
- SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
- X86Operand *Res = new X86Operand(Token, Loc, EndLoc);
- Res->Tok.Data = Str.data();
- Res->Tok.Length = Str.size();
- return Res;
- }
-
- static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
- bool AddressOf = false,
- SMLoc OffsetOfLoc = SMLoc(),
- StringRef SymName = StringRef(),
- void *OpDecl = 0) {
- X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
- Res->Reg.RegNo = RegNo;
- Res->AddressOf = AddressOf;
- Res->OffsetOfLoc = OffsetOfLoc;
- Res->SymName = SymName;
- Res->OpDecl = OpDecl;
- return Res;
- }
-
- static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){
- X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc);
- Res->Imm.Val = Val;
- return Res;
- }
-
- /// Create an absolute memory operand.
- static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
- unsigned Size = 0, StringRef SymName = StringRef(),
- void *OpDecl = 0) {
- X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
- Res->Mem.SegReg = 0;
- Res->Mem.Disp = Disp;
- Res->Mem.BaseReg = 0;
- Res->Mem.IndexReg = 0;
- Res->Mem.Scale = 1;
- Res->Mem.Size = Size;
- Res->SymName = SymName;
- Res->OpDecl = OpDecl;
- Res->AddressOf = false;
- return Res;
- }
-
- /// Create a generalized memory operand.
- static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp,
- unsigned BaseReg, unsigned IndexReg,
- unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
- unsigned Size = 0,
- StringRef SymName = StringRef(),
- void *OpDecl = 0) {
- // We should never just have a displacement, that should be parsed as an
- // absolute memory operand.
- assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
-
- // The scale should always be one of {1,2,4,8}.
- assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
- "Invalid scale!");
- X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
- Res->Mem.SegReg = SegReg;
- Res->Mem.Disp = Disp;
- Res->Mem.BaseReg = BaseReg;
- Res->Mem.IndexReg = IndexReg;
- Res->Mem.Scale = Scale;
- Res->Mem.Size = Size;
- Res->SymName = SymName;
- Res->OpDecl = OpDecl;
- Res->AddressOf = false;
- return Res;
+ }
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) {
+ if (X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) {
+ ErrMsg = "base register is 16-bit, but index register is not";
+ return true;
+ }
+ if (((BaseReg == X86::BX || BaseReg == X86::BP) &&
+ IndexReg != X86::SI && IndexReg != X86::DI) ||
+ ((BaseReg == X86::SI || BaseReg == X86::DI) &&
+ IndexReg != X86::BX && IndexReg != X86::BP)) {
+ ErrMsg = "invalid 16-bit base/index register combination";
+ return true;
+ }
+ }
}
-};
-
-} // end anonymous namespace.
-
-bool X86AsmParser::isSrcOp(X86Operand &Op) {
- unsigned basereg = is64BitMode() ? X86::RSI : X86::ESI;
-
- return (Op.isMem() &&
- (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::DS) &&
- isa<MCConstantExpr>(Op.Mem.Disp) &&
- cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
- Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0);
+ return false;
}
-bool X86AsmParser::isDstOp(X86Operand &Op) {
- unsigned basereg = is64BitMode() ? X86::RDI : X86::EDI;
+bool X86AsmParser::doSrcDstMatch(X86Operand &Op1, X86Operand &Op2)
+{
+ // Return true and let a normal complaint about bogus operands happen.
+ if (!Op1.isMem() || !Op2.isMem())
+ return true;
- return Op.isMem() &&
- (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::ES) &&
- isa<MCConstantExpr>(Op.Mem.Disp) &&
- cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
- Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0;
+ // Actually these might be the other way round if Intel syntax is
+ // being used. It doesn't matter.
+ unsigned diReg = Op1.Mem.BaseReg;
+ unsigned siReg = Op2.Mem.BaseReg;
+
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(siReg))
+ return X86MCRegisterClasses[X86::GR16RegClassID].contains(diReg);
+ if (X86MCRegisterClasses[X86::GR32RegClassID].contains(siReg))
+ return X86MCRegisterClasses[X86::GR32RegClassID].contains(diReg);
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(siReg))
+ return X86MCRegisterClasses[X86::GR64RegClassID].contains(diReg);
+ // Again, return true and let another error happen.
+ return true;
}
bool X86AsmParser::ParseRegister(unsigned &RegNo,
@@ -1073,7 +820,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
RegNo = MatchRegisterName(Tok.getString().lower());
if (!is64BitMode()) {
- // FIXME: This should be done using Requires<In32BitMode> and
+ // FIXME: This should be done using Requires<Not64BitMode> and
// Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
// checked.
// FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
@@ -1155,6 +902,22 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
return false;
}
+X86Operand *X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
+ unsigned basereg =
+ is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI);
+ const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
+ return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/basereg,
+ /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0);
+}
+
+X86Operand *X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
+ unsigned basereg =
+ is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI);
+ const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
+ return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/basereg,
+ /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0);
+}
+
X86Operand *X86AsmParser::ParseOperand() {
if (isParsingIntelSyntax())
return ParseIntelOperand();
@@ -1171,6 +934,8 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) {
.Cases("XWORD", "xword", 80)
.Cases("XMMWORD", "xmmword", 128)
.Cases("YMMWORD", "ymmword", 256)
+ .Cases("ZMMWORD", "zmmword", 512)
+ .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter
.Default(0);
return Size;
}
@@ -1181,16 +946,24 @@ X86AsmParser::CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp,
unsigned Scale, SMLoc Start, SMLoc End,
unsigned Size, StringRef Identifier,
InlineAsmIdentifierInfo &Info){
- if (isa<MCSymbolRefExpr>(Disp)) {
- // If this is not a VarDecl then assume it is a FuncDecl or some other label
- // reference. We need an 'r' constraint here, so we need to create register
- // operand to ensure proper matching. Just pick a GPR based on the size of
- // a pointer.
- if (!Info.IsVarDecl) {
- unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
- return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true,
- SMLoc(), Identifier, Info.OpDecl);
- }
+ // If this is not a VarDecl then assume it is a FuncDecl or some other label
+ // reference. We need an 'r' constraint here, so we need to create register
+ // operand to ensure proper matching. Just pick a GPR based on the size of
+ // a pointer.
+ if (isa<MCSymbolRefExpr>(Disp) && !Info.IsVarDecl) {
+ unsigned RegNo =
+ is64BitMode() ? X86::RBX : (is32BitMode() ? X86::EBX : X86::BX);
+ return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true,
+ SMLoc(), Identifier, Info.OpDecl);
+ }
+
+ // We either have a direct symbol reference, or an offset from a symbol. The
+ // parser always puts the symbol on the LHS, so look there for size
+ // calculation purposes.
+ const MCBinaryExpr *BinOp = dyn_cast<MCBinaryExpr>(Disp);
+ bool IsSymRef =
+ isa<MCSymbolRefExpr>(BinOp ? BinOp->getLHS() : Disp);
+ if (IsSymRef) {
if (!Size) {
Size = Info.Type * 8; // Size is in terms of bits in this context.
if (Size)
@@ -1312,10 +1085,15 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (getParser().parsePrimaryExpr(Val, End))
return Error(Tok.getLoc(), "Unexpected identifier!");
} else {
- InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
- if (ParseIntelIdentifier(Val, Identifier, Info,
- /*Unevaluated=*/false, End))
- return true;
+ // This is a dot operator, not an adjacent identifier.
+ if (Identifier.find('.') != StringRef::npos) {
+ return false;
+ } else {
+ InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return true;
+ }
}
SM.onIdentifierExpr(Val, Identifier);
UpdateLocLex = false;
@@ -1323,16 +1101,49 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
}
return Error(Tok.getLoc(), "Unexpected identifier!");
}
- case AsmToken::Integer:
+ case AsmToken::Integer: {
+ StringRef ErrMsg;
if (isParsingInlineAsm() && SM.getAddImmPrefix())
InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix,
Tok.getLoc()));
- SM.onInteger(Tok.getIntVal());
+ // Look for 'b' or 'f' following an Integer as a directional label
+ SMLoc Loc = getTok().getLoc();
+ int64_t IntVal = getTok().getIntVal();
+ End = consumeToken();
+ UpdateLocLex = false;
+ if (getLexer().getKind() == AsmToken::Identifier) {
+ StringRef IDVal = getTok().getString();
+ if (IDVal == "f" || IDVal == "b") {
+ MCSymbol *Sym =
+ getContext().GetDirectionalLocalSymbol(IntVal, IDVal == "b");
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+ const MCExpr *Val =
+ MCSymbolRefExpr::Create(Sym, Variant, getContext());
+ if (IDVal == "b" && Sym->isUndefined())
+ return Error(Loc, "invalid reference to undefined symbol");
+ StringRef Identifier = Sym->getName();
+ SM.onIdentifierExpr(Val, Identifier);
+ End = consumeToken();
+ } else {
+ if (SM.onInteger(IntVal, ErrMsg))
+ return Error(Loc, ErrMsg);
+ }
+ } else {
+ if (SM.onInteger(IntVal, ErrMsg))
+ return Error(Loc, ErrMsg);
+ }
break;
+ }
case AsmToken::Plus: SM.onPlus(); break;
case AsmToken::Minus: SM.onMinus(); break;
case AsmToken::Star: SM.onStar(); break;
case AsmToken::Slash: SM.onDivide(); break;
+ case AsmToken::Pipe: SM.onOr(); break;
+ case AsmToken::Amp: SM.onAnd(); break;
+ case AsmToken::LessLess:
+ SM.onLShift(); break;
+ case AsmToken::GreaterGreater:
+ SM.onRShift(); break;
case AsmToken::LBrac: SM.onLBrac(); break;
case AsmToken::RBrac: SM.onRBrac(); break;
case AsmToken::LParen: SM.onLParen(); break;
@@ -1341,10 +1152,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (SM.hadError())
return Error(Tok.getLoc(), "unknown token in expression");
- if (!Done && UpdateLocLex) {
- End = Tok.getLoc();
- Parser.Lex(); // Consume the token.
- }
+ if (!Done && UpdateLocLex)
+ End = consumeToken();
}
return false;
}
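Two extensions are visible in this hunk: an integer followed by 'b' or 'f' is now treated as a directional local label (and a backward reference to a label that has not been defined yet is rejected), and the expression machine accepts the bitwise operators |, &, << and >>. The sketch below only illustrates folding such operators over integer immediates in accumulator style; it is not the real IntelExprStateMachine and deliberately ignores its precedence handling:

#include <cassert>
#include <cstdint>

enum class Op { None, Or, And, Shl, Shr, Add, Sub, Mul, Div };

// Apply a pending binary operator to the accumulated value.
static int64_t apply(int64_t Acc, Op O, int64_t V) {
  switch (O) {
  case Op::Or:   return Acc | V;
  case Op::And:  return Acc & V;
  case Op::Shl:  return Acc << V;
  case Op::Shr:  return Acc >> V;
  case Op::Add:  return Acc + V;
  case Op::Sub:  return Acc - V;
  case Op::Mul:  return Acc * V;
  case Op::Div:  return Acc / V;
  case Op::None: return V;
  }
  return V;
}

int main() {
  // "8 << 2 | 1" folded strictly left to right: ((8 << 2) | 1) == 33.
  int64_t Acc = apply(0, Op::None, 8);
  Acc = apply(Acc, Op::Shl, 2);
  Acc = apply(Acc, Op::Or, 1);
  assert(Acc == 33);
}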
@@ -1366,7 +1175,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
if (ParseIntelExpression(SM, End))
return 0;
- const MCExpr *Disp;
+ const MCExpr *Disp = 0;
if (const MCExpr *Sym = SM.getSym()) {
// A symbolic displacement.
Disp = Sym;
@@ -1374,13 +1183,20 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
RewriteIntelBracExpression(InstInfo->AsmRewrites, SM.getSymName(),
ImmDisp, SM.getImm(), BracLoc, StartInBrac,
End);
- } else {
- // An immediate displacement only.
- Disp = MCConstantExpr::Create(SM.getImm(), getContext());
}
- // Parse the dot operator (e.g., [ebx].foo.bar).
- if (Tok.getString().startswith(".")) {
+ if (SM.getImm() || !Disp) {
+ const MCExpr *Imm = MCConstantExpr::Create(SM.getImm(), getContext());
+ if (Disp)
+ Disp = MCBinaryExpr::CreateAdd(Disp, Imm, getContext());
+ else
+ Disp = Imm; // An immediate displacement only.
+ }
+
+ // Parse struct field access. Intel requires a dot, but MSVC doesn't. MSVC
+  // will in fact look up the field name in all global typedefs,
+ // but we don't emulate that.
+ if (Tok.getString().find('.') != StringRef::npos) {
const MCExpr *NewDisp;
if (ParseIntelDotOperator(Disp, NewDisp))
return 0;
@@ -1401,6 +1217,11 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
else
return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size);
}
+ StringRef ErrMsg;
+ if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
+ Error(StartInBrac, ErrMsg);
+ return 0;
+ }
return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
End, Size);
}
@@ -1502,6 +1323,7 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
// Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
if (getLexer().is(AsmToken::LBrac))
return ParseIntelBracExpression(/*SegReg=*/0, Start, ImmDisp, Size);
+ assert(ImmDisp == 0);
const MCExpr *Val;
if (!isParsingInlineAsm()) {
@@ -1516,8 +1338,39 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
if (ParseIntelIdentifier(Val, Identifier, Info,
/*Unevaluated=*/false, End))
return 0;
- return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0,
- /*Scale=*/1, Start, End, Size, Identifier, Info);
+
+ if (!getLexer().is(AsmToken::LBrac))
+ return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0,
+ /*Scale=*/1, Start, End, Size, Identifier, Info);
+
+ Parser.Lex(); // Eat '['
+
+ // Parse Identifier [ ImmDisp ]
+ IntelExprStateMachine SM(/*ImmDisp=*/0, /*StopOnLBrac=*/true,
+ /*AddImmPrefix=*/false);
+ if (ParseIntelExpression(SM, End))
+ return 0;
+
+ if (SM.getSym()) {
+ Error(Start, "cannot use more than one symbol in memory operand");
+ return 0;
+ }
+ if (SM.getBaseReg()) {
+ Error(Start, "cannot use base register with variable reference");
+ return 0;
+ }
+ if (SM.getIndexReg()) {
+ Error(Start, "cannot use index register with variable reference");
+ return 0;
+ }
+
+ const MCExpr *Disp = MCConstantExpr::Create(SM.getImm(), getContext());
+ // BaseReg is non-zero to avoid assertions. In the context of inline asm,
+ // we're pointing to a local variable in memory, so the base register is
+ // really the frame or stack pointer.
+ return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/1, /*IndexReg=*/0,
+ /*Scale=*/1, Start, End, Size, Identifier,
+ Info.OpDecl);
}
/// Parse the '.' operator.
@@ -1532,8 +1385,10 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
else
return Error(Tok.getLoc(), "Non-constant offsets are not supported!");
- // Drop the '.'.
- StringRef DotDispStr = Tok.getString().drop_front(1);
+ // Drop the optional '.'.
+ StringRef DotDispStr = Tok.getString();
+ if (DotDispStr.startswith("."))
+ DotDispStr = DotDispStr.drop_front(1);
// .Imm gets lexed as a real.
if (Tok.is(AsmToken::Real)) {
@@ -1583,7 +1438,8 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() {
// The offset operator will have an 'r' constraint, thus we need to create
// register operand to ensure proper matching. Just pick a GPR based on
// the size of a pointer.
- unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
+ unsigned RegNo =
+ is64BitMode() ? X86::RBX : (is32BitMode() ? X86::EBX : X86::BX);
return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
OffsetOfLoc, Identifier, Info.OpDecl);
}
@@ -1680,6 +1536,13 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
}
if (getLexer().isNot(AsmToken::LBrac)) {
+    // If a directional label (i.e. 1f or 2b) was parsed above from
+    // ParseIntelExpression() then SM.getSym() was set to a pointer to
+    // the MCExpr with the directional local symbol, and this is a
+    // memory operand, not an immediate operand.
+ if (SM.getSym())
+ return X86Operand::CreateMem(SM.getSym(), Start, End, Size);
+
const MCExpr *ImmExpr = MCConstantExpr::Create(Imm, getContext());
return X86Operand::CreateImm(ImmExpr, Start, End);
}
@@ -1744,6 +1607,73 @@ X86Operand *X86AsmParser::ParseATTOperand() {
}
}
+bool
+X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ const MCParsedAsmOperand &Op) {
+ if(STI.getFeatureBits() & X86::FeatureAVX512) {
+ if (getLexer().is(AsmToken::LCurly)) {
+ // Eat "{" and mark the current place.
+ const SMLoc consumedToken = consumeToken();
+ // Distinguish {1to<NUM>} from {%k<NUM>}.
+ if(getLexer().is(AsmToken::Integer)) {
+ // Parse memory broadcasting ({1to<NUM>}).
+ if (getLexer().getTok().getIntVal() != 1)
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Expected 1to<NUM> at this point");
+ Parser.Lex(); // Eat "1" of 1to8
+ if (!getLexer().is(AsmToken::Identifier) ||
+ !getLexer().getTok().getIdentifier().startswith("to"))
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Expected 1to<NUM> at this point");
+ // Recognize only reasonable suffixes.
+ const char *BroadcastPrimitive =
+ StringSwitch<const char*>(getLexer().getTok().getIdentifier())
+ .Case("to8", "{1to8}")
+ .Case("to16", "{1to16}")
+ .Default(0);
+ if (!BroadcastPrimitive)
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Invalid memory broadcast primitive.");
+ Parser.Lex(); // Eat "toN" of 1toN
+ if (!getLexer().is(AsmToken::RCurly))
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Expected } at this point");
+ Parser.Lex(); // Eat "}"
+ Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive,
+ consumedToken));
+        // No AVX512-specific primitives can follow
+        // a memory broadcast, so return.
+ return true;
+ } else {
+ // Parse mask register {%k1}
+ Operands.push_back(X86Operand::CreateToken("{", consumedToken));
+ if (X86Operand *Op = ParseOperand()) {
+ Operands.push_back(Op);
+ if (!getLexer().is(AsmToken::RCurly))
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Expected } at this point");
+ Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
+
+ // Parse "zeroing non-masked" semantic {z}
+ if (getLexer().is(AsmToken::LCurly)) {
+ Operands.push_back(X86Operand::CreateToken("{z}", consumeToken()));
+ if (!getLexer().is(AsmToken::Identifier) ||
+ getLexer().getTok().getIdentifier() != "z")
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Expected z at this point");
+ Parser.Lex(); // Eat the z
+ if (!getLexer().is(AsmToken::RCurly))
+ return !ErrorAndEatStatement(getLexer().getLoc(),
+ "Expected } at this point");
+ Parser.Lex(); // Eat the }
+ }
+ }
+ }
+ }
+ }
+ return true;
+}
+
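HandleAVX512Operand covers the three AVX-512 operand decorations: a memory broadcast such as {1to8}, a write-mask register such as {%k1}, and the zeroing marker {z} that may follow a mask; once a broadcast has been parsed no further decorations are accepted. A small standalone check of the broadcast suffix, mirroring the StringSwitch above (only to8 and to16 are recognized), might look like this:

#include <cassert>
#include <string>

// Sketch: validate a "{1toN}" broadcast decoration the way the parser does,
// accepting only the suffixes it knows how to encode.
static const char *broadcastToken(const std::string &Decoration) {
  if (Decoration == "{1to8}")  return "{1to8}";
  if (Decoration == "{1to16}") return "{1to16}";
  return nullptr; // anything else: "Invalid memory broadcast primitive."
}

int main() {
  assert(broadcastToken("{1to16}") != nullptr);
  assert(broadcastToken("{1to4}") == nullptr); // not supported by this parser
}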
/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix
/// has already been parsed if present.
X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
@@ -1801,10 +1731,11 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
// If we reached here, then we just ate the ( of the memory operand. Process
// the rest of the memory operand.
unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
- SMLoc IndexLoc;
+ SMLoc IndexLoc, BaseLoc;
if (getLexer().is(AsmToken::Percent)) {
SMLoc StartLoc, EndLoc;
+ BaseLoc = Parser.getTok().getLoc();
if (ParseRegister(BaseReg, StartLoc, EndLoc)) return 0;
if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) {
Error(StartLoc, "eiz and riz can only be used as index registers",
@@ -1847,6 +1778,11 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
}
// Validate the scale amount.
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+ ScaleVal != 1) {
+ Error(Loc, "scale factor in 16-bit address must be 1");
+ return 0;
+ }
if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){
Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
return 0;
@@ -1877,24 +1813,26 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
SMLoc MemEnd = Parser.getTok().getEndLoc();
Parser.Lex(); // Eat the ')'.
- // If we have both a base register and an index register make sure they are
- // both 64-bit or 32-bit registers.
- // To support VSIB, IndexReg can be 128-bit or 256-bit registers.
- if (BaseReg != 0 && IndexReg != 0) {
- if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
- (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
- X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
- IndexReg != X86::RIZ) {
- Error(IndexLoc, "index register is 32-bit, but base register is 64-bit");
- return 0;
- }
- if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
- (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
- X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
- IndexReg != X86::EIZ){
- Error(IndexLoc, "index register is 64-bit, but base register is 32-bit");
- return 0;
- }
+ // Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed,
+ // and then only in non-64-bit modes. Except for DX, which is a special case
+ // because an unofficial form of in/out instructions uses it.
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+ (is64BitMode() || (BaseReg != X86::BX && BaseReg != X86::BP &&
+ BaseReg != X86::SI && BaseReg != X86::DI)) &&
+ BaseReg != X86::DX) {
+ Error(BaseLoc, "invalid 16-bit base register");
+ return 0;
+ }
+ if (BaseReg == 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
+ Error(IndexLoc, "16-bit memory operand may not include only index register");
+ return 0;
+ }
+
+ StringRef ErrMsg;
+ if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
+ Error(BaseLoc, ErrMsg);
+ return 0;
}
return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
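Taken together with the scale check added earlier in this function, the 16-bit addressing rules are: the base must be BX, BP, SI or DI (DX is tolerated only because of the unofficial in/out forms), a 16-bit index register may not appear without a base, and the scale must be 1. A compact predicate expressing those rules, as a sketch rather than the parser's actual checks (register names are plain strings here, and every name passed in is assumed to be a 16-bit register):

#include <cassert>
#include <string>

// Sketch of the 16-bit addressing constraints enforced above.
static bool valid16BitMem(const std::string &Base, const std::string &Index,
                          unsigned Scale, bool In64BitMode) {
  const bool BaseIsBxBpSiDi = Base == "bx" || Base == "bp" ||
                              Base == "si" || Base == "di";
  // DX escapes the check because an unofficial in/out form uses it.
  if (!Base.empty() && Base != "dx" && (In64BitMode || !BaseIsBxBpSiDi))
    return false;                 // "invalid 16-bit base register"
  if (Base.empty() && !Index.empty())
    return false;                 // index-only 16-bit operands are rejected
  return Scale == 1;              // "scale factor in 16-bit address must be 1"
}

int main() {
  assert(valid16BitMem("bx", "si", 1, false));  // classic (%bx,%si)
  assert(!valid16BitMem("ax", "", 1, false));   // AX is not a legal base
  assert(!valid16BitMem("bx", "si", 2, false)); // scale must be 1
  assert(!valid16BitMem("", "si", 1, false));   // no index-only form
}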
@@ -1991,84 +1929,35 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) {
// Parse '*' modifier.
- if (getLexer().is(AsmToken::Star)) {
- SMLoc Loc = Parser.getTok().getLoc();
- Operands.push_back(X86Operand::CreateToken("*", Loc));
- Parser.Lex(); // Eat the star.
- }
-
- // Read the first operand.
- if (X86Operand *Op = ParseOperand())
- Operands.push_back(Op);
- else {
- Parser.eatToEndOfStatement();
- return true;
- }
-
- while (getLexer().is(AsmToken::Comma)) {
- Parser.Lex(); // Eat the comma.
-
- // Parse and remember the operand.
- if (X86Operand *Op = ParseOperand())
- Operands.push_back(Op);
- else {
- Parser.eatToEndOfStatement();
- return true;
- }
- }
-
- if (STI.getFeatureBits() & X86::FeatureAVX512) {
- // Parse mask register {%k1}
- if (getLexer().is(AsmToken::LCurly)) {
- SMLoc Loc = Parser.getTok().getLoc();
- Operands.push_back(X86Operand::CreateToken("{", Loc));
- Parser.Lex(); // Eat the {
- if (X86Operand *Op = ParseOperand()) {
- Operands.push_back(Op);
- if (!getLexer().is(AsmToken::RCurly)) {
- SMLoc Loc = getLexer().getLoc();
- Parser.eatToEndOfStatement();
- return Error(Loc, "Expected } at this point");
- }
- Loc = Parser.getTok().getLoc();
- Operands.push_back(X86Operand::CreateToken("}", Loc));
- Parser.Lex(); // Eat the }
- } else {
- Parser.eatToEndOfStatement();
+ if (getLexer().is(AsmToken::Star))
+ Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
+
+ // Read the operands.
+ while(1) {
+ if (X86Operand *Op = ParseOperand()) {
+ Operands.push_back(Op);
+ if (!HandleAVX512Operand(Operands, *Op))
return true;
- }
- }
- // Parse "zeroing non-masked" semantic {z}
- if (getLexer().is(AsmToken::LCurly)) {
- SMLoc Loc = Parser.getTok().getLoc();
- Operands.push_back(X86Operand::CreateToken("{z}", Loc));
- Parser.Lex(); // Eat the {
- if (!getLexer().is(AsmToken::Identifier) || getLexer().getTok().getIdentifier() != "z") {
- SMLoc Loc = getLexer().getLoc();
- Parser.eatToEndOfStatement();
- return Error(Loc, "Expected z at this point");
- }
- Parser.Lex(); // Eat the z
- if (!getLexer().is(AsmToken::RCurly)) {
- SMLoc Loc = getLexer().getLoc();
- Parser.eatToEndOfStatement();
- return Error(Loc, "Expected } at this point");
- }
- Parser.Lex(); // Eat the }
+ } else {
+ Parser.eatToEndOfStatement();
+ return true;
}
- }
+      // Check for a comma and eat it.
+ if (getLexer().is(AsmToken::Comma))
+ Parser.Lex();
+ else
+ break;
+ }
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- SMLoc Loc = getLexer().getLoc();
- Parser.eatToEndOfStatement();
- return Error(Loc, "unexpected token in argument list");
- }
- }
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return ErrorAndEatStatement(getLexer().getLoc(),
+ "unexpected token in argument list");
+ }
- if (getLexer().is(AsmToken::EndOfStatement))
- Parser.Lex(); // Consume the EndOfStatement
- else if (isPrefix && getLexer().is(AsmToken::Slash))
- Parser.Lex(); // Consume the prefix separator Slash
+ // Consume the EndOfStatement or the prefix separator Slash
+ if (getLexer().is(AsmToken::EndOfStatement) ||
+ (isPrefix && getLexer().is(AsmToken::Slash)))
+ Parser.Lex();
if (ExtraImmOp && isParsingIntelSyntax())
Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc));
@@ -2101,103 +1990,101 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
delete &Op;
}
}
- // Transform "ins[bwl] %dx, %es:(%edi)" into "ins[bwl]"
- if (Name.startswith("ins") && Operands.size() == 3 &&
- (Name == "insb" || Name == "insw" || Name == "insl")) {
- X86Operand &Op = *(X86Operand*)Operands.begin()[1];
- X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
- if (Op.isReg() && Op.getReg() == X86::DX && isDstOp(Op2)) {
- Operands.pop_back();
- Operands.pop_back();
- delete &Op;
- delete &Op2;
- }
- }
- // Transform "outs[bwl] %ds:(%esi), %dx" into "out[bwl]"
- if (Name.startswith("outs") && Operands.size() == 3 &&
- (Name == "outsb" || Name == "outsw" || Name == "outsl")) {
- X86Operand &Op = *(X86Operand*)Operands.begin()[1];
- X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
- if (isSrcOp(Op) && Op2.isReg() && Op2.getReg() == X86::DX) {
- Operands.pop_back();
- Operands.pop_back();
- delete &Op;
- delete &Op2;
+ // Append default arguments to "ins[bwld]"
+ if (Name.startswith("ins") && Operands.size() == 1 &&
+ (Name == "insb" || Name == "insw" || Name == "insl" ||
+ Name == "insd" )) {
+ if (isParsingIntelSyntax()) {
+ Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
+ Operands.push_back(DefaultMemDIOperand(NameLoc));
+ } else {
+ Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
+ Operands.push_back(DefaultMemDIOperand(NameLoc));
}
}
- // Transform "movs[bwl] %ds:(%esi), %es:(%edi)" into "movs[bwl]"
- if (Name.startswith("movs") && Operands.size() == 3 &&
- (Name == "movsb" || Name == "movsw" || Name == "movsl" ||
- (is64BitMode() && Name == "movsq"))) {
- X86Operand &Op = *(X86Operand*)Operands.begin()[1];
- X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
- if (isSrcOp(Op) && isDstOp(Op2)) {
- Operands.pop_back();
- Operands.pop_back();
- delete &Op;
- delete &Op2;
+ // Append default arguments to "outs[bwld]"
+ if (Name.startswith("outs") && Operands.size() == 1 &&
+ (Name == "outsb" || Name == "outsw" || Name == "outsl" ||
+ Name == "outsd" )) {
+ if (isParsingIntelSyntax()) {
+ Operands.push_back(DefaultMemSIOperand(NameLoc));
+ Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
+ } else {
+ Operands.push_back(DefaultMemSIOperand(NameLoc));
+ Operands.push_back(X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
}
}
- // Transform "lods[bwl] %ds:(%esi),{%al,%ax,%eax,%rax}" into "lods[bwl]"
- if (Name.startswith("lods") && Operands.size() == 3 &&
+
+ // Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate
+ // values of $SIREG according to the mode. It would be nice if this
+ // could be achieved with InstAlias in the tables.
+ if (Name.startswith("lods") && Operands.size() == 1 &&
(Name == "lods" || Name == "lodsb" || Name == "lodsw" ||
- Name == "lodsl" || (is64BitMode() && Name == "lodsq"))) {
- X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
- X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]);
- if (isSrcOp(*Op1) && Op2->isReg()) {
- const char *ins;
- unsigned reg = Op2->getReg();
- bool isLods = Name == "lods";
- if (reg == X86::AL && (isLods || Name == "lodsb"))
- ins = "lodsb";
- else if (reg == X86::AX && (isLods || Name == "lodsw"))
- ins = "lodsw";
- else if (reg == X86::EAX && (isLods || Name == "lodsl"))
- ins = "lodsl";
- else if (reg == X86::RAX && (isLods || Name == "lodsq"))
- ins = "lodsq";
- else
- ins = NULL;
- if (ins != NULL) {
- Operands.pop_back();
- Operands.pop_back();
- delete Op1;
- delete Op2;
- if (Name != ins)
- static_cast<X86Operand*>(Operands[0])->setTokenValue(ins);
- }
- }
- }
- // Transform "stos[bwl] {%al,%ax,%eax,%rax},%es:(%edi)" into "stos[bwl]"
- if (Name.startswith("stos") && Operands.size() == 3 &&
+ Name == "lodsl" || Name == "lodsd" || Name == "lodsq"))
+ Operands.push_back(DefaultMemSIOperand(NameLoc));
+
+ // Transform "stos[bwlq]" into "stos[bwlq] ($DIREG)" for appropriate
+ // values of $DIREG according to the mode. It would be nice if this
+ // could be achieved with InstAlias in the tables.
+ if (Name.startswith("stos") && Operands.size() == 1 &&
(Name == "stos" || Name == "stosb" || Name == "stosw" ||
- Name == "stosl" || (is64BitMode() && Name == "stosq"))) {
- X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
- X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]);
- if (isDstOp(*Op2) && Op1->isReg()) {
- const char *ins;
- unsigned reg = Op1->getReg();
- bool isStos = Name == "stos";
- if (reg == X86::AL && (isStos || Name == "stosb"))
- ins = "stosb";
- else if (reg == X86::AX && (isStos || Name == "stosw"))
- ins = "stosw";
- else if (reg == X86::EAX && (isStos || Name == "stosl"))
- ins = "stosl";
- else if (reg == X86::RAX && (isStos || Name == "stosq"))
- ins = "stosq";
- else
- ins = NULL;
- if (ins != NULL) {
- Operands.pop_back();
- Operands.pop_back();
- delete Op1;
- delete Op2;
- if (Name != ins)
- static_cast<X86Operand*>(Operands[0])->setTokenValue(ins);
+ Name == "stosl" || Name == "stosd" || Name == "stosq"))
+ Operands.push_back(DefaultMemDIOperand(NameLoc));
+
+ // Transform "scas[bwlq]" into "scas[bwlq] ($DIREG)" for appropriate
+ // values of $DIREG according to the mode. It would be nice if this
+ // could be achieved with InstAlias in the tables.
+ if (Name.startswith("scas") && Operands.size() == 1 &&
+ (Name == "scas" || Name == "scasb" || Name == "scasw" ||
+ Name == "scasl" || Name == "scasd" || Name == "scasq"))
+ Operands.push_back(DefaultMemDIOperand(NameLoc));
+
+ // Add default SI and DI operands to "cmps[bwlq]".
+ if (Name.startswith("cmps") &&
+ (Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" ||
+ Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) {
+ if (Operands.size() == 1) {
+ if (isParsingIntelSyntax()) {
+ Operands.push_back(DefaultMemSIOperand(NameLoc));
+ Operands.push_back(DefaultMemDIOperand(NameLoc));
+ } else {
+ Operands.push_back(DefaultMemDIOperand(NameLoc));
+ Operands.push_back(DefaultMemSIOperand(NameLoc));
}
+ } else if (Operands.size() == 3) {
+ X86Operand &Op = *(X86Operand*)Operands.begin()[1];
+ X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
+ if (!doSrcDstMatch(Op, Op2))
+ return Error(Op.getStartLoc(),
+ "mismatching source and destination index registers");
+ }
+ }
+
+ // Add default SI and DI operands to "movs[bwlq]".
+ if ((Name.startswith("movs") &&
+ (Name == "movs" || Name == "movsb" || Name == "movsw" ||
+ Name == "movsl" || Name == "movsd" || Name == "movsq")) ||
+ (Name.startswith("smov") &&
+ (Name == "smov" || Name == "smovb" || Name == "smovw" ||
+ Name == "smovl" || Name == "smovd" || Name == "smovq"))) {
+ if (Operands.size() == 1) {
+ if (Name == "movsd")
+ Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
+ if (isParsingIntelSyntax()) {
+ Operands.push_back(DefaultMemDIOperand(NameLoc));
+ Operands.push_back(DefaultMemSIOperand(NameLoc));
+ } else {
+ Operands.push_back(DefaultMemSIOperand(NameLoc));
+ Operands.push_back(DefaultMemDIOperand(NameLoc));
+ }
+ } else if (Operands.size() == 3) {
+ X86Operand &Op = *(X86Operand*)Operands.begin()[1];
+ X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
+ if (!doSrcDstMatch(Op, Op2))
+ return Error(Op.getStartLoc(),
+ "mismatching source and destination index registers");
}
}
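Rather than stripping user-written implicit operands, the parser now appends default SI/DI (and DX for ins/outs) memory operands when a string instruction is written with no operands, so the matcher always sees the full form; the operand order depends on the dialect, and a bare "movsd" is re-tokenized as "movsl" so it is not confused with the SSE instruction. The sketch below summarizes which operands get appended, with "SI"/"DI"/"DX" standing in for the synthesized DefaultMemSIOperand/DefaultMemDIOperand/register operands; it is a summary of the hunk above, not the matcher itself:

#include <cassert>
#include <string>
#include <vector>

// Sketch: implicit operands appended, in order, per mnemonic family.
static std::vector<std::string> implicitOperands(const std::string &Mnemonic,
                                                 bool IntelSyntax) {
  if (Mnemonic.rfind("ins", 0) == 0)  return {"DX", "DI"};
  if (Mnemonic.rfind("outs", 0) == 0) return {"SI", "DX"};
  if (Mnemonic.rfind("lods", 0) == 0) return {"SI"};
  if (Mnemonic.rfind("stos", 0) == 0) return {"DI"};
  if (Mnemonic.rfind("scas", 0) == 0) return {"DI"};
  if (Mnemonic.rfind("cmps", 0) == 0)
    return IntelSyntax ? std::vector<std::string>{"SI", "DI"}
                       : std::vector<std::string>{"DI", "SI"};
  if (Mnemonic.rfind("movs", 0) == 0)
    return IntelSyntax ? std::vector<std::string>{"DI", "SI"}
                       : std::vector<std::string>{"SI", "DI"};
  return {};
}

int main() {
  assert((implicitOperands("movsb", /*IntelSyntax=*/false) ==
          std::vector<std::string>{"SI", "DI"}));
  assert((implicitOperands("cmpsw", /*IntelSyntax=*/true) ==
          std::vector<std::string>{"SI", "DI"}));
}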
@@ -2362,6 +2249,14 @@ processInstruction(MCInst &Inst,
}
static const char *getSubtargetFeatureName(unsigned Val);
+
+void X86AsmParser::EmitInstruction(
+ MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ MCStreamer &Out) {
+ Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), Out);
+ Out.EmitInstruction(Inst, STI);
+}
+
bool X86AsmParser::
MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
@@ -2384,7 +2279,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
Inst.setOpcode(X86::WAIT);
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
- Out.EmitInstruction(Inst);
+ EmitInstruction(Inst, Operands, Out);
const char *Repl =
StringSwitch<const char*>(Op->getToken())
@@ -2420,7 +2315,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
- Out.EmitInstruction(Inst);
+ EmitInstruction(Inst, Operands, Out);
Opcode = Inst.getOpcode();
return false;
case Match_MissingFeature: {
@@ -2507,7 +2402,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (NumSuccessfulMatches == 1) {
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
- Out.EmitInstruction(Inst);
+ EmitInstruction(Inst, Operands, Out);
Opcode = Inst.getOpcode();
return false;
}
@@ -2614,11 +2509,9 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
} else if (IDVal.startswith(".intel_syntax")) {
getParser().setAssemblerDialect(1);
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- if(Parser.getTok().getString() == "noprefix") {
- // FIXME : Handle noprefix
+ // FIXME: Handle noprefix
+ if (Parser.getTok().getString() == "noprefix")
Parser.Lex();
- } else
- return true;
}
return false;
}
@@ -2632,7 +2525,7 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
for (;;) {
const MCExpr *Value;
if (getParser().parseExpression(Value))
- return true;
+ return false;
getParser().getStreamer().EmitValue(Value, Size);
@@ -2640,8 +2533,10 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
break;
// FIXME: Improve diagnostic.
- if (getLexer().isNot(AsmToken::Comma))
- return Error(L, "unexpected token in directive");
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(L, "unexpected token in directive");
+ return false;
+ }
Parser.Lex();
}
}
@@ -2651,22 +2546,29 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
}
/// ParseDirectiveCode
-/// ::= .code32 | .code64
+/// ::= .code16 | .code32 | .code64
bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
- if (IDVal == ".code32") {
+ if (IDVal == ".code16") {
Parser.Lex();
- if (is64BitMode()) {
- SwitchMode();
+ if (!is16BitMode()) {
+ SwitchMode(X86::Mode16Bit);
+ getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+ }
+ } else if (IDVal == ".code32") {
+ Parser.Lex();
+ if (!is32BitMode()) {
+ SwitchMode(X86::Mode32Bit);
getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
}
} else if (IDVal == ".code64") {
Parser.Lex();
if (!is64BitMode()) {
- SwitchMode();
+ SwitchMode(X86::Mode64Bit);
getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64);
}
} else {
- return Error(L, "unexpected directive " + IDVal);
+ Error(L, "unknown directive " + IDVal);
+ return false;
}
return false;
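SwitchMode now takes the requested mode explicitly, since three modes are possible and they are mutually exclusive subtarget feature bits. The snippet below only illustrates that invariant; the bit values are made up for the example, and the real X86::Mode16Bit/Mode32Bit/Mode64Bit constants come from the generated subtarget tables rather than this sketch:

#include <cassert>
#include <cstdint>

enum : uint64_t { Mode16Bit = 1u << 0, Mode32Bit = 1u << 1, Mode64Bit = 1u << 2 };

// Sketch: switching modes clears the other mode bits and sets the new one.
static uint64_t switchMode(uint64_t FeatureBits, uint64_t NewMode) {
  FeatureBits &= ~(uint64_t)(Mode16Bit | Mode32Bit | Mode64Bit);
  return FeatureBits | NewMode;
}

int main() {
  uint64_t FB = Mode32Bit;
  FB = switchMode(FB, Mode16Bit);                 // effect of ".code16"
  assert(FB == Mode16Bit);
  assert(switchMode(FB, Mode64Bit) == Mode64Bit); // effect of ".code64"
}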
diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
new file mode 100644
index 0000000..ef1565f
--- /dev/null
+++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -0,0 +1,43 @@
+//===-- X86AsmParserCommon.h - Common functions for X86AsmParser ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ASM_PARSER_COMMON_H
+#define X86_ASM_PARSER_COMMON_H
+
+namespace llvm {
+
+inline bool isImmSExti16i8Value(uint64_t Value) {
+ return (( Value <= 0x000000000000007FULL)||
+ (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)||
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+inline bool isImmSExti32i8Value(uint64_t Value) {
+ return (( Value <= 0x000000000000007FULL)||
+ (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)||
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+inline bool isImmZExtu32u8Value(uint64_t Value) {
+ return (Value <= 0x00000000000000FFULL);
+}
+
+inline bool isImmSExti64i8Value(uint64_t Value) {
+ return (( Value <= 0x000000000000007FULL)||
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+inline bool isImmSExti64i32Value(uint64_t Value) {
+ return (( Value <= 0x000000007FFFFFFFULL)||
+ (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+} // End of namespace llvm
+
+#endif // X86_ASM_PARSER_COMMON_H
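Each predicate treats the immediate as a raw 64-bit pattern and accepts exactly the values that could have come from sign- or zero-extending an operand of the narrower width, whether the value was written as an 8/16/32-bit pattern or already as a full 64-bit one. Worked examples for the 16-bit/8-bit case, reproducing the same constants locally so the snippet compiles on its own:

#include <cassert>
#include <cstdint>

// Local copy of the range test in isImmSExti16i8Value(), for illustration.
static bool immSExti16i8(uint64_t V) {
  return V <= 0x7FULL ||
         (0xFF80ULL <= V && V <= 0xFFFFULL) ||
         0xFFFFFFFFFFFFFF80ULL <= V;
}

int main() {
  assert(immSExti16i8(0x7F));                  // +127 fits in a signed imm8
  assert(!immSExti16i8(0x80));                 // +128 does not
  assert(immSExti16i8(0xFF80));                // -128 written as a 16-bit pattern
  assert(immSExti16i8(0xFFFFFFFFFFFFFF80ULL)); // -128 written as a 64-bit pattern
  assert(!immSExti16i8(0xFF7F));               // -129 as 16 bits: out of range
}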
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
new file mode 100644
index 0000000..45fe2a9
--- /dev/null
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -0,0 +1,488 @@
+//===-- X86Operand.h - Parsed X86 machine instruction --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_OPERAND_H
+#define X86_OPERAND_H
+
+#include "X86AsmParserCommon.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+
+namespace llvm {
+
+/// X86Operand - Instances of this class represent a parsed X86 machine
+/// instruction.
+struct X86Operand : public MCParsedAsmOperand {
+ enum KindTy {
+ Token,
+ Register,
+ Immediate,
+ Memory
+ } Kind;
+
+ SMLoc StartLoc, EndLoc;
+ SMLoc OffsetOfLoc;
+ StringRef SymName;
+ void *OpDecl;
+ bool AddressOf;
+
+ struct TokOp {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct RegOp {
+ unsigned RegNo;
+ };
+
+ struct ImmOp {
+ const MCExpr *Val;
+ };
+
+ struct MemOp {
+ unsigned SegReg;
+ const MCExpr *Disp;
+ unsigned BaseReg;
+ unsigned IndexReg;
+ unsigned Scale;
+ unsigned Size;
+ };
+
+ union {
+ struct TokOp Tok;
+ struct RegOp Reg;
+ struct ImmOp Imm;
+ struct MemOp Mem;
+ };
+
+ X86Operand(KindTy K, SMLoc Start, SMLoc End)
+ : Kind(K), StartLoc(Start), EndLoc(End) {}
+
+ StringRef getSymName() override { return SymName; }
+ void *getOpDecl() override { return OpDecl; }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const override { return StartLoc; }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const override { return EndLoc; }
+ /// getLocRange - Get the range between the first and last token of this
+ /// operand.
+ SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+ /// getOffsetOfLoc - Get the location of the offset operator.
+ SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; }
+
+ void print(raw_ostream &OS) const override {}
+
+ StringRef getToken() const {
+ assert(Kind == Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+ void setTokenValue(StringRef Value) {
+ assert(Kind == Token && "Invalid access!");
+ Tok.Data = Value.data();
+ Tok.Length = Value.size();
+ }
+
+ unsigned getReg() const override {
+ assert(Kind == Register && "Invalid access!");
+ return Reg.RegNo;
+ }
+
+ const MCExpr *getImm() const {
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+ const MCExpr *getMemDisp() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Disp;
+ }
+ unsigned getMemSegReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.SegReg;
+ }
+ unsigned getMemBaseReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.BaseReg;
+ }
+ unsigned getMemIndexReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.IndexReg;
+ }
+ unsigned getMemScale() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Scale;
+ }
+
+ bool isToken() const override {return Kind == Token; }
+
+ bool isImm() const override { return Kind == Immediate; }
+
+ bool isImmSExti16i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti16i8Value(CE->getValue());
+ }
+ bool isImmSExti32i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti32i8Value(CE->getValue());
+ }
+ bool isImmZExtu32u8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmZExtu32u8Value(CE->getValue());
+ }
+ bool isImmSExti64i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti64i8Value(CE->getValue());
+ }
+ bool isImmSExti64i32() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ return isImmSExti64i32Value(CE->getValue());
+ }
+
+ bool isOffsetOf() const override {
+ return OffsetOfLoc.getPointer();
+ }
+
+ bool needAddressOf() const override {
+ return AddressOf;
+ }
+
+ bool isMem() const override { return Kind == Memory; }
+ bool isMem8() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMem16() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMem32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMem64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64);
+ }
+ bool isMem80() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 80);
+ }
+ bool isMem128() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 128);
+ }
+ bool isMem256() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 256);
+ }
+ bool isMem512() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 512);
+ }
+
+ bool isMemVX32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+ }
+ bool isMemVY32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+ }
+ bool isMemVX64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+ }
+ bool isMemVY64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+ }
+ bool isMemVZ32() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+ getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+ }
+ bool isMemVZ64() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+ getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+ }
+
+ bool isAbsMem() const {
+ return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1;
+ }
+
+ bool isSrcIdx() const {
+ return !getMemIndexReg() && getMemScale() == 1 &&
+ (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI ||
+ getMemBaseReg() == X86::SI) && isa<MCConstantExpr>(getMemDisp()) &&
+ cast<MCConstantExpr>(getMemDisp())->getValue() == 0;
+ }
+ bool isSrcIdx8() const {
+ return isMem8() && isSrcIdx();
+ }
+ bool isSrcIdx16() const {
+ return isMem16() && isSrcIdx();
+ }
+ bool isSrcIdx32() const {
+ return isMem32() && isSrcIdx();
+ }
+ bool isSrcIdx64() const {
+ return isMem64() && isSrcIdx();
+ }
+
+ bool isDstIdx() const {
+ return !getMemIndexReg() && getMemScale() == 1 &&
+ (getMemSegReg() == 0 || getMemSegReg() == X86::ES) &&
+ (getMemBaseReg() == X86::RDI || getMemBaseReg() == X86::EDI ||
+ getMemBaseReg() == X86::DI) && isa<MCConstantExpr>(getMemDisp()) &&
+ cast<MCConstantExpr>(getMemDisp())->getValue() == 0;
+ }
+ bool isDstIdx8() const {
+ return isMem8() && isDstIdx();
+ }
+ bool isDstIdx16() const {
+ return isMem16() && isDstIdx();
+ }
+ bool isDstIdx32() const {
+ return isMem32() && isDstIdx();
+ }
+ bool isDstIdx64() const {
+ return isMem64() && isDstIdx();
+ }
+
+ bool isMemOffs8() const {
+ return Kind == Memory && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMemOffs16() const {
+ return Kind == Memory && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMemOffs32() const {
+ return Kind == Memory && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMemOffs64() const {
+ return Kind == Memory && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 64);
+ }
+
+ bool isReg() const override { return Kind == Register; }
+
+ bool isGR32orGR64() const {
+ return Kind == Register &&
+ (X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
+ }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(Expr));
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+
+ static unsigned getGR32FromGR64(unsigned RegNo) {
+ switch (RegNo) {
+ default: llvm_unreachable("Unexpected register");
+ case X86::RAX: return X86::EAX;
+ case X86::RCX: return X86::ECX;
+ case X86::RDX: return X86::EDX;
+ case X86::RBX: return X86::EBX;
+ case X86::RBP: return X86::EBP;
+ case X86::RSP: return X86::ESP;
+ case X86::RSI: return X86::ESI;
+ case X86::RDI: return X86::EDI;
+ case X86::R8: return X86::R8D;
+ case X86::R9: return X86::R9D;
+ case X86::R10: return X86::R10D;
+ case X86::R11: return X86::R11D;
+ case X86::R12: return X86::R12D;
+ case X86::R13: return X86::R13D;
+ case X86::R14: return X86::R14D;
+ case X86::R15: return X86::R15D;
+ case X86::RIP: return X86::EIP;
+ }
+ }
+
+ void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ unsigned RegNo = getReg();
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+ RegNo = getGR32FromGR64(RegNo);
+ Inst.addOperand(MCOperand::CreateReg(RegNo));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addMemOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 5) && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+ Inst.addOperand(MCOperand::CreateImm(getMemScale()));
+ Inst.addOperand(MCOperand::CreateReg(getMemIndexReg()));
+ addExpr(Inst, getMemDisp());
+ Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
+ }
+
+ void addAbsMemOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 1) && "Invalid number of operands!");
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+ }
+
+ void addSrcIdxOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 2) && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+ Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
+ }
+ void addDstIdxOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 1) && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+ }
+
+ void addMemOffsOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 2) && "Invalid number of operands!");
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+ Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
+ }
+
+ static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
+ SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
+ X86Operand *Res = new X86Operand(Token, Loc, EndLoc);
+ Res->Tok.Data = Str.data();
+ Res->Tok.Length = Str.size();
+ return Res;
+ }
+
+ static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+ bool AddressOf = false,
+ SMLoc OffsetOfLoc = SMLoc(),
+ StringRef SymName = StringRef(),
+ void *OpDecl = 0) {
+ X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
+ Res->Reg.RegNo = RegNo;
+ Res->AddressOf = AddressOf;
+ Res->OffsetOfLoc = OffsetOfLoc;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ return Res;
+ }
+
+ static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){
+ X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc);
+ Res->Imm.Val = Val;
+ return Res;
+ }
+
+ /// Create an absolute memory operand.
+ static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
+ unsigned Size = 0, StringRef SymName = StringRef(),
+ void *OpDecl = 0) {
+ X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+ Res->Mem.SegReg = 0;
+ Res->Mem.Disp = Disp;
+ Res->Mem.BaseReg = 0;
+ Res->Mem.IndexReg = 0;
+ Res->Mem.Scale = 1;
+ Res->Mem.Size = Size;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ Res->AddressOf = false;
+ return Res;
+ }
+
+ /// Create a generalized memory operand.
+ static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
+ unsigned Size = 0,
+ StringRef SymName = StringRef(),
+ void *OpDecl = 0) {
+    // We should never just have a displacement; that should be parsed as an
+ // absolute memory operand.
+ assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
+
+ // The scale should always be one of {1,2,4,8}.
+ assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
+ "Invalid scale!");
+ X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+ Res->Mem.SegReg = SegReg;
+ Res->Mem.Disp = Disp;
+ Res->Mem.BaseReg = BaseReg;
+ Res->Mem.IndexReg = IndexReg;
+ Res->Mem.Scale = Scale;
+ Res->Mem.Size = Size;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ Res->AddressOf = false;
+ return Res;
+ }
+};
+
+} // End of namespace llvm
+
+#endif // X86_OPERAND_H
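Memory operands are ultimately emitted as five MCOperands in the fixed order base, scale, index, displacement, segment (see addMemOperands above); for example 16(%esi,%edi,4), i.e. [esi + 4*edi + 16] in Intel syntax, becomes base=ESI, scale=4, index=EDI, disp=16, segment=0. A minimal stand-in for that layout, written without the MC classes and only meant to show the ordering:

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

struct MemOp {
  std::string Base;
  unsigned Scale;
  std::string Index;
  int64_t Disp;
  std::string Seg;
};

// Sketch of the 5-operand order used by addMemOperands().
static std::vector<std::string> flatten(const MemOp &M) {
  return {M.Base, std::to_string(M.Scale), M.Index,
          std::to_string(M.Disp), M.Seg.empty() ? "noseg" : M.Seg};
}

int main() {
  // AT&T: 16(%esi,%edi,4)  /  Intel: [esi + 4*edi + 16]
  MemOp M{"esi", 4, "edi", 16, ""};
  std::vector<std::string> Ops = flatten(M);
  assert(Ops.size() == 5 && Ops[0] == "esi" && Ops[3] == "16");
}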
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 7e20151..206b651 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -37,11 +37,6 @@ set(sources
)
if( CMAKE_CL_64 )
- # A workaround for a bug in cmake 2.8.3. See PR 8885.
- if( CMAKE_VERSION STREQUAL "2.8.3" )
- include(CMakeDetermineCompilerId)
- endif()
- # end of workaround.
enable_language(ASM_MASM)
ADD_CUSTOM_COMMAND(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj
@@ -53,8 +48,6 @@ endif()
add_llvm_target(X86CodeGen ${sources})
-add_dependencies(LLVMX86CodeGen X86CommonTableGen intrinsics_gen)
-
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
diff --git a/lib/Target/X86/Disassembler/Android.mk b/lib/Target/X86/Disassembler/Android.mk
index 67400ad..3984266 100644
--- a/lib/Target/X86/Disassembler/Android.mk
+++ b/lib/Target/X86/Disassembler/Android.mk
@@ -3,7 +3,8 @@ LOCAL_PATH := $(call my-dir)
x86_disassembler_TBLGEN_TABLES := \
X86GenDisassemblerTables.inc \
X86GenInstrInfo.inc \
- X86GenRegisterInfo.inc
+ X86GenRegisterInfo.inc \
+ X86GenSubtargetInfo.inc
x86_disassembler_SRC_FILES := \
X86Disassembler.cpp \
@@ -11,6 +12,7 @@ x86_disassembler_SRC_FILES := \
# For the device
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -29,6 +31,7 @@ LOCAL_MODULE_TAGS := optional
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
# For the host
# =====================================================
diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt b/lib/Target/X86/Disassembler/CMakeLists.txt
index 0cd6db9..deed115 100644
--- a/lib/Target/X86/Disassembler/CMakeLists.txt
+++ b/lib/Target/X86/Disassembler/CMakeLists.txt
@@ -1,16 +1,4 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMX86Disassembler
X86Disassembler.cpp
X86DisassemblerDecoder.c
)
-
-# workaround for hanging compilation on MSVC9 and 10
-if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
-set_property(
- SOURCE X86Disassembler.cpp
- PROPERTY COMPILE_FLAGS "/Od"
- )
-endif()
-
-add_dependencies(LLVMX86Disassembler X86CommonTableGen)
diff --git a/lib/Target/X86/Disassembler/LLVMBuild.txt b/lib/Target/X86/Disassembler/LLVMBuild.txt
index 0609f3c..cac7adf 100644
--- a/lib/Target/X86/Disassembler/LLVMBuild.txt
+++ b/lib/Target/X86/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = X86Disassembler
parent = X86
-required_libraries = MC Support X86Desc X86Info
+required_libraries = MC Support X86Info
add_to_library_groups = X86
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 903e36c..d5759cd 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -31,6 +31,8 @@
#include "X86GenRegisterInfo.inc"
#define GET_INSTRINFO_ENUM
#include "X86GenInstrInfo.inc"
+#define GET_SUBTARGETINFO_ENUM
+#include "X86GenSubtargetInfo.inc"
using namespace llvm;
using namespace llvm::X86Disassembler;
@@ -73,9 +75,23 @@ static bool translateInstruction(MCInst &target,
const MCDisassembler *Dis);
X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI,
- DisassemblerMode mode,
const MCInstrInfo *MII)
- : MCDisassembler(STI), MII(MII), fMode(mode) {}
+ : MCDisassembler(STI), MII(MII) {
+ switch (STI.getFeatureBits() &
+ (X86::Mode16Bit | X86::Mode32Bit | X86::Mode64Bit)) {
+ case X86::Mode16Bit:
+ fMode = MODE_16BIT;
+ break;
+ case X86::Mode32Bit:
+ fMode = MODE_32BIT;
+ break;
+ case X86::Mode64Bit:
+ fMode = MODE_64BIT;
+ break;
+ default:
+ llvm_unreachable("Invalid CPU mode");
+ }
+}
X86GenericDisassembler::~X86GenericDisassembler() {
delete MII;
@@ -207,6 +223,61 @@ static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
Dis->tryAddingPcLoadReferenceComment(Value, Address);
}
+static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = {
+ 0, // SEG_OVERRIDE_NONE
+ X86::CS,
+ X86::SS,
+ X86::DS,
+ X86::ES,
+ X86::FS,
+ X86::GS
+};
+
+/// translateSrcIndex - Appends a source index operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The internal instruction.
+static bool translateSrcIndex(MCInst &mcInst, InternalInstruction &insn) {
+ unsigned baseRegNo;
+
+ if (insn.mode == MODE_64BIT)
+ baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::RSI;
+ else if (insn.mode == MODE_32BIT)
+ baseRegNo = insn.prefixPresent[0x67] ? X86::SI : X86::ESI;
+ else {
+ assert(insn.mode == MODE_16BIT);
+ baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::SI;
+ }
+ MCOperand baseReg = MCOperand::CreateReg(baseRegNo);
+ mcInst.addOperand(baseReg);
+
+ MCOperand segmentReg;
+ segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]);
+ mcInst.addOperand(segmentReg);
+ return false;
+}
+
+/// translateDstIndex - Appends a destination index operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The internal instruction.
+
+static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) {
+ unsigned baseRegNo;
+
+ if (insn.mode == MODE_64BIT)
+ baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::RDI;
+ else if (insn.mode == MODE_32BIT)
+ baseRegNo = insn.prefixPresent[0x67] ? X86::DI : X86::EDI;
+ else {
+ assert(insn.mode == MODE_16BIT);
+ baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::DI;
+ }
+ MCOperand baseReg = MCOperand::CreateReg(baseRegNo);
+ mcInst.addOperand(baseReg);
+ return false;
+}
+
/// translateImmediate - Appends an immediate operand to an MCInst.
///
/// @param mcInst - The MCInst to append to.
@@ -315,6 +386,13 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
insn.immediateOffset, insn.immediateSize,
mcInst, Dis))
mcInst.addOperand(MCOperand::CreateImm(immediate));
+
+ if (type == TYPE_MOFFS8 || type == TYPE_MOFFS16 ||
+ type == TYPE_MOFFS32 || type == TYPE_MOFFS64) {
+ MCOperand segmentReg;
+ segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]);
+ mcInst.addOperand(segmentReg);
+ }
}
/// translateRMRegister - Translates a register stored in the R/M field of the
@@ -418,13 +496,22 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm ||
Opcode == X86::VGATHERDPSYrm ||
Opcode == X86::VGATHERQPSYrm ||
+ Opcode == X86::VGATHERDPDZrm ||
+ Opcode == X86::VPGATHERDQZrm ||
Opcode == X86::VPGATHERQQYrm ||
Opcode == X86::VPGATHERDDYrm ||
Opcode == X86::VPGATHERQDYrm);
- if (IndexIs128 || IndexIs256) {
+ bool IndexIs512 = (Opcode == X86::VGATHERQPDZrm ||
+ Opcode == X86::VGATHERDPSZrm ||
+ Opcode == X86::VGATHERQPSZrm ||
+ Opcode == X86::VPGATHERQQZrm ||
+ Opcode == X86::VPGATHERDDZrm ||
+ Opcode == X86::VPGATHERQDZrm);
+ if (IndexIs128 || IndexIs256 || IndexIs512) {
unsigned IndexOffset = insn.sibIndex -
(insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
- SIBIndex IndexBase = IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
+ SIBIndex IndexBase = IndexIs512 ? SIB_INDEX_ZMM0 :
+ IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
insn.sibIndex = (SIBIndex)(IndexBase +
(insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset));
}
@@ -513,17 +600,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
}
displacement = MCOperand::CreateImm(insn.displacement);
-
- static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = {
- 0, // SEG_OVERRIDE_NONE
- X86::CS,
- X86::SS,
- X86::DS,
- X86::ES,
- X86::FS,
- X86::GS
- };
-
+
segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]);
mcInst.addOperand(baseReg);
@@ -565,6 +642,9 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_XMM128:
case TYPE_XMM256:
case TYPE_XMM512:
+ case TYPE_VK1:
+ case TYPE_VK8:
+ case TYPE_VK16:
case TYPE_DEBUGREG:
case TYPE_CONTROLREG:
return translateRMRegister(mcInst, insn);
@@ -596,16 +676,25 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
///
/// @param mcInst - The MCInst to append to.
/// @param stackPos - The stack position to translate.
-/// @return - 0 on success; nonzero otherwise.
-static bool translateFPRegister(MCInst &mcInst,
- uint8_t stackPos) {
- if (stackPos >= 8) {
- debug("Invalid FP stack position");
+static void translateFPRegister(MCInst &mcInst,
+ uint8_t stackPos) {
+ mcInst.addOperand(MCOperand::CreateReg(X86::ST0 + stackPos));
+}
+
+/// translateMaskRegister - Translates a 3-bit mask register number to
+/// LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param maskRegNum - Number of mask register from 0 to 7.
+/// @return - false on success; true otherwise.
+static bool translateMaskRegister(MCInst &mcInst,
+ uint8_t maskRegNum) {
+ if (maskRegNum >= 8) {
+ debug("Invalid mask register number");
return true;
}
-
- mcInst.addOperand(MCOperand::CreateReg(X86::ST0 + stackPos));
+ mcInst.addOperand(MCOperand::CreateReg(X86::K0 + maskRegNum));
return false;
}
@@ -626,6 +715,8 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
case ENCODING_REG:
translateRegister(mcInst, insn.reg);
return false;
+ case ENCODING_WRITEMASK:
+ return translateMaskRegister(mcInst, insn.writemask);
case ENCODING_RM:
return translateRM(mcInst, operand, insn, Dis);
case ENCODING_CB:
@@ -648,17 +739,20 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
insn,
Dis);
return false;
+ case ENCODING_SI:
+ return translateSrcIndex(mcInst, insn);
+ case ENCODING_DI:
+ return translateDstIndex(mcInst, insn);
case ENCODING_RB:
case ENCODING_RW:
case ENCODING_RD:
case ENCODING_RO:
- translateRegister(mcInst, insn.opcodeRegister);
- return false;
- case ENCODING_I:
- return translateFPRegister(mcInst, insn.opcodeModifier);
case ENCODING_Rv:
translateRegister(mcInst, insn.opcodeRegister);
return false;
+ case ENCODING_FP:
+ translateFPRegister(mcInst, insn.modRM & 7);
+ return false;
case ENCODING_VVVV:
translateRegister(mcInst, insn.vvvv);
return false;
@@ -708,22 +802,16 @@ static bool translateInstruction(MCInst &mcInst,
return false;
}
-static MCDisassembler *createX86_32Disassembler(const Target &T,
- const MCSubtargetInfo &STI) {
- return new X86Disassembler::X86GenericDisassembler(STI, MODE_32BIT,
- T.createMCInstrInfo());
-}
-
-static MCDisassembler *createX86_64Disassembler(const Target &T,
- const MCSubtargetInfo &STI) {
- return new X86Disassembler::X86GenericDisassembler(STI, MODE_64BIT,
+static MCDisassembler *createX86Disassembler(const Target &T,
+ const MCSubtargetInfo &STI) {
+ return new X86Disassembler::X86GenericDisassembler(STI,
T.createMCInstrInfo());
}
extern "C" void LLVMInitializeX86Disassembler() {
// Register the disassembler.
TargetRegistry::RegisterMCDisassembler(TheX86_32Target,
- createX86_32Disassembler);
+ createX86Disassembler);
TargetRegistry::RegisterMCDisassembler(TheX86_64Target,
- createX86_64Disassembler);
+ createX86Disassembler);
}
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
index b92427a..4e6e297 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.h
+++ b/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -105,20 +105,16 @@ class X86GenericDisassembler : public MCDisassembler {
public:
/// Constructor - Initializes the disassembler.
///
- /// @param mode - The X86 architecture mode to decode for.
- X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode,
- const MCInstrInfo *MII);
+ X86GenericDisassembler(const MCSubtargetInfo &STI, const MCInstrInfo *MII);
private:
~X86GenericDisassembler();
public:
/// getInstruction - See MCDisassembler.
- DecodeStatus getInstruction(MCInst &instr,
- uint64_t &size,
- const MemoryObject &region,
- uint64_t address,
+ DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+ const MemoryObject &region, uint64_t address,
raw_ostream &vStream,
- raw_ostream &cStream) const;
+ raw_ostream &cStream) const override;
private:
DisassemblerMode fMode;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index c81a857..0801c96 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -40,7 +40,7 @@
* @return - The InstructionContext to use when looking up an
* an instruction with these attributes.
*/
-static InstructionContext contextForAttrs(uint8_t attrMask) {
+static InstructionContext contextForAttrs(uint16_t attrMask) {
return CONTEXTS_SYM[attrMask];
}
@@ -57,7 +57,7 @@ static InstructionContext contextForAttrs(uint8_t attrMask) {
*/
static int modRMRequired(OpcodeType type,
InstructionContext insnContext,
- uint8_t opcode) {
+ uint16_t opcode) {
const struct ContextDecision* decision = 0;
switch (type) {
@@ -73,12 +73,6 @@ static int modRMRequired(OpcodeType type,
case THREEBYTE_3A:
decision = &THREEBYTE3A_SYM;
break;
- case THREEBYTE_A6:
- decision = &THREEBYTEA6_SYM;
- break;
- case THREEBYTE_A7:
- decision = &THREEBYTEA7_SYM;
- break;
case XOP8_MAP:
decision = &XOP8_MAP_SYM;
break;
@@ -123,12 +117,6 @@ static InstrUID decode(OpcodeType type,
case THREEBYTE_3A:
dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
break;
- case THREEBYTE_A6:
- dec = &THREEBYTEA6_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
- break;
- case THREEBYTE_A7:
- dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
- break;
case XOP8_MAP:
dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
break;
@@ -444,9 +432,58 @@ static int readPrefixes(struct InternalInstruction* insn) {
dbgprintf(insn, "Found prefix 0x%hhx", byte);
}
- insn->vexXopType = TYPE_NO_VEX_XOP;
+ insn->vectorExtensionType = TYPE_NO_VEX_XOP;
+
+ if (byte == 0x62) {
+ uint8_t byte1, byte2;
+
+ if (consumeByte(insn, &byte1)) {
+ dbgprintf(insn, "Couldn't read second byte of EVEX prefix");
+ return -1;
+ }
+
+ if (lookAtByte(insn, &byte2)) {
+ dbgprintf(insn, "Couldn't read third byte of EVEX prefix");
+ return -1;
+ }
+
+ if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) &&
+ ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) {
+ insn->vectorExtensionType = TYPE_EVEX;
+ }
+ else {
+ unconsumeByte(insn); /* unconsume byte1 */
+ unconsumeByte(insn); /* unconsume byte */
+ insn->necessaryPrefixLocation = insn->readerCursor - 2;
+ }
- if (byte == 0xc4) {
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ insn->vectorExtensionPrefix[0] = byte;
+ insn->vectorExtensionPrefix[1] = byte1;
+ if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) {
+ dbgprintf(insn, "Couldn't read third byte of EVEX prefix");
+ return -1;
+ }
+ if (consumeByte(insn, &insn->vectorExtensionPrefix[3])) {
+ dbgprintf(insn, "Couldn't read fourth byte of EVEX prefix");
+ return -1;
+ }
+
+ /* We simulate the REX prefix for simplicity's sake */
+ if (insn->mode == MODE_64BIT) {
+ insn->rexPrefix = 0x40
+ | (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3)
+ | (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2)
+ | (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1)
+ | (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0);
+ }
+
+ dbgprintf(insn, "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+ insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]);
+ }
+ }
+ else if (byte == 0xc4) {
uint8_t byte1;
if (lookAtByte(insn, &byte1)) {
@@ -455,7 +492,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
}
if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
- insn->vexXopType = TYPE_VEX_3B;
+ insn->vectorExtensionType = TYPE_VEX_3B;
insn->necessaryPrefixLocation = insn->readerCursor - 1;
}
else {
@@ -463,33 +500,24 @@ static int readPrefixes(struct InternalInstruction* insn) {
insn->necessaryPrefixLocation = insn->readerCursor - 1;
}
- if (insn->vexXopType == TYPE_VEX_3B) {
- insn->vexXopPrefix[0] = byte;
- consumeByte(insn, &insn->vexXopPrefix[1]);
- consumeByte(insn, &insn->vexXopPrefix[2]);
+ if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ insn->vectorExtensionPrefix[0] = byte;
+ consumeByte(insn, &insn->vectorExtensionPrefix[1]);
+ consumeByte(insn, &insn->vectorExtensionPrefix[2]);
/* We simulate the REX prefix for simplicity's sake */
if (insn->mode == MODE_64BIT) {
insn->rexPrefix = 0x40
- | (wFromVEX3of3(insn->vexXopPrefix[2]) << 3)
- | (rFromVEX2of3(insn->vexXopPrefix[1]) << 2)
- | (xFromVEX2of3(insn->vexXopPrefix[1]) << 1)
- | (bFromVEX2of3(insn->vexXopPrefix[1]) << 0);
- }
-
- switch (ppFromVEX3of3(insn->vexXopPrefix[2]))
- {
- default:
- break;
- case VEX_PREFIX_66:
- hasOpSize = TRUE;
- break;
+ | (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3)
+ | (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2)
+ | (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1)
+ | (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0);
}
dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
- insn->vexXopPrefix[0], insn->vexXopPrefix[1],
- insn->vexXopPrefix[2]);
+ insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+ insn->vectorExtensionPrefix[2]);
}
}
else if (byte == 0xc5) {
@@ -501,22 +529,22 @@ static int readPrefixes(struct InternalInstruction* insn) {
}
if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
- insn->vexXopType = TYPE_VEX_2B;
+ insn->vectorExtensionType = TYPE_VEX_2B;
}
else {
unconsumeByte(insn);
}
- if (insn->vexXopType == TYPE_VEX_2B) {
- insn->vexXopPrefix[0] = byte;
- consumeByte(insn, &insn->vexXopPrefix[1]);
+ if (insn->vectorExtensionType == TYPE_VEX_2B) {
+ insn->vectorExtensionPrefix[0] = byte;
+ consumeByte(insn, &insn->vectorExtensionPrefix[1]);
if (insn->mode == MODE_64BIT) {
insn->rexPrefix = 0x40
- | (rFromVEX2of2(insn->vexXopPrefix[1]) << 2);
+ | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2);
}
- switch (ppFromVEX2of2(insn->vexXopPrefix[1]))
+ switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1]))
{
default:
break;
@@ -525,7 +553,9 @@ static int readPrefixes(struct InternalInstruction* insn) {
break;
}
- dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexXopPrefix[0], insn->vexXopPrefix[1]);
+ dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0],
+ insn->vectorExtensionPrefix[1]);
}
}
else if (byte == 0x8f) {
@@ -537,7 +567,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
}
if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */
- insn->vexXopType = TYPE_XOP;
+ insn->vectorExtensionType = TYPE_XOP;
insn->necessaryPrefixLocation = insn->readerCursor - 1;
}
else {
@@ -545,22 +575,22 @@ static int readPrefixes(struct InternalInstruction* insn) {
insn->necessaryPrefixLocation = insn->readerCursor - 1;
}
- if (insn->vexXopType == TYPE_XOP) {
- insn->vexXopPrefix[0] = byte;
- consumeByte(insn, &insn->vexXopPrefix[1]);
- consumeByte(insn, &insn->vexXopPrefix[2]);
+ if (insn->vectorExtensionType == TYPE_XOP) {
+ insn->vectorExtensionPrefix[0] = byte;
+ consumeByte(insn, &insn->vectorExtensionPrefix[1]);
+ consumeByte(insn, &insn->vectorExtensionPrefix[2]);
/* We simulate the REX prefix for simplicity's sake */
if (insn->mode == MODE_64BIT) {
insn->rexPrefix = 0x40
- | (wFromXOP3of3(insn->vexXopPrefix[2]) << 3)
- | (rFromXOP2of3(insn->vexXopPrefix[1]) << 2)
- | (xFromXOP2of3(insn->vexXopPrefix[1]) << 1)
- | (bFromXOP2of3(insn->vexXopPrefix[1]) << 0);
+ | (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3)
+ | (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2)
+ | (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1)
+ | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0);
}
- switch (ppFromXOP3of3(insn->vexXopPrefix[2]))
+ switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2]))
{
default:
break;
@@ -570,8 +600,8 @@ static int readPrefixes(struct InternalInstruction* insn) {
}
dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx",
- insn->vexXopPrefix[0], insn->vexXopPrefix[1],
- insn->vexXopPrefix[2]);
+ insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+ insn->vectorExtensionPrefix[2]);
}
}
else {
@@ -646,13 +676,29 @@ static int readOpcode(struct InternalInstruction* insn) {
insn->opcodeType = ONEBYTE;
- if (insn->vexXopType == TYPE_VEX_3B)
+ if (insn->vectorExtensionType == TYPE_EVEX)
{
- switch (mmmmmFromVEX2of3(insn->vexXopPrefix[1]))
- {
+ switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
+ default:
+ dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)",
+ mmFromEVEX2of4(insn->vectorExtensionPrefix[1]));
+ return -1;
+ case VEX_LOB_0F:
+ insn->opcodeType = TWOBYTE;
+ return consumeByte(insn, &insn->opcode);
+ case VEX_LOB_0F38:
+ insn->opcodeType = THREEBYTE_38;
+ return consumeByte(insn, &insn->opcode);
+ case VEX_LOB_0F3A:
+ insn->opcodeType = THREEBYTE_3A;
+ return consumeByte(insn, &insn->opcode);
+ }
+ }
+ else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
default:
dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
- mmmmmFromVEX2of3(insn->vexXopPrefix[1]));
+ mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
return -1;
case VEX_LOB_0F:
insn->opcodeType = TWOBYTE;
@@ -665,18 +711,15 @@ static int readOpcode(struct InternalInstruction* insn) {
return consumeByte(insn, &insn->opcode);
}
}
- else if (insn->vexXopType == TYPE_VEX_2B)
- {
+ else if (insn->vectorExtensionType == TYPE_VEX_2B) {
insn->opcodeType = TWOBYTE;
return consumeByte(insn, &insn->opcode);
}
- else if (insn->vexXopType == TYPE_XOP)
- {
- switch (mmmmmFromXOP2of3(insn->vexXopPrefix[1]))
- {
+ else if (insn->vectorExtensionType == TYPE_XOP) {
+ switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
default:
dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
- mmmmmFromVEX2of3(insn->vexXopPrefix[1]));
+ mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
return -1;
case XOP_MAP_SELECT_8:
insn->opcodeType = XOP8_MAP;
@@ -713,20 +756,6 @@ static int readOpcode(struct InternalInstruction* insn) {
return -1;
insn->opcodeType = THREEBYTE_3A;
- } else if (current == 0xa6) {
- dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
-
- if (consumeByte(insn, &current))
- return -1;
-
- insn->opcodeType = THREEBYTE_A6;
- } else if (current == 0xa7) {
- dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
-
- if (consumeByte(insn, &current))
- return -1;
-
- insn->opcodeType = THREEBYTE_A7;
} else {
dbgprintf(insn, "Didn't find a three-byte escape prefix");
@@ -760,10 +789,10 @@ static int readModRM(struct InternalInstruction* insn);
*/
static int getIDWithAttrMask(uint16_t* instructionID,
struct InternalInstruction* insn,
- uint8_t attrMask) {
+ uint16_t attrMask) {
BOOL hasModRMExtension;
- uint8_t instructionClass;
+ uint16_t instructionClass;
instructionClass = contextForAttrs(attrMask);
@@ -826,7 +855,7 @@ static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
* nonzero otherwise.
*/
static int getID(struct InternalInstruction* insn, const void *miiArg) {
- uint8_t attrMask;
+ uint16_t attrMask;
uint16_t instructionID;
dbgprintf(insn, "getID()");
@@ -836,11 +865,11 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
if (insn->mode == MODE_64BIT)
attrMask |= ATTR_64BIT;
- if (insn->vexXopType != TYPE_NO_VEX_XOP) {
- attrMask |= ATTR_VEX;
+ if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+ attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? ATTR_EVEX : ATTR_VEX;
- if (insn->vexXopType == TYPE_VEX_3B) {
- switch (ppFromVEX3of3(insn->vexXopPrefix[2])) {
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) {
case VEX_PREFIX_66:
attrMask |= ATTR_OPSIZE;
break;
@@ -852,11 +881,35 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
break;
}
- if (lFromVEX3of3(insn->vexXopPrefix[2]))
+ if (zFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXKZ;
+ if (bFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXB;
+ if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXK;
+ if (lFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXL;
+ if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXL2;
+ }
+ else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromVEX3of3(insn->vectorExtensionPrefix[2]))
attrMask |= ATTR_VEXL;
}
- else if (insn->vexXopType == TYPE_VEX_2B) {
- switch (ppFromVEX2of2(insn->vexXopPrefix[1])) {
+ else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+ switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
case VEX_PREFIX_66:
attrMask |= ATTR_OPSIZE;
break;
@@ -868,11 +921,11 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
break;
}
- if (lFromVEX2of2(insn->vexXopPrefix[1]))
+ if (lFromVEX2of2(insn->vectorExtensionPrefix[1]))
attrMask |= ATTR_VEXL;
}
- else if (insn->vexXopType == TYPE_XOP) {
- switch (ppFromXOP3of3(insn->vexXopPrefix[2])) {
+ else if (insn->vectorExtensionType == TYPE_XOP) {
+ switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
case VEX_PREFIX_66:
attrMask |= ATTR_OPSIZE;
break;
@@ -884,7 +937,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
break;
}
- if (lFromXOP3of3(insn->vexXopPrefix[2]))
+ if (lFromXOP3of3(insn->vectorExtensionPrefix[2]))
attrMask |= ATTR_VEXL;
}
else {
@@ -892,7 +945,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
}
}
else {
- if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
+ if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
attrMask |= ATTR_OPSIZE;
else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation))
attrMask |= ATTR_ADSIZE;
@@ -908,9 +961,29 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
if (getIDWithAttrMask(&instructionID, insn, attrMask))
return -1;
+ /*
+ * JCXZ/JECXZ need special handling for 16-bit mode because the meaning
+ * of the AdSize prefix is inverted w.r.t. 32-bit mode.
+ */
+ if (insn->mode == MODE_16BIT && insn->opcode == 0xE3) {
+ const struct InstructionSpecifier *spec;
+ spec = specifierForUID(instructionID);
+
+ /*
+ * Check for Ii8PCRel instructions. We could alternatively do a
+ * string-compare on the names, but this is probably cheaper.
+ */
+ if (x86OperandSets[spec->operands][0].type == TYPE_REL8) {
+ attrMask ^= ATTR_ADSIZE;
+ if (getIDWithAttrMask(&instructionID, insn, attrMask))
+ return -1;
+ }
+ }
+
/* The following clauses compensate for limitations of the tables. */
- if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) {
+ if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) &&
+ !(attrMask & ATTR_OPSIZE)) {
/*
* The instruction tables make no distinction between instructions that
* allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
@@ -942,7 +1015,8 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
specWithOpSizeName =
x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg);
- if (is16BitEquivalent(specName, specWithOpSizeName)) {
+ if (is16BitEquivalent(specName, specWithOpSizeName) &&
+ (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) {
insn->instructionID = instructionIDWithOpsize;
insn->spec = specifierForUID(instructionIDWithOpsize);
} else {
@@ -1018,7 +1092,6 @@ static int readSIB(struct InternalInstruction* insn) {
case 2:
dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
return -1;
- break;
case 4:
sibIndexBase = SIB_INDEX_EAX;
sibBaseBase = SIB_BASE_EAX;
@@ -1033,6 +1106,8 @@ static int readSIB(struct InternalInstruction* insn) {
return -1;
index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
+ if (insn->vectorExtensionType == TYPE_EVEX)
+ index |= v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4;
switch (index) {
case 0x4:
@@ -1065,6 +1140,7 @@ static int readSIB(struct InternalInstruction* insn) {
switch (base) {
case 0x5:
+ case 0xd:
switch (modFromModRM(insn->modRM)) {
case 0x0:
insn->eaDisplacement = EA_DISP_32;
@@ -1072,13 +1148,11 @@ static int readSIB(struct InternalInstruction* insn) {
break;
case 0x1:
insn->eaDisplacement = EA_DISP_8;
- insn->sibBase = (insn->addressSize == 4 ?
- SIB_BASE_EBP : SIB_BASE_RBP);
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
break;
case 0x2:
insn->eaDisplacement = EA_DISP_32;
- insn->sibBase = (insn->addressSize == 4 ?
- SIB_BASE_EBP : SIB_BASE_RBP);
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
break;
case 0x3:
debug("Cannot have Mod = 0b11 and a SIB byte");
@@ -1183,6 +1257,10 @@ static int readModRM(struct InternalInstruction* insn) {
reg |= rFromREX(insn->rexPrefix) << 3;
rm |= bFromREX(insn->rexPrefix) << 3;
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+ rm |= xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+ }
insn->reg = (Reg)(insn->regBase + reg);
@@ -1205,6 +1283,7 @@ static int readModRM(struct InternalInstruction* insn) {
case 0x1:
insn->eaBase = (EABase)(insn->eaBaseBase + rm);
insn->eaDisplacement = EA_DISP_8;
+ insn->displacementSize = 1;
if (readDisplacement(insn))
return -1;
break;
@@ -1229,12 +1308,12 @@ static int readModRM(struct InternalInstruction* insn) {
case 0x0:
insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
switch (rm) {
+ case 0x14:
case 0x4:
case 0xc: /* in case REXW.b is set */
insn->eaBase = (insn->addressSize == 4 ?
EA_BASE_sib : EA_BASE_sib64);
- readSIB(insn);
- if (readDisplacement(insn))
+ if (readSIB(insn) || readDisplacement(insn))
return -1;
break;
case 0x5:
@@ -1249,14 +1328,16 @@ static int readModRM(struct InternalInstruction* insn) {
}
break;
case 0x1:
+ insn->displacementSize = 1;
+ /* FALLTHROUGH */
case 0x2:
insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
switch (rm) {
+ case 0x14:
case 0x4:
case 0xc: /* in case REXW.b is set */
insn->eaBase = EA_BASE_sib;
- readSIB(insn);
- if (readDisplacement(insn))
+ if (readSIB(insn) || readDisplacement(insn))
return -1;
break;
default:
@@ -1312,6 +1393,10 @@ static int readModRM(struct InternalInstruction* insn) {
case TYPE_XMM32: \
case TYPE_XMM: \
return prefix##_XMM0 + index; \
+ case TYPE_VK1: \
+ case TYPE_VK8: \
+ case TYPE_VK16: \
+ return prefix##_K0 + index; \
case TYPE_MM64: \
case TYPE_MM32: \
case TYPE_MM: \
@@ -1400,44 +1485,11 @@ static int fixupReg(struct InternalInstruction *insn,
}
/*
- * readOpcodeModifier - Reads an operand from the opcode field of an
- * instruction. Handles AddRegFrm instructions.
- *
- * @param insn - The instruction whose opcode field is to be read.
- * @param inModRM - Indicates that the opcode field is to be read from the
- * ModR/M extension; useful for escape opcodes
- * @return - 0 on success; nonzero otherwise.
- */
-static int readOpcodeModifier(struct InternalInstruction* insn) {
- dbgprintf(insn, "readOpcodeModifier()");
-
- if (insn->consumedOpcodeModifier)
- return 0;
-
- insn->consumedOpcodeModifier = TRUE;
-
- switch (insn->spec->modifierType) {
- default:
- debug("Unknown modifier type.");
- return -1;
- case MODIFIER_NONE:
- debug("No modifier but an operand expects one.");
- return -1;
- case MODIFIER_OPCODE:
- insn->opcodeModifier = insn->opcode - insn->spec->modifierBase;
- return 0;
- case MODIFIER_MODRM:
- insn->opcodeModifier = insn->modRM - insn->spec->modifierBase;
- return 0;
- }
-}
-
-/*
* readOpcodeRegister - Reads an operand from the opcode field of an
* instruction and interprets it appropriately given the operand width.
* Handles AddRegFrm instructions.
*
- * @param insn - See readOpcodeModifier().
+ * @param insn - The instruction whose opcode field is to be read.
* @param size - The width (in bytes) of the register being specified.
* 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
* RAX.
@@ -1446,16 +1498,13 @@ static int readOpcodeModifier(struct InternalInstruction* insn) {
static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
dbgprintf(insn, "readOpcodeRegister()");
- if (readOpcodeModifier(insn))
- return -1;
-
if (size == 0)
size = insn->registerSize;
switch (size) {
case 1:
insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
- | insn->opcodeModifier));
+ | (insn->opcode & 7)));
if (insn->rexPrefix &&
insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
insn->opcodeRegister < MODRM_REG_AL + 0x8) {
@@ -1467,17 +1516,17 @@ static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
case 2:
insn->opcodeRegister = (Reg)(MODRM_REG_AX
+ ((bFromREX(insn->rexPrefix) << 3)
- | insn->opcodeModifier));
+ | (insn->opcode & 7)));
break;
case 4:
insn->opcodeRegister = (Reg)(MODRM_REG_EAX
+ ((bFromREX(insn->rexPrefix) << 3)
- | insn->opcodeModifier));
+ | (insn->opcode & 7)));
break;
case 8:
insn->opcodeRegister = (Reg)(MODRM_REG_RAX
+ ((bFromREX(insn->rexPrefix) << 3)
- | insn->opcodeModifier));
+ | (insn->opcode & 7)));
break;
}
@@ -1550,12 +1599,14 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
static int readVVVV(struct InternalInstruction* insn) {
dbgprintf(insn, "readVVVV()");
- if (insn->vexXopType == TYPE_VEX_3B)
- insn->vvvv = vvvvFromVEX3of3(insn->vexXopPrefix[2]);
- else if (insn->vexXopType == TYPE_VEX_2B)
- insn->vvvv = vvvvFromVEX2of2(insn->vexXopPrefix[1]);
- else if (insn->vexXopType == TYPE_XOP)
- insn->vvvv = vvvvFromXOP3of3(insn->vexXopPrefix[2]);
+ if (insn->vectorExtensionType == TYPE_EVEX)
+ insn->vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]);
+ else if (insn->vectorExtensionType == TYPE_VEX_3B)
+ insn->vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
+ else if (insn->vectorExtensionType == TYPE_VEX_2B)
+ insn->vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
+ else if (insn->vectorExtensionType == TYPE_XOP)
+ insn->vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
else
return -1;
@@ -1566,6 +1617,23 @@ static int readVVVV(struct InternalInstruction* insn) {
}
/*
+ * readMaskRegister - Reads a mask register from the opcode field of an
+ * instruction.
+ *
+ * @param insn - The instruction whose opcode field is to be read.
+ * @return - 0 on success; nonzero otherwise.
+ */
+static int readMaskRegister(struct InternalInstruction* insn) {
+ dbgprintf(insn, "readMaskRegister()");
+
+ if (insn->vectorExtensionType != TYPE_EVEX)
+ return -1;
+
+ insn->writemask = aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]);
+ return 0;
+}
+
+/*
* readOperands - Consults the specifier for an instruction and consumes all
* operands for that instruction, interpreting them as it goes.
*
@@ -1587,6 +1655,8 @@ static int readOperands(struct InternalInstruction* insn) {
for (index = 0; index < X86_MAX_OPERANDS; ++index) {
switch (x86OperandSets[insn->spec->operands][index].encoding) {
case ENCODING_NONE:
+ case ENCODING_SI:
+ case ENCODING_DI:
break;
case ENCODING_REG:
case ENCODING_RM:
@@ -1664,9 +1734,7 @@ static int readOperands(struct InternalInstruction* insn) {
if (readOpcodeRegister(insn, 0))
return -1;
break;
- case ENCODING_I:
- if (readOpcodeModifier(insn))
- return -1;
+ case ENCODING_FP:
break;
case ENCODING_VVVV:
needVVVV = 0; /* Mark that we have found a VVVV operand. */
@@ -1675,6 +1743,10 @@ static int readOperands(struct InternalInstruction* insn) {
if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
return -1;
break;
+ case ENCODING_WRITEMASK:
+ if (readMaskRegister(insn))
+ return -1;
+ break;
case ENCODING_DUP:
break;
default:
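For reference, the EVEX-detection test that the readPrefixes() hunk above introduces can be exercised in isolation. The following standalone C sketch mirrors the bit tests from the patch; the helper name, harness and byte values are illustrative only and are not part of the change.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the new check in readPrefixes(): a 0x62 byte starts an EVEX prefix
 * when (a) we are in 64-bit mode or byte1 has its top two (ModRM.mod) bits
 * set, (b) bits 3:2 of byte1 are clear, and (c) bit 2 of byte2 is set.       */
static int isEVEXPrefix(int is64BitMode, uint8_t byte1, uint8_t byte2) {
  return (is64BitMode || (byte1 & 0xc0) == 0xc0) &&
         ((~byte1 & 0x0c) == 0x0c) && ((byte2 & 0x04) == 0x04);
}

int main(void) {
  /* 62 f1 7c 48 ... is a plausible EVEX-encoded 512-bit vaddps. */
  printf("%d\n", isEVEXPrefix(1, 0xf1, 0x7c)); /* prints 1 */
  printf("%d\n", isEVEXPrefix(0, 0x41, 0x00)); /* prints 0: plain BOUND */
  return 0;
}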
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 6d03d5c..ac3b39d 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -45,6 +45,21 @@ extern "C" {
#define xFromREX(rex) (((rex) & 0x2) >> 1)
#define bFromREX(rex) ((rex) & 0x1)
+#define rFromEVEX2of4(evex) (((~(evex)) & 0x80) >> 7)
+#define xFromEVEX2of4(evex) (((~(evex)) & 0x40) >> 6)
+#define bFromEVEX2of4(evex) (((~(evex)) & 0x20) >> 5)
+#define r2FromEVEX2of4(evex) (((~(evex)) & 0x10) >> 4)
+#define mmFromEVEX2of4(evex) ((evex) & 0x3)
+#define wFromEVEX3of4(evex) (((evex) & 0x80) >> 7)
+#define vvvvFromEVEX3of4(evex) (((~(evex)) & 0x78) >> 3)
+#define ppFromEVEX3of4(evex) ((evex) & 0x3)
+#define zFromEVEX4of4(evex) (((evex) & 0x80) >> 7)
+#define l2FromEVEX4of4(evex) (((evex) & 0x40) >> 6)
+#define lFromEVEX4of4(evex) (((evex) & 0x20) >> 5)
+#define bFromEVEX4of4(evex) (((evex) & 0x10) >> 4)
+#define v2FromEVEX4of4(evex) (((~evex) & 0x8) >> 3)
+#define aaaFromEVEX4of4(evex) ((evex) & 0x7)
+
#define rFromVEX2of3(vex) (((~(vex)) & 0x80) >> 7)
#define xFromVEX2of3(vex) (((~(vex)) & 0x40) >> 6)
#define bFromVEX2of3(vex) (((~(vex)) & 0x20) >> 5)
@@ -314,6 +329,16 @@ extern "C" {
ENTRY(ZMM30) \
ENTRY(ZMM31)
+#define REGS_MASKS \
+ ENTRY(K0) \
+ ENTRY(K1) \
+ ENTRY(K2) \
+ ENTRY(K3) \
+ ENTRY(K4) \
+ ENTRY(K5) \
+ ENTRY(K6) \
+ ENTRY(K7)
+
#define REGS_SEGMENT \
ENTRY(ES) \
ENTRY(CS) \
@@ -361,6 +386,7 @@ extern "C" {
REGS_XMM \
REGS_YMM \
REGS_ZMM \
+ REGS_MASKS \
REGS_SEGMENT \
REGS_DEBUG \
REGS_CONTROL \
@@ -463,7 +489,7 @@ typedef enum {
} XOPMapSelect;
/*
- * VEXPrefixCode - Possible values for the VEX.pp field
+ * VEXPrefixCode - Possible values for the VEX.pp/EVEX.pp field
*/
typedef enum {
@@ -474,11 +500,12 @@ typedef enum {
} VEXPrefixCode;
typedef enum {
- TYPE_NO_VEX_XOP = 0x0,
- TYPE_VEX_2B = 0x1,
- TYPE_VEX_3B = 0x2,
- TYPE_XOP = 0x3
-} VEXXOPType;
+ TYPE_NO_VEX_XOP = 0x0,
+ TYPE_VEX_2B = 0x1,
+ TYPE_VEX_3B = 0x2,
+ TYPE_EVEX = 0x3,
+ TYPE_XOP = 0x4
+} VectorExtensionType;
typedef uint8_t BOOL;
@@ -536,10 +563,10 @@ struct InternalInstruction {
uint8_t prefixPresent[0x100];
/* contains the location (for use with the reader) of the prefix byte */
uint64_t prefixLocations[0x100];
- /* The value of the VEX/XOP prefix, if present */
- uint8_t vexXopPrefix[3];
- /* The length of the VEX prefix (0 if not present) */
- VEXXOPType vexXopType;
+ /* The value of the vector extension prefix (EVEX/VEX/XOP), if present */
+ uint8_t vectorExtensionPrefix[4];
+ /* The type of the vector extension prefix */
+ VectorExtensionType vectorExtensionType;
/* The value of the REX prefix, if present */
uint8_t rexPrefix;
/* The location where a mandatory prefix would have to be (i.e., right before
@@ -585,6 +612,9 @@ struct InternalInstruction {
instructions */
Reg vvvv;
+ /* The writemask for AVX-512 instructions which is contained in EVEX.aaa */
+ Reg writemask;
+
/* The ModR/M byte, which contains most register operands and some portion of
all memory operands */
BOOL consumedModRM;
@@ -604,8 +634,6 @@ struct InternalInstruction {
uint64_t immediates[2];
/* A register or immediate operand encoded into the opcode */
- BOOL consumedOpcodeModifier;
- uint8_t opcodeModifier;
Reg opcodeRegister;
/* Portions of the ModR/M byte */
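A minimal sketch of how the new EVEX field accessors combine into the simulated REX byte and the writemask read by readMaskRegister(). The macro bodies are repeated from the header above so the snippet is self-contained; the prefix bytes are made-up example values.

#include <stdint.h>
#include <stdio.h>

#define rFromEVEX2of4(evex)   (((~(evex)) & 0x80) >> 7)
#define xFromEVEX2of4(evex)   (((~(evex)) & 0x40) >> 6)
#define bFromEVEX2of4(evex)   (((~(evex)) & 0x20) >> 5)
#define wFromEVEX3of4(evex)   (((evex) & 0x80) >> 7)
#define aaaFromEVEX4of4(evex) ((evex) & 0x7)

int main(void) {
  uint8_t p[4] = {0x62, 0xf1, 0xfd, 0x49};   /* hypothetical EVEX prefix bytes */
  /* Same composition as the patch's simulated REX in readPrefixes(). */
  uint8_t rex = 0x40 | (wFromEVEX3of4(p[2]) << 3) | (rFromEVEX2of4(p[1]) << 2)
                     | (xFromEVEX2of4(p[1]) << 1) |  bFromEVEX2of4(p[1]);
  printf("rex=0x%02x writemask=k%d\n", rex, aaaFromEVEX4of4(p[3]));
  return 0;   /* prints: rex=0x48 writemask=k1 */
}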
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index dd1719c..523ae99 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -30,8 +30,6 @@
#define TWOBYTE_SYM x86DisassemblerTwoByteOpcodes
#define THREEBYTE38_SYM x86DisassemblerThreeByte38Opcodes
#define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes
-#define THREEBYTEA6_SYM x86DisassemblerThreeByteA6Opcodes
-#define THREEBYTEA7_SYM x86DisassemblerThreeByteA7Opcodes
#define XOP8_MAP_SYM x86DisassemblerXOP8Opcodes
#define XOP9_MAP_SYM x86DisassemblerXOP9Opcodes
#define XOPA_MAP_SYM x86DisassemblerXOPAOpcodes
@@ -42,8 +40,6 @@
#define TWOBYTE_STR "x86DisassemblerTwoByteOpcodes"
#define THREEBYTE38_STR "x86DisassemblerThreeByte38Opcodes"
#define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes"
-#define THREEBYTEA6_STR "x86DisassemblerThreeByteA6Opcodes"
-#define THREEBYTEA7_STR "x86DisassemblerThreeByteA7Opcodes"
#define XOP8_MAP_STR "x86DisassemblerXOP8Opcodes"
#define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes"
#define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes"
@@ -53,16 +49,22 @@
* processed correctly. Most of these indicate the presence of particular
* prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
*/
-#define ATTRIBUTE_BITS \
- ENUM_ENTRY(ATTR_NONE, 0x00) \
- ENUM_ENTRY(ATTR_64BIT, 0x01) \
- ENUM_ENTRY(ATTR_XS, 0x02) \
- ENUM_ENTRY(ATTR_XD, 0x04) \
- ENUM_ENTRY(ATTR_REXW, 0x08) \
- ENUM_ENTRY(ATTR_OPSIZE, 0x10) \
- ENUM_ENTRY(ATTR_ADSIZE, 0x20) \
- ENUM_ENTRY(ATTR_VEX, 0x40) \
- ENUM_ENTRY(ATTR_VEXL, 0x80)
+#define ATTRIBUTE_BITS \
+ ENUM_ENTRY(ATTR_NONE, 0x00) \
+ ENUM_ENTRY(ATTR_64BIT, (0x1 << 0)) \
+ ENUM_ENTRY(ATTR_XS, (0x1 << 1)) \
+ ENUM_ENTRY(ATTR_XD, (0x1 << 2)) \
+ ENUM_ENTRY(ATTR_REXW, (0x1 << 3)) \
+ ENUM_ENTRY(ATTR_OPSIZE, (0x1 << 4)) \
+ ENUM_ENTRY(ATTR_ADSIZE, (0x1 << 5)) \
+ ENUM_ENTRY(ATTR_VEX, (0x1 << 6)) \
+ ENUM_ENTRY(ATTR_VEXL, (0x1 << 7)) \
+ ENUM_ENTRY(ATTR_EVEX, (0x1 << 8)) \
+ ENUM_ENTRY(ATTR_EVEXL, (0x1 << 9)) \
+ ENUM_ENTRY(ATTR_EVEXL2, (0x1 << 10)) \
+ ENUM_ENTRY(ATTR_EVEXK, (0x1 << 11)) \
+ ENUM_ENTRY(ATTR_EVEXKZ, (0x1 << 12)) \
+ ENUM_ENTRY(ATTR_EVEXB, (0x1 << 13))
#define ENUM_ENTRY(n, v) n = v,
enum attributeBits {
@@ -73,7 +75,7 @@ enum attributeBits {
/*
* Combinations of the above attributes that are relevant to instruction
- * decode. Although other combinations are possible, they can be reduced to
+ * decode. Although other combinations are possible, they can be reduced to
* these without affecting the ultimately decoded instruction.
*/
@@ -198,38 +200,38 @@ enum attributeBits {
ENUM_ENTRY(IC_EVEX_L2_W_XS_B, 4, "requires EVEX_B, L2, W and XS prefix") \
ENUM_ENTRY(IC_EVEX_L2_W_XD_B, 4, "requires EVEX_B, L2, W and XD prefix") \
ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4, "requires EVEX_B, L2, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_K_B, 1, "requires EVEX_B and EVEX_K prefix") \
- ENUM_ENTRY(IC_EVEX_XS_K_B, 2, "requires EVEX_B, EVEX_K and the XS prefix") \
- ENUM_ENTRY(IC_EVEX_XD_K_B, 2, "requires EVEX_B, EVEX_K and the XD prefix") \
- ENUM_ENTRY(IC_EVEX_OPSIZE_K_B, 2, "requires EVEX_B, EVEX_K and the OpSize prefix") \
- ENUM_ENTRY(IC_EVEX_W_K_B, 3, "requires EVEX_B, EVEX_K and the W prefix") \
- ENUM_ENTRY(IC_EVEX_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, W, and XS prefix") \
- ENUM_ENTRY(IC_EVEX_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, W, and XD prefix") \
- ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, W, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_K_B, 3, "requires EVEX_B, EVEX_K and the L prefix") \
- ENUM_ENTRY(IC_EVEX_L_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L_W_K_B, 3, "requires EVEX_B, EVEX_K, L and W") \
- ENUM_ENTRY(IC_EVEX_L_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_K_B, 3, "requires EVEX_B, EVEX_K and the L2 prefix") \
- ENUM_ENTRY(IC_EVEX_L2_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XS prefix")\
- ENUM_ENTRY(IC_EVEX_L2_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XD prefix")\
- ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, and OpSize") \
- ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \
- ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and OpSize") \
- ENUM_ENTRY(IC_EVEX_KZ_B, 1, "requires EVEX_B and EVEX_KZ prefix") \
- ENUM_ENTRY(IC_EVEX_XS_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XS prefix") \
- ENUM_ENTRY(IC_EVEX_XD_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XD prefix") \
- ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the OpSize prefix") \
- ENUM_ENTRY(IC_EVEX_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the W prefix") \
- ENUM_ENTRY(IC_EVEX_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XS prefix") \
- ENUM_ENTRY(IC_EVEX_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XD prefix") \
- ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_K_B, 1, "requires EVEX_B and EVEX_K prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_K_B, 2, "requires EVEX_B, EVEX_K and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_K_B, 2, "requires EVEX_B, EVEX_K and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_K_B, 2, "requires EVEX_B, EVEX_K and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_K_B, 3, "requires EVEX_B, EVEX_K and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_K_B, 3, "requires EVEX_B, EVEX_K and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_K_B, 3, "requires EVEX_B, EVEX_K, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B,4, "requires EVEX_B, EVEX_K, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_K_B, 3, "requires EVEX_B, EVEX_K and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B,4, "requires EVEX_B, EVEX_K, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_KZ_B, 1, "requires EVEX_B and EVEX_KZ prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and OpSize") \
ENUM_ENTRY(IC_EVEX_L_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L prefix") \
ENUM_ENTRY(IC_EVEX_L_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XS prefix")\
ENUM_ENTRY(IC_EVEX_L_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XD prefix")\
@@ -287,11 +289,9 @@ typedef enum {
TWOBYTE = 1,
THREEBYTE_38 = 2,
THREEBYTE_3A = 3,
- THREEBYTE_A6 = 4,
- THREEBYTE_A7 = 5,
- XOP8_MAP = 6,
- XOP9_MAP = 7,
- XOPA_MAP = 8
+ XOP8_MAP = 4,
+ XOP9_MAP = 5,
+ XOPA_MAP = 6
} OpcodeType;
/*
@@ -395,15 +395,17 @@ struct ContextDecision {
ENUM_ENTRY(ENCODING_RW, "(AX..DI, R8W..R15W)") \
ENUM_ENTRY(ENCODING_RD, "(EAX..EDI, R8D..R15D)") \
ENUM_ENTRY(ENCODING_RO, "(RAX..RDI, R8..R15)") \
- ENUM_ENTRY(ENCODING_I, "Position on floating-point stack added to the " \
- "opcode byte") \
+ ENUM_ENTRY(ENCODING_FP, "Position on floating-point stack in ModR/M " \
+ "byte.") \
\
ENUM_ENTRY(ENCODING_Iv, "Immediate of operand size") \
ENUM_ENTRY(ENCODING_Ia, "Immediate of address size") \
ENUM_ENTRY(ENCODING_Rv, "Register code of operand size added to the " \
"opcode byte") \
ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \
- "in type")
+ "in type") \
+ ENUM_ENTRY(ENCODING_SI, "Source index; encoded in OpSize/Adsize prefix") \
+ ENUM_ENTRY(ENCODING_DI, "Destination index; encoded in prefixes")
#define ENUM_ENTRY(n, d) n,
typedef enum {
@@ -454,6 +456,14 @@ struct ContextDecision {
ENUM_ENTRY(TYPE_M16_16, "2+2-byte (BOUND)") \
ENUM_ENTRY(TYPE_M32_32, "4+4-byte (BOUND)") \
ENUM_ENTRY(TYPE_M16_64, "2+8-byte (LIDT, LGDT)") \
+ ENUM_ENTRY(TYPE_SRCIDX8, "1-byte memory at source index") \
+ ENUM_ENTRY(TYPE_SRCIDX16, "2-byte memory at source index") \
+ ENUM_ENTRY(TYPE_SRCIDX32, "4-byte memory at source index") \
+ ENUM_ENTRY(TYPE_SRCIDX64, "8-byte memory at source index") \
+ ENUM_ENTRY(TYPE_DSTIDX8, "1-byte memory at destination index") \
+ ENUM_ENTRY(TYPE_DSTIDX16, "2-byte memory at destination index") \
+ ENUM_ENTRY(TYPE_DSTIDX32, "4-byte memory at destination index") \
+ ENUM_ENTRY(TYPE_DSTIDX64, "8-byte memory at destination index") \
ENUM_ENTRY(TYPE_MOFFS8, "1-byte memory offset (relative to segment " \
"base)") \
ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \
@@ -478,6 +488,7 @@ struct ContextDecision {
ENUM_ENTRY(TYPE_XMM128, "16-byte") \
ENUM_ENTRY(TYPE_XMM256, "32-byte") \
ENUM_ENTRY(TYPE_XMM512, "64-byte") \
+ ENUM_ENTRY(TYPE_VK1, "1-bit") \
ENUM_ENTRY(TYPE_VK8, "8-bit") \
ENUM_ENTRY(TYPE_VK16, "16-bit") \
ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \
@@ -518,9 +529,7 @@ struct OperandSpecifier {
*/
#define MODIFIER_TYPES \
- ENUM_ENTRY(MODIFIER_NONE) \
- ENUM_ENTRY(MODIFIER_OPCODE) \
- ENUM_ENTRY(MODIFIER_MODRM)
+ ENUM_ENTRY(MODIFIER_NONE)
#define ENUM_ENTRY(n) n,
typedef enum {
@@ -536,9 +545,6 @@ typedef enum {
* its operands.
*/
struct InstructionSpecifier {
- uint8_t modifierType;
- uint8_t modifierBase;
-
/* The macro below must be defined wherever this file is included. */
INSTRUCTION_SPECIFIER_FIELDS
};
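The attribute-bit table above now spills past bit 7 (ATTR_EVEX through ATTR_EVEXB occupy bits 8 to 13), which is why contextForAttrs(), getIDWithAttrMask() and getID() switched their masks from uint8_t to uint16_t earlier in this patch. A toy sketch of the overflow, using the bit values defined above (the combination chosen is just an example):

#include <stdint.h>
#include <stdio.h>

enum {
  ATTR_OPSIZE = 0x1 << 4,
  ATTR_EVEX   = 0x1 << 8,
  ATTR_EVEXK  = 0x1 << 11,
  ATTR_EVEXKZ = 0x1 << 12
};

int main(void) {
  /* e.g. an EVEX instruction with the 0x66 pp field, a mask and zeroing */
  uint16_t attrMask = ATTR_EVEX | ATTR_OPSIZE | ATTR_EVEXK | ATTR_EVEXKZ;
  printf("mask = 0x%04x (does not fit in 8 bits: %d)\n",
         attrMask, attrMask > 0xff);   /* mask = 0x1910 ... : 1 */
  return 0;
}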
diff --git a/lib/Target/X86/InstPrinter/Android.mk b/lib/Target/X86/InstPrinter/Android.mk
index 633df62..9881beb 100644
--- a/lib/Target/X86/InstPrinter/Android.mk
+++ b/lib/Target/X86/InstPrinter/Android.mk
@@ -14,6 +14,7 @@ x86_instprinter_SRC_FILES := \
# For the device
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -32,6 +33,7 @@ LOCAL_MODULE_TAGS := optional
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
# For the host
# =====================================================
diff --git a/lib/Target/X86/InstPrinter/CMakeLists.txt b/lib/Target/X86/InstPrinter/CMakeLists.txt
index 28e2460..686a37e 100644
--- a/lib/Target/X86/InstPrinter/CMakeLists.txt
+++ b/lib/Target/X86/InstPrinter/CMakeLists.txt
@@ -1,9 +1,5 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMX86AsmPrinter
X86ATTInstPrinter.cpp
X86IntelInstPrinter.cpp
X86InstComments.cpp
)
-
-add_dependencies(LLVMX86AsmPrinter X86CommonTableGen)
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 4439311..eea0a76 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -123,6 +123,16 @@ void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
}
}
+void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
+ switch (Imm) {
+ case 0: O << "{rn-sae}"; break;
+ case 1: O << "{rd-sae}"; break;
+ case 2: O << "{ru-sae}"; break;
+ case 3: O << "{rz-sae}"; break;
+ }
+}
/// printPCRelImm - This is used to print an immediate value that ends up
/// being encoded as a pc-relative value (e.g. for jumps and calls). These
/// print slightly differently than normal immediates. For example, a $ is not
@@ -172,16 +182,16 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
raw_ostream &O) {
- const MCOperand &BaseReg = MI->getOperand(Op);
- const MCOperand &IndexReg = MI->getOperand(Op+2);
- const MCOperand &DispSpec = MI->getOperand(Op+3);
- const MCOperand &SegReg = MI->getOperand(Op+4);
+ const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+ const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+ const MCOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg);
O << markup("<mem:");
// If this has a segment register, print it.
if (SegReg.getReg()) {
- printOperand(MI, Op+4, O);
+ printOperand(MI, Op+X86::AddrSegmentReg, O);
O << ':';
}
@@ -197,12 +207,12 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
if (IndexReg.getReg() || BaseReg.getReg()) {
O << '(';
if (BaseReg.getReg())
- printOperand(MI, Op, O);
+ printOperand(MI, Op+X86::AddrBaseReg, O);
if (IndexReg.getReg()) {
O << ',';
- printOperand(MI, Op+2, O);
- unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ printOperand(MI, Op+X86::AddrIndexReg, O);
+ unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
if (ScaleVal != 1) {
O << ','
<< markup("<imm:")
@@ -216,12 +226,49 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
O << markup(">");
}
+void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &SegReg = MI->getOperand(Op+1);
+
+ O << markup("<mem:");
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op+1, O);
+ O << ':';
+ }
+
+ O << "(";
+ printOperand(MI, Op, O);
+ O << ")";
+
+ O << markup(">");
+}
+
+void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ O << markup("<mem:");
+
+ O << "%es:(";
+ printOperand(MI, Op, O);
+ O << ")";
+
+ O << markup(">");
+}
+
void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
raw_ostream &O) {
const MCOperand &DispSpec = MI->getOperand(Op);
+ const MCOperand &SegReg = MI->getOperand(Op+1);
O << markup("<mem:");
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op+1, O);
+ O << ':';
+ }
+
if (DispSpec.isImm()) {
O << formatImm(DispSpec.getImm());
} else {
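The printRoundingControl() helpers added to both printers map the low two bits of the rounding operand onto the AVX-512 static-rounding annotations. An equivalent table-driven sketch; the function name and harness are illustrative, not LLVM API:

#include <stdio.h>

static const char *roundingSuffix(long long imm) {
  static const char *const names[4] =
      {"{rn-sae}", "{rd-sae}", "{ru-sae}", "{rz-sae}"};
  return names[imm & 0x3];   /* same masking as printRoundingControl() */
}

int main(void) {
  printf("%s %s\n", roundingSuffix(0), roundingSuffix(3)); /* {rn-sae} {rz-sae} */
  return 0;
}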
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index a8fab72..f34e633 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -19,15 +19,15 @@
namespace llvm {
class MCOperand;
-
-class X86ATTInstPrinter : public MCInstPrinter {
+
+class X86ATTInstPrinter final : public MCInstPrinter {
public:
X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
: MCInstPrinter(MAI, MII, MRI) {}
- virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
- virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override;
// Autogenerated by tblgen, returns true if we successfully printed an
// alias.
@@ -42,7 +42,10 @@ public:
void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
@@ -88,6 +91,30 @@ public:
printMemReference(MI, OpNo, O);
}
+ void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printDstIdx(MI, OpNo, O);
+ }
void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemOffset(MI, OpNo, O);
}
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 0f6eeb1..db61fb0 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -16,7 +16,9 @@
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -38,7 +40,8 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DestName = getRegName(MI->getOperand(0).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
Src2Name = getRegName(MI->getOperand(2).getReg());
- DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask);
+ if(MI->getOperand(3).isImm())
+ DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask);
break;
case X86::MOVLHPSrr:
@@ -65,9 +68,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPALIGNR128rm:
Src2Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePALIGNRMask(MVT::v16i8,
- MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePALIGNRMask(MVT::v16i8,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
break;
case X86::VPALIGNR256rr:
Src1Name = getRegName(MI->getOperand(2).getReg());
@@ -75,9 +79,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPALIGNR256rm:
Src2Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePALIGNRMask(MVT::v32i8,
- MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePALIGNRMask(MVT::v32i8,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
break;
case X86::PSHUFDri:
@@ -87,16 +92,20 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSHUFDmi:
case X86::VPSHUFDmi:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSHUFMask(MVT::v4i32, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFMask(MVT::v4i32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
break;
case X86::VPSHUFDYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::VPSHUFDYmi:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSHUFMask(MVT::v8i32, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFMask(MVT::v8i32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
break;
@@ -107,18 +116,20 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSHUFHWmi:
case X86::VPSHUFHWmi:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSHUFHWMask(MVT::v8i16,
- MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFHWMask(MVT::v8i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
break;
case X86::VPSHUFHWYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::VPSHUFHWYmi:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSHUFHWMask(MVT::v16i16,
- MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFHWMask(MVT::v16i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
break;
case X86::PSHUFLWri:
case X86::VPSHUFLWri:
@@ -127,18 +138,20 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSHUFLWmi:
case X86::VPSHUFLWmi:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSHUFLWMask(MVT::v8i16,
- MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFLWMask(MVT::v8i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
break;
case X86::VPSHUFLWYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::VPSHUFLWYmi:
DestName = getRegName(MI->getOperand(0).getReg());
- DecodePSHUFLWMask(MVT::v16i16,
- MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFLWMask(MVT::v16i16,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
break;
case X86::PUNPCKHBWrr:
@@ -293,8 +306,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
// FALL THROUGH.
case X86::SHUFPDrmi:
case X86::VSHUFPDrmi:
- DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeSHUFPMask(MVT::v2f64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -302,8 +317,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
case X86::VSHUFPDYrmi:
- DecodeSHUFPMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeSHUFPMask(MVT::v4f64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -314,8 +331,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
// FALL THROUGH.
case X86::SHUFPSrmi:
case X86::VSHUFPSrmi:
- DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeSHUFPMask(MVT::v4f32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -323,8 +342,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
case X86::VSHUFPSYrmi:
- DecodeSHUFPMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeSHUFPMask(MVT::v8f32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -405,32 +426,40 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::VPERMILPSmi:
- DecodePSHUFMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFMask(MVT::v4f32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::VPERMILPSYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::VPERMILPSYmi:
- DecodePSHUFMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFMask(MVT::v8f32,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::VPERMILPDri:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::VPERMILPDmi:
- DecodePSHUFMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFMask(MVT::v2f64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::VPERMILPDYri:
Src1Name = getRegName(MI->getOperand(1).getReg());
// FALL THROUGH.
case X86::VPERMILPDYmi:
- DecodePSHUFMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodePSHUFMask(MVT::v4f64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::VPERM2F128rr:
@@ -440,9 +469,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPERM2F128rm:
case X86::VPERM2I128rm:
// For instruction comments purpose, assume the 256-bit vector is v4i64.
- DecodeVPERM2X128Mask(MVT::v4i64,
- MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeVPERM2X128Mask(MVT::v4i64,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -452,8 +482,9 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
// FALL THROUGH.
case X86::VPERMQYmi:
case X86::VPERMPDYmi:
- DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
+ if(MI->getOperand(MI->getNumOperands()-1).isImm())
+ DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
}
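The recurring change in X86InstComments.cpp is a guard: a shuffle-mask comment is only decoded when the control operand is a literal immediate, since a symbolic or relocated operand has no value to decode at this point. A sketch of that pattern with stand-in types (Operand, decodeMask and emitShuffleComment are placeholders, not LLVM API):

#include <stdio.h>

struct Operand { int isImm; long long imm; };

/* Stand-in for DecodeSHUFPMask / DecodePSHUFMask and friends. */
static void decodeMask(long long imm) { printf("decoded mask imm 0x%llx\n", imm); }

static void emitShuffleComment(const struct Operand *lastOp) {
  if (lastOp->isImm)            /* new guard: skip non-immediate operands */
    decodeMask(lastOp->imm);
}

int main(void) {
  struct Operand literal = {1, 0x1b}, symbolic = {0, 0};
  emitShuffleComment(&literal);   /* prints */
  emitShuffleComment(&symbolic);  /* quietly skipped, as in the patch */
  return 0;
}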
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index e7e7b15..1c95d37 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -113,6 +113,17 @@ void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
}
}
+void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
+ switch (Imm) {
+ case 0: O << "{rn-sae}"; break;
+ case 1: O << "{rd-sae}"; break;
+ case 2: O << "{ru-sae}"; break;
+ case 3: O << "{rz-sae}"; break;
+ }
+}
+
/// printPCRelImm - This is used to print an immediate value that ends up
/// being encoded as a pc-relative value.
void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
@@ -151,15 +162,15 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
raw_ostream &O) {
- const MCOperand &BaseReg = MI->getOperand(Op);
- unsigned ScaleVal = MI->getOperand(Op+1).getImm();
- const MCOperand &IndexReg = MI->getOperand(Op+2);
- const MCOperand &DispSpec = MI->getOperand(Op+3);
- const MCOperand &SegReg = MI->getOperand(Op+4);
+ const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
+ unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+ const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+ const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+ const MCOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg);
// If this has a segment register, print it.
if (SegReg.getReg()) {
- printOperand(MI, Op+4, O);
+ printOperand(MI, Op+X86::AddrSegmentReg, O);
O << ':';
}
@@ -167,7 +178,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
bool NeedPlus = false;
if (BaseReg.getReg()) {
- printOperand(MI, Op, O);
+ printOperand(MI, Op+X86::AddrBaseReg, O);
NeedPlus = true;
}
@@ -175,7 +186,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
if (NeedPlus) O << " + ";
if (ScaleVal != 1)
O << ScaleVal << '*';
- printOperand(MI, Op+2, O);
+ printOperand(MI, Op+X86::AddrIndexReg, O);
NeedPlus = true;
}
@@ -201,9 +212,38 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
O << ']';
}
+void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &SegReg = MI->getOperand(Op+1);
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op+1, O);
+ O << ':';
+ }
+ O << '[';
+ printOperand(MI, Op, O);
+ O << ']';
+}
+
+void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ // DI accesses are always ES-based.
+ O << "es:[";
+ printOperand(MI, Op, O);
+ O << ']';
+}
+
void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
raw_ostream &O) {
const MCOperand &DispSpec = MI->getOperand(Op);
+ const MCOperand &SegReg = MI->getOperand(Op+1);
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op+1, O);
+ O << ':';
+ }
O << '[';
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index 590bf68..4d9b481 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -20,16 +20,16 @@
namespace llvm {
class MCOperand;
-
-class X86IntelInstPrinter : public MCInstPrinter {
+
+class X86IntelInstPrinter final : public MCInstPrinter {
public:
X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
: MCInstPrinter(MAI, MII, MRI) {}
- virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
- virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
-
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override;
+
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
@@ -40,6 +40,9 @@ public:
void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "opaque ptr ";
@@ -99,6 +102,39 @@ public:
printMemReference(MI, OpNo, O);
}
+
+ void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "byte ptr ";
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "word ptr ";
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "dword ptr ";
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "qword ptr ";
+ printSrcIdx(MI, OpNo, O);
+ }
+ void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "byte ptr ";
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "word ptr ";
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "dword ptr ";
+ printDstIdx(MI, OpNo, O);
+ }
+ void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "qword ptr ";
+ printDstIdx(MI, OpNo, O);
+ }
void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "byte ptr ";
printMemOffset(MI, OpNo, O);
diff --git a/lib/Target/X86/MCTargetDesc/Android.mk b/lib/Target/X86/MCTargetDesc/Android.mk
index 65cd8df..ee37c27 100644
--- a/lib/Target/X86/MCTargetDesc/Android.mk
+++ b/lib/Target/X86/MCTargetDesc/Android.mk
@@ -36,6 +36,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device only
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -52,3 +53,4 @@ include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(LLVM_GEN_INTRINSICS_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
index 2eb5f25..3f5a0e2 100644
--- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
@@ -9,8 +9,3 @@ add_llvm_library(LLVMX86Desc
X86MachORelocationInfo.cpp
X86ELFRelocationInfo.cpp
)
-
-add_dependencies(LLVMX86Desc X86CommonTableGen)
-
-# Hack: we need to include 'main' target directory to grab private headers
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index f8e359b..23763f7 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -79,11 +79,11 @@ public:
CPU != "c3" && CPU != "c3-2";
}
- unsigned getNumFixupKinds() const {
+ unsigned getNumFixupKinds() const override {
return X86::NumTargetFixupKinds;
}
- const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
{ "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel },
{ "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel},
@@ -100,7 +100,7 @@ public:
}
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value) const {
+ uint64_t Value, bool IsPCRel) const override {
unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
assert(Fixup.getOffset() + Size <= DataSize &&
@@ -117,16 +117,15 @@ public:
Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
}
- bool mayNeedRelaxation(const MCInst &Inst) const;
+ bool mayNeedRelaxation(const MCInst &Inst) const override;
- bool fixupNeedsRelaxation(const MCFixup &Fixup,
- uint64_t Value,
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const;
+ const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, MCInst &Res) const;
+ void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
};
} // end anonymous namespace
@@ -217,9 +216,9 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) {
case X86::CMP64mi8: return X86::CMP64mi32;
// PUSH
- case X86::PUSHi8: return X86::PUSHi32;
- case X86::PUSHi16: return X86::PUSHi32;
- case X86::PUSH64i8: return X86::PUSH64i32;
+ case X86::PUSH32i8: return X86::PUSHi32;
+ case X86::PUSH16i8: return X86::PUSHi16;
+ case X86::PUSH64i8: return X86::PUSH64i32;
case X86::PUSH64i16: return X86::PUSH64i32;
}
}
@@ -314,7 +313,7 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
{0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
};
- // This CPU doesnt support long nops. If needed add more.
+ // This CPU doesn't support long nops. If needed add more.
// FIXME: Can we get this from the subtarget somehow?
  // FIXME: We could generate something better than plain 0x90.
if (!HasNopl) {
@@ -347,14 +346,7 @@ class ELFX86AsmBackend : public X86AsmBackend {
public:
uint8_t OSABI;
ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU)
- : X86AsmBackend(T, CPU), OSABI(_OSABI) {
- HasReliableSymbolDifference = true;
- }
-
- virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
- const MCSectionELF &ES = static_cast<const MCSectionELF&>(Section);
- return ES.getFlags() & ELF::SHF_MERGE;
- }
+ : X86AsmBackend(T, CPU), OSABI(_OSABI) {}
};
class ELFX86_32AsmBackend : public ELFX86AsmBackend {
@@ -362,7 +354,7 @@ public:
ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
: ELFX86AsmBackend(T, OSABI, CPU) {}
- MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386);
}
};
@@ -372,7 +364,7 @@ public:
ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
: ELFX86AsmBackend(T, OSABI, CPU) {}
- MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64);
}
};
@@ -386,7 +378,7 @@ public:
, Is64Bit(is64Bit) {
}
- MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
return createX86WinCOFFObjectWriter(OS, Is64Bit);
}
};
@@ -725,15 +717,15 @@ public:
StringRef CPU, bool SupportsCU)
: DarwinX86AsmBackend(T, MRI, CPU, false), SupportsCU(SupportsCU) {}
- MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
MachO::CPU_TYPE_I386,
MachO::CPU_SUBTYPE_I386_ALL);
}
/// \brief Generate the compact unwind encoding for the CFI instructions.
- virtual uint32_t
- generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const {
+ uint32_t generateCompactUnwindEncoding(
+ ArrayRef<MCCFIInstruction> Instrs) const override {
return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0;
}
};
@@ -747,15 +739,14 @@ public:
MachO::CPUSubTypeX86 st)
: DarwinX86AsmBackend(T, MRI, CPU, true), SupportsCU(SupportsCU),
Subtype(st) {
- HasReliableSymbolDifference = true;
}
- MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
MachO::CPU_TYPE_X86_64, Subtype);
}
- virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+ bool doesSectionRequireSymbols(const MCSection &Section) const override {
// Temporary labels in the string literals sections require symbols. The
// issue is that the x86_64 relocation format does not allow symbol +
// offset, and so the linker does not have enough information to resolve the
@@ -765,32 +756,32 @@ public:
//
// See <rdar://problem/4765733>.
const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
- return SMO.getType() == MCSectionMachO::S_CSTRING_LITERALS;
+ return SMO.getType() == MachO::S_CSTRING_LITERALS;
}
- virtual bool isSectionAtomizable(const MCSection &Section) const {
+ bool isSectionAtomizable(const MCSection &Section) const override {
const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
// Fixed sized data sections are uniqued, they cannot be diced into atoms.
switch (SMO.getType()) {
default:
return true;
- case MCSectionMachO::S_4BYTE_LITERALS:
- case MCSectionMachO::S_8BYTE_LITERALS:
- case MCSectionMachO::S_16BYTE_LITERALS:
- case MCSectionMachO::S_LITERAL_POINTERS:
- case MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS:
- case MCSectionMachO::S_LAZY_SYMBOL_POINTERS:
- case MCSectionMachO::S_MOD_INIT_FUNC_POINTERS:
- case MCSectionMachO::S_MOD_TERM_FUNC_POINTERS:
- case MCSectionMachO::S_INTERPOSING:
+ case MachO::S_4BYTE_LITERALS:
+ case MachO::S_8BYTE_LITERALS:
+ case MachO::S_16BYTE_LITERALS:
+ case MachO::S_LITERAL_POINTERS:
+ case MachO::S_NON_LAZY_SYMBOL_POINTERS:
+ case MachO::S_LAZY_SYMBOL_POINTERS:
+ case MachO::S_MOD_INIT_FUNC_POINTERS:
+ case MachO::S_MOD_TERM_FUNC_POINTERS:
+ case MachO::S_INTERPOSING:
return false;
}
}
/// \brief Generate the compact unwind encoding for the CFI instructions.
- virtual uint32_t
- generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const {
+ uint32_t generateCompactUnwindEncoding(
+ ArrayRef<MCCFIInstruction> Instrs) const override {
return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0;
}
};
@@ -803,12 +794,12 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
StringRef CPU) {
Triple TheTriple(TT);
- if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
+ if (TheTriple.isOSBinFormatMachO())
return new DarwinX86_32AsmBackend(T, MRI, CPU,
TheTriple.isMacOSX() &&
!TheTriple.isMacOSXVersionLT(10, 7));
- if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+ if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
return new WindowsX86AsmBackend(T, false, CPU);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
@@ -821,7 +812,7 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
StringRef CPU) {
Triple TheTriple(TT);
- if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) {
+ if (TheTriple.isOSBinFormatMachO()) {
MachO::CPUSubTypeX86 CS =
StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
.Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
@@ -831,7 +822,7 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
!TheTriple.isMacOSXVersionLT(10, 7), CS);
}
- if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+ if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
return new WindowsX86AsmBackend(T, true, CPU);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
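
The two createX86_*AsmBackend hunks above replace OS/environment checks with the triple's object-format queries. A minimal sketch of that selection logic, using the real llvm::Triple API but returning a label instead of constructing the backend classes, so it compiles against just the LLVM headers:

#include "llvm/ADT/Triple.h"
#include <cstdio>

// Pick the object-format-specific backend family from the triple alone.
static const char *backendKindFor(const llvm::Triple &T) {
  if (T.isOSBinFormatMachO())
    return "Mach-O (Darwin) backend";
  if (T.isOSWindows() && !T.isOSBinFormatELF())
    return "COFF (Windows) backend";
  return "ELF backend";
}

int main() {
  std::printf("%s\n", backendKindFor(llvm::Triple("x86_64-apple-macosx10.9")));
  std::printf("%s\n", backendKindFor(llvm::Triple("i686-pc-win32")));
  std::printf("%s\n", backendKindFor(llvm::Triple("i686-pc-linux-gnu")));
  return 0;
}
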
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 1ef9814..38fab15 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -18,9 +18,9 @@
#define X86BASEINFO_H
#include "X86MCTargetDesc.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/MC/MCInstrInfo.h"
namespace llvm {
@@ -255,6 +255,38 @@ namespace X86II {
///
MRMSrcMem = 6,
+ /// RawFrmMemOffs - This form is for instructions that store an absolute
+ /// memory offset as an immediate with a possible segment override.
+ RawFrmMemOffs = 7,
+
+ /// RawFrmSrc - This form is for instructions that use the source index
+ /// register SI/ESI/RSI with a possible segment override.
+ RawFrmSrc = 8,
+
+ /// RawFrmDst - This form is for instructions that use the destination index
+  /// register DI/EDI/RDI.
+ RawFrmDst = 9,
+
+  /// RawFrmDstSrc - This form is for instructions that use the source index
+  /// register SI/ESI/RSI with a possible segment override, and also the
+  /// destination index register DI/EDI/RDI.
+ RawFrmDstSrc = 10,
+
+ /// RawFrmImm8 - This is used for the ENTER instruction, which has two
+ /// immediates, the first of which is a 16-bit immediate (specified by
+  /// the imm encoding) and the second is an 8-bit fixed value.
+ RawFrmImm8 = 11,
+
+ /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
+ /// immediates, the first of which is a 16 or 32-bit immediate (specified by
+ /// the imm encoding) and the second is a 16-bit fixed value. In the AMD
+ /// manual, this operand is described as pntr16:32 and pntr16:16
+ RawFrmImm16 = 12,
+
+  /// MRMX[rm] - These forms are used to represent instructions that use a
+ /// Mod/RM byte, and don't use the middle field for anything.
+ MRMXr = 14, MRMXm = 15,
+
/// MRM[0-7][rm] - These forms are used to represent instructions that use
/// a Mod/RM byte, and use the middle field to hold extended opcode
/// information. In the intel manual these are represented as /0, /1, ...
@@ -268,94 +300,83 @@ namespace X86II {
MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3
MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7
- // MRMInitReg - This form is used for instructions whose source and
- // destinations are the same register.
- MRMInitReg = 32,
-
//// MRM_XX - A mod/rm byte of exactly 0xXX.
- MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36,
- MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39, MRM_CB = 40,
- MRM_E8 = 41, MRM_F0 = 42, MRM_F8 = 45, MRM_F9 = 46,
- MRM_D0 = 47, MRM_D1 = 48, MRM_D4 = 49, MRM_D5 = 50,
- MRM_D6 = 51, MRM_D8 = 52, MRM_D9 = 53, MRM_DA = 54,
- MRM_DB = 55, MRM_DC = 56, MRM_DD = 57, MRM_DE = 58,
- MRM_DF = 59,
-
- /// RawFrmImm8 - This is used for the ENTER instruction, which has two
- /// immediates, the first of which is a 16-bit immediate (specified by
- /// the imm encoding) and the second is a 8-bit fixed value.
- RawFrmImm8 = 43,
-
- /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
- /// immediates, the first of which is a 16 or 32-bit immediate (specified by
- /// the imm encoding) and the second is a 16-bit fixed value. In the AMD
- /// manual, this operand is described as pntr16:32 and pntr16:16
- RawFrmImm16 = 44,
-
- FormMask = 63,
+ MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35,
+ MRM_C4 = 36, MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39,
+ MRM_CB = 40, MRM_D0 = 41, MRM_D1 = 42, MRM_D4 = 43,
+ MRM_D5 = 44, MRM_D6 = 45, MRM_D8 = 46, MRM_D9 = 47,
+ MRM_DA = 48, MRM_DB = 49, MRM_DC = 50, MRM_DD = 51,
+ MRM_DE = 52, MRM_DF = 53, MRM_E0 = 54, MRM_E1 = 55,
+ MRM_E2 = 56, MRM_E3 = 57, MRM_E4 = 58, MRM_E5 = 59,
+ MRM_E8 = 60, MRM_E9 = 61, MRM_EA = 62, MRM_EB = 63,
+ MRM_EC = 64, MRM_ED = 65, MRM_EE = 66, MRM_F0 = 67,
+ MRM_F1 = 68, MRM_F2 = 69, MRM_F3 = 70, MRM_F4 = 71,
+ MRM_F5 = 72, MRM_F6 = 73, MRM_F7 = 74, MRM_F8 = 75,
+ MRM_F9 = 76, MRM_FA = 77, MRM_FB = 78, MRM_FC = 79,
+ MRM_FD = 80, MRM_FE = 81, MRM_FF = 82,
+
+ FormMask = 127,
//===------------------------------------------------------------------===//
// Actual flags...
- // OpSize - Set if this instruction requires an operand size prefix (0x66),
- // which most often indicates that the instruction operates on 16 bit data
- // instead of 32 bit data.
- OpSize = 1 << 6,
+ // OpSize - OpSizeFixed implies instruction never needs a 0x66 prefix.
+ // OpSize16 means this is a 16-bit instruction and needs 0x66 prefix in
+  // 32-bit mode. OpSize32 means this is a 32-bit instruction and needs a 0x66
+ // prefix in 16-bit mode.
+ OpSizeShift = 7,
+ OpSizeMask = 0x3 << OpSizeShift,
+
+ OpSize16 = 1,
+ OpSize32 = 2,
  // AdSize - Set if this instruction requires an address size prefix (0x67),
  // which most often indicates that the instruction uses 16 bit addresses
  // instead of 32 bit addresses (or 32 bit addresses in 64 bit mode).
- AdSize = 1 << 7,
+ AdSizeShift = OpSizeShift + 2,
+ AdSize = 1 << AdSizeShift,
//===------------------------------------------------------------------===//
- // Op0Mask - There are several prefix bytes that are used to form two byte
- // opcodes. These are currently 0x0F, 0xF3, and 0xD8-0xDF. This mask is
- // used to obtain the setting of this field. If no bits in this field is
- // set, there is no prefix byte for obtaining a multibyte opcode.
+ // OpPrefix - There are several prefix bytes that are used as opcode
+ // extensions. These are 0x66, 0xF3, and 0xF2. If this field is 0 there is
+ // no prefix.
//
- Op0Shift = 8,
- Op0Mask = 0x1F << Op0Shift,
-
- // TB - TwoByte - Set if this instruction has a two byte opcode, which
- // starts with a 0x0F byte before the real opcode.
- TB = 1 << Op0Shift,
-
- // REP - The 0xF3 prefix byte indicating repetition of the following
- // instruction.
- REP = 2 << Op0Shift,
+ OpPrefixShift = AdSizeShift + 1,
+ OpPrefixMask = 0x7 << OpPrefixShift,
- // D8-DF - These escape opcodes are used by the floating point unit. These
- // values must remain sequential.
- D8 = 3 << Op0Shift, D9 = 4 << Op0Shift,
- DA = 5 << Op0Shift, DB = 6 << Op0Shift,
- DC = 7 << Op0Shift, DD = 8 << Op0Shift,
- DE = 9 << Op0Shift, DF = 10 << Op0Shift,
+ // PS, PD - Prefix code for packed single and double precision vector
+ // floating point operations performed in the SSE registers.
+ PS = 1 << OpPrefixShift, PD = 2 << OpPrefixShift,
// XS, XD - These prefix codes are for single and double precision scalar
// floating point operations performed in the SSE registers.
- XD = 11 << Op0Shift, XS = 12 << Op0Shift,
+ XS = 3 << OpPrefixShift, XD = 4 << OpPrefixShift,
- // T8, TA, A6, A7 - Prefix after the 0x0F prefix.
- T8 = 13 << Op0Shift, TA = 14 << Op0Shift,
- A6 = 15 << Op0Shift, A7 = 16 << Op0Shift,
+ //===------------------------------------------------------------------===//
+ // OpMap - This field determines which opcode map this instruction
+ // belongs to. i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc.
+ //
+ OpMapShift = OpPrefixShift + 3,
+ OpMapMask = 0x7 << OpMapShift,
- // T8XD - Prefix before and after 0x0F. Combination of T8 and XD.
- T8XD = 17 << Op0Shift,
+ // OB - OneByte - Set if this instruction has a one byte opcode.
+ OB = 0 << OpMapShift,
- // T8XS - Prefix before and after 0x0F. Combination of T8 and XS.
- T8XS = 18 << Op0Shift,
+ // TB - TwoByte - Set if this instruction has a two byte opcode, which
+ // starts with a 0x0F byte before the real opcode.
+ TB = 1 << OpMapShift,
- // TAXD - Prefix before and after 0x0F. Combination of TA and XD.
- TAXD = 19 << Op0Shift,
+ // T8, TA - Prefix after the 0x0F prefix.
+ T8 = 2 << OpMapShift, TA = 3 << OpMapShift,
// XOP8 - Prefix to include use of imm byte.
- XOP8 = 20 << Op0Shift,
+ XOP8 = 4 << OpMapShift,
// XOP9 - Prefix to exclude use of imm byte.
- XOP9 = 21 << Op0Shift,
+ XOP9 = 5 << OpMapShift,
// XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions.
- XOPA = 22 << Op0Shift,
+ XOPA = 6 << OpMapShift,
//===------------------------------------------------------------------===//
// REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
@@ -363,27 +384,28 @@ namespace X86II {
  // etc. We only care about the REX.W and REX.R bits and only the former is
// statically determined.
//
- REXShift = Op0Shift + 5,
+ REXShift = OpMapShift + 3,
REX_W = 1 << REXShift,
//===------------------------------------------------------------------===//
// This three-bit field describes the size of an immediate operand. Zero is
// unused so that we can tell if we forgot to set a value.
ImmShift = REXShift + 1,
- ImmMask = 7 << ImmShift,
+ ImmMask = 15 << ImmShift,
Imm8 = 1 << ImmShift,
Imm8PCRel = 2 << ImmShift,
Imm16 = 3 << ImmShift,
Imm16PCRel = 4 << ImmShift,
Imm32 = 5 << ImmShift,
Imm32PCRel = 6 << ImmShift,
- Imm64 = 7 << ImmShift,
+ Imm32S = 7 << ImmShift,
+ Imm64 = 8 << ImmShift,
//===------------------------------------------------------------------===//
// FP Instruction Classification... Zero is non-fp instruction.
// FPTypeMask - Mask for all of the FP types...
- FPTypeShift = ImmShift + 3,
+ FPTypeShift = ImmShift + 4,
FPTypeMask = 7 << FPTypeShift,
// NotFP - The default, set for instructions that do not use FP registers.
@@ -419,51 +441,64 @@ namespace X86II {
LOCKShift = FPTypeShift + 3,
LOCK = 1 << LOCKShift,
- // Segment override prefixes. Currently we just need ability to address
- // stuff in gs and fs segments.
- SegOvrShift = LOCKShift + 1,
- SegOvrMask = 3 << SegOvrShift,
- FS = 1 << SegOvrShift,
- GS = 2 << SegOvrShift,
+ // REP prefix
+ REPShift = LOCKShift + 1,
+ REP = 1 << REPShift,
+
+ // Execution domain for SSE instructions.
+ // 0 means normal, non-SSE instruction.
+ SSEDomainShift = REPShift + 1,
+
+ // Encoding
+ EncodingShift = SSEDomainShift + 2,
+ EncodingMask = 0x3 << EncodingShift,
+
+ // VEX - encoding using 0xC4/0xC5
+ VEX = 1,
+
+ /// XOP - Opcode prefix used by XOP instructions.
+ XOP = 2,
- // Execution domain for SSE instructions in bits 23, 24.
- // 0 in bits 23-24 means normal, non-SSE instruction.
- SSEDomainShift = SegOvrShift + 2,
+    // EVEX - Specifies that this instruction uses the EVEX form, which provides
+    // syntax support for up to 32 512-bit register operands and up to 7 16-bit
+ // mask operands as well as source operand data swizzling/memory operand
+ // conversion, eviction hint, and rounding mode.
+ EVEX = 3,
- OpcodeShift = SSEDomainShift + 2,
+ // Opcode
+ OpcodeShift = EncodingShift + 2,
//===------------------------------------------------------------------===//
/// VEX - The opcode prefix used by AVX instructions
VEXShift = OpcodeShift + 8,
- VEX = 1U << 0,
  /// VEX_W - Has an opcode-specific functionality, but is used in the same
/// way as REX_W is for regular SSE instructions.
- VEX_W = 1U << 1,
+ VEX_W = 1U << 0,
/// VEX_4V - Used to specify an additional AVX/SSE register. Several 2
/// address instructions in SSE are represented as 3 address ones in AVX
/// and the additional register is encoded in VEX_VVVV prefix.
- VEX_4V = 1U << 2,
+ VEX_4V = 1U << 1,
/// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode
/// operand 3 with VEX.vvvv.
- VEX_4VOp3 = 1U << 3,
+ VEX_4VOp3 = 1U << 2,
  /// VEX_I8IMM - Specifies that the last register used in an AVX instruction,
/// must be encoded in the i8 immediate field. This usually happens in
/// instructions with 4 operands.
- VEX_I8IMM = 1U << 4,
+ VEX_I8IMM = 1U << 3,
/// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
/// instruction uses 256-bit wide registers. This is usually auto detected
/// if a VR256 register is used, but some AVX instructions also have this
  /// field marked when using an f256 memory reference.
- VEX_L = 1U << 5,
+ VEX_L = 1U << 4,
// VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX
// prefix. Usually used for scalar instructions. Needed by disassembler.
- VEX_LIG = 1U << 6,
+ VEX_LIG = 1U << 5,
// TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field
// with following encoding:
@@ -473,26 +508,20 @@ namespace X86II {
// - 11 LIG (but, in insn encoding, leave VEX.L and EVEX.L in zeros.
// this will save 1 tsflag bit
- // VEX_EVEX - Specifies that this instruction use EVEX form which provides
- // syntax support up to 32 512-bit register operands and up to 7 16-bit
- // mask operands as well as source operand data swizzling/memory operand
- // conversion, eviction hint, and rounding mode.
- EVEX = 1U << 7,
-
// EVEX_K - Set if this instruction requires masking
- EVEX_K = 1U << 8,
+ EVEX_K = 1U << 6,
// EVEX_Z - Set if this instruction has EVEX.Z field set.
- EVEX_Z = 1U << 9,
+ EVEX_Z = 1U << 7,
// EVEX_L2 - Set if this instruction has EVEX.L' field set.
- EVEX_L2 = 1U << 10,
+ EVEX_L2 = 1U << 8,
// EVEX_B - Set if this instruction has EVEX.B field set.
- EVEX_B = 1U << 11,
+ EVEX_B = 1U << 9,
// EVEX_CD8E - compressed disp8 form, element-size
- EVEX_CD8EShift = VEXShift + 12,
+ EVEX_CD8EShift = VEXShift + 10,
EVEX_CD8EMask = 3,
// EVEX_CD8V - compressed disp8 form, vector-width
@@ -505,15 +534,14 @@ namespace X86II {
/// storing a classifier in the imm8 field. To simplify our implementation,
  /// we handle this by storing the classifier in the opcode field and using
/// this flag to indicate that the encoder should do the wacky 3DNow! thing.
- Has3DNow0F0FOpcode = 1U << 17,
+ Has3DNow0F0FOpcode = 1U << 15,
/// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in
/// ModRM or I8IMM. This is used for FMA4 and XOP instructions.
- MemOp4 = 1U << 18,
-
- /// XOP - Opcode prefix used by XOP instructions.
- XOP = 1U << 19
+ MemOp4 = 1U << 16,
+ /// Explicitly specified rounding control
+ EVEX_RC = 1U << 17
};
// getBaseOpcodeFor - This function returns the "base" X86 opcode for the
@@ -537,6 +565,7 @@ namespace X86II {
case X86II::Imm16:
case X86II::Imm16PCRel: return 2;
case X86II::Imm32:
+ case X86II::Imm32S:
case X86II::Imm32PCRel: return 4;
case X86II::Imm64: return 8;
}
@@ -554,6 +583,25 @@ namespace X86II {
case X86II::Imm8:
case X86II::Imm16:
case X86II::Imm32:
+ case X86II::Imm32S:
+ case X86II::Imm64:
+ return false;
+ }
+ }
+
+ /// isImmSigned - Return true if the immediate of the specified instruction's
+ /// TSFlags indicates that it is signed.
+ inline unsigned isImmSigned(uint64_t TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: llvm_unreachable("Unknown immediate signedness");
+ case X86II::Imm32S:
+ return true;
+ case X86II::Imm8:
+ case X86II::Imm8PCRel:
+ case X86II::Imm16:
+ case X86II::Imm16PCRel:
+ case X86II::Imm32:
+ case X86II::Imm32PCRel:
case X86II::Imm64:
return false;
}
@@ -596,9 +644,6 @@ namespace X86II {
///
inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
switch (TSFlags & X86II::FormMask) {
- case X86II::MRMInitReg:
- // FIXME: Remove this form.
- return -1;
default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
case X86II::Pseudo:
case X86II::RawFrm:
@@ -607,14 +652,17 @@ namespace X86II {
case X86II::MRMSrcReg:
case X86II::RawFrmImm8:
case X86II::RawFrmImm16:
+ case X86II::RawFrmMemOffs:
+ case X86II::RawFrmSrc:
+ case X86II::RawFrmDst:
+ case X86II::RawFrmDstSrc:
return -1;
case X86II::MRMDestMem:
return 0;
case X86II::MRMSrcMem: {
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
- bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
- bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+ bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
unsigned FirstMemOp = 1;
if (HasVEX_4V)
++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
@@ -627,11 +675,13 @@ namespace X86II {
// Opcode == X86::LEA16r || Opcode == X86::LEA32r)
return FirstMemOp;
}
+ case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
case X86II::MRM6r: case X86II::MRM7r:
return -1;
+ case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
@@ -642,15 +692,23 @@ namespace X86II {
++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV).
return FirstMemOp;
}
- case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3:
- case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9:
- case X86II::MRM_CA: case X86II::MRM_CB: case X86II::MRM_E8:
- case X86II::MRM_F0: case X86II::MRM_F8: case X86II::MRM_F9:
+ case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+ case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8:
+ case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4:
case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8:
case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB:
case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE:
- case X86II::MRM_DF:
+ case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1:
+ case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4:
+ case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9:
+ case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+ case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0:
+ case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3:
+ case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6:
+ case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9:
+ case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC:
+ case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF:
return -1;
}
}
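
With the reworked layout above, each TSFlags property is an independent shifted bit-field, so consumers recover an instruction's form, mandatory prefix and opcode map by plain mask-and-shift instead of the old Op0Shift special cases. A self-contained illustration; the constants are copied from the enum above, while the namespace, example value and output are made up for the sketch:

#include <cstdint>
#include <cstdio>

namespace sketch {
enum : uint64_t {
  FormMask      = 127,                      // bits 0-6: instruction form
  OpSizeShift   = 7,                        // 2 OpSize bits, then 1 AdSize bit
  OpPrefixShift = OpSizeShift + 3,          // bits 10-12: 66/F3/F2 prefix
  OpPrefixMask  = 0x7ULL << OpPrefixShift,
  OpMapShift    = OpPrefixShift + 3,        // bits 13-15: opcode map
  OpMapMask     = 0x7ULL << OpMapShift,
  // A few values mirroring the enum above.
  MRMSrcMem = 6,
  PD        = 2ULL << OpPrefixShift,        // mandatory 0x66 prefix
  TB        = 1ULL << OpMapShift,           // 0x0F opcode map
};
} // namespace sketch

int main() {
  // E.g. a "66 0F xx /r" style instruction: 66 prefix, two-byte map, MRMSrcMem.
  uint64_t TSFlags = sketch::TB | sketch::PD | sketch::MRMSrcMem;
  std::printf("form=%u prefix=%u map=%u\n",
              unsigned(TSFlags & sketch::FormMask),
              unsigned((TSFlags & sketch::OpPrefixMask) >> sketch::OpPrefixShift),
              unsigned((TSFlags & sketch::OpMapMask) >> sketch::OpMapShift));
  return 0;
}
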
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 3ddd865..c44d88d 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -24,9 +24,8 @@ namespace {
virtual ~X86ELFObjectWriter();
protected:
- virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsPCRel, bool IsRelocWithSymbol,
- int64_t Addend) const;
+ unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel) const override;
};
}
@@ -41,13 +40,10 @@ X86ELFObjectWriter::~X86ELFObjectWriter()
unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
const MCFixup &Fixup,
- bool IsPCRel,
- bool IsRelocWithSymbol,
- int64_t Addend) const {
+ bool IsPCRel) const {
// determine the type of the relocation
- MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
- MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+ MCSymbolRefExpr::VariantKind Modifier = Fixup.getAccessVariant();
unsigned Type;
if (getEMachine() == ELF::EM_X86_64) {
if (IsPCRel) {
@@ -57,6 +53,7 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
case FK_Data_8: Type = ELF::R_X86_64_PC64; break;
case FK_Data_4: Type = ELF::R_X86_64_PC32; break;
case FK_Data_2: Type = ELF::R_X86_64_PC16; break;
+ case FK_Data_1: Type = ELF::R_X86_64_PC8; break;
case FK_PCRel_8:
assert(Modifier == MCSymbolRefExpr::VK_None);
@@ -160,6 +157,28 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
Type = ELF::R_386_GOTPC;
break;
+ case FK_PCRel_1:
+ case FK_Data_1:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_386_PC8;
+ break;
+ }
+ break;
+
+ case FK_PCRel_2:
+ case FK_Data_2:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_386_PC16;
+ break;
+ }
+ break;
+
case X86::reloc_signed_4byte:
case FK_PCRel_4:
case FK_Data_4:
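
The hunk above extends GetRelocType so that 1- and 2-byte PC-relative data fixups also map to relocations. With a plain (VK_None) symbol reference the mapping is a direct switch on the fixup kind; a standalone sketch of that table, with string labels standing in for the ELF::R_* constants:

#include <cstdio>

enum FixupKind { FK_Data_1, FK_Data_2, FK_Data_4, FK_Data_8 };

// PC-relative data fixup -> relocation, for x86-64 and i386 respectively.
static const char *pcRelRelocFor(FixupKind Kind, bool IsELF64) {
  switch (Kind) {
  case FK_Data_1: return IsELF64 ? "R_X86_64_PC8"  : "R_386_PC8";
  case FK_Data_2: return IsELF64 ? "R_X86_64_PC16" : "R_386_PC16";
  case FK_Data_4: return IsELF64 ? "R_X86_64_PC32" : "R_386_PC32";
  case FK_Data_8: return IsELF64 ? "R_X86_64_PC64" : "<unsupported>";
  }
  return "<unknown>";
}

int main() {
  std::printf("%s %s\n", pcRelRelocFor(FK_Data_1, /*IsELF64=*/true),
              pcRelRelocFor(FK_Data_2, /*IsELF64=*/false));
  return 0;
}
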
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
index a3eb4fb..4fa519c 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
@@ -11,8 +11,8 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/ELF.h"
@@ -25,7 +25,7 @@ class X86_64ELFRelocationInfo : public MCRelocationInfo {
public:
X86_64ELFRelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
- const MCExpr *createExprForRelocation(RelocationRef Rel) {
+ const MCExpr *createExprForRelocation(RelocationRef Rel) override {
uint64_t RelType; Rel.getType(RelType);
symbol_iterator SymI = Rel.getSymbol();
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 3861e1c..6561804 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -65,6 +65,19 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
// Exceptions handling
ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ // old assembler lacks some directives
+ // FIXME: this should really be a check on the assembler characteristics
+ // rather than OS version
+ if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6))
+ HasWeakDefCanBeHiddenDirective = false;
+
+ // FIXME: this should not depend on the target OS version, but on the ld64
+ // version in use. From at least >= ld64-97.17 (Xcode 3.2.6) the abs-ified
+ // FDE relocs may be used.
+ DwarfFDESymbolsUseAbsDiff = T.isMacOSX() && !T.isMacOSXVersionLT(10, 6);
+
+ UseIntegratedAssembler = true;
}
X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
@@ -89,8 +102,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
TextAlignFillValue = 0x90;
- PrivateGlobalPrefix = ".L";
-
// Set up DWARF directives
HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
@@ -105,6 +116,10 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) &&
T.getArch() == Triple::x86)
Data64bitsDirective = 0;
+
+ // Always enable the integrated assembler by default.
+ // Clang also enabled it when the OS is Solaris but that is redundant here.
+ UseIntegratedAssembler = true;
}
const MCExpr *
@@ -127,25 +142,23 @@ getNonexecutableStackSection(MCContext &Ctx) const {
void X86MCAsmInfoMicrosoft::anchor() { }
X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
- if (Triple.getArch() == Triple::x86_64) {
- GlobalPrefix = "";
+ if (Triple.getArch() == Triple::x86_64)
PrivateGlobalPrefix = ".L";
- }
AssemblerDialect = AsmWriterFlavor;
TextAlignFillValue = 0x90;
AllowAtInName = true;
+
+ UseIntegratedAssembler = true;
}
void X86MCAsmInfoGNUCOFF::anchor() { }
X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
- if (Triple.getArch() == Triple::x86_64) {
- GlobalPrefix = "";
+ if (Triple.getArch() == Triple::x86_64)
PrivateGlobalPrefix = ".L";
- }
AssemblerDialect = AsmWriterFlavor;
@@ -153,4 +166,6 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
// Exceptions handling
ExceptionsType = ExceptionHandling::DwarfCFI;
+
+ UseIntegratedAssembler = true;
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
index 80979dd..a7509b0 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -23,34 +23,34 @@ namespace llvm {
class Triple;
class X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
- virtual void anchor();
+ void anchor() override;
public:
explicit X86MCAsmInfoDarwin(const Triple &Triple);
};
struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
- virtual const MCExpr *
- getExprForPersonalitySymbol(const MCSymbol *Sym,
- unsigned Encoding,
- MCStreamer &Streamer) const;
+ const MCExpr *
+ getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+ MCStreamer &Streamer) const override;
};
class X86ELFMCAsmInfo : public MCAsmInfoELF {
- virtual void anchor();
+ void anchor() override;
public:
explicit X86ELFMCAsmInfo(const Triple &Triple);
- virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const;
+ const MCSection *
+ getNonexecutableStackSection(MCContext &Ctx) const override;
};
class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
- virtual void anchor();
+ void anchor() override;
public:
explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
};
class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
- virtual void anchor();
+ void anchor() override;
public:
explicit X86MCAsmInfoGNUCOFF(const Triple &Triple);
};
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 7952607..e6fb037 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -32,24 +32,43 @@ class X86MCCodeEmitter : public MCCodeEmitter {
X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
void operator=(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
const MCInstrInfo &MCII;
- const MCSubtargetInfo &STI;
MCContext &Ctx;
public:
- X86MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
- MCContext &ctx)
- : MCII(mcii), STI(sti), Ctx(ctx) {
+ X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : MCII(mcii), Ctx(ctx) {
}
~X86MCCodeEmitter() {}
- bool is64BitMode() const {
- // FIXME: Can tablegen auto-generate this?
+ bool is64BitMode(const MCSubtargetInfo &STI) const {
return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
}
- bool is32BitMode() const {
- // FIXME: Can tablegen auto-generate this?
- return (STI.getFeatureBits() & X86::Mode64Bit) == 0;
+ bool is32BitMode(const MCSubtargetInfo &STI) const {
+ return (STI.getFeatureBits() & X86::Mode32Bit) != 0;
+ }
+
+ bool is16BitMode(const MCSubtargetInfo &STI) const {
+ return (STI.getFeatureBits() & X86::Mode16Bit) != 0;
+ }
+
+ /// Is16BitMemOperand - Return true if the specified instruction has
+ /// a 16-bit memory operand. Op specifies the operand # of the memoperand.
+ bool Is16BitMemOperand(const MCInst &MI, unsigned Op,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+ const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp);
+
+ if (is16BitMode(STI) && BaseReg.getReg() == 0 &&
+ Disp.isImm() && Disp.getImm() < 0x10000)
+ return true;
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
}
unsigned GetX86RegNum(const MCOperand &MO) const {
@@ -126,21 +145,23 @@ public:
void EmitMemModRMByte(const MCInst &MI, unsigned Op,
unsigned RegOpcodeField,
uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
const MCInst &MI, const MCInstrDesc &Desc,
raw_ostream &OS) const;
- void EmitSegmentOverridePrefix(uint64_t TSFlags, unsigned &CurByte,
- int MemOperand, const MCInst &MI,
- raw_ostream &OS) const;
+ void EmitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand,
+ const MCInst &MI, raw_ostream &OS) const;
void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
const MCInst &MI, const MCInstrDesc &Desc,
+ const MCSubtargetInfo &STI,
raw_ostream &OS) const;
};
@@ -151,7 +172,7 @@ MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
- return new X86MCCodeEmitter(MCII, STI, Ctx);
+ return new X86MCCodeEmitter(MCII, Ctx);
}
/// isDisp8 - Return true if this signed displacement fits in an 8-bit
@@ -163,7 +184,7 @@ static bool isDisp8(int Value) {
/// isCDisp8 - Return true if this signed displacement fits in an 8-bit
/// compressed displacement field.
static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
- assert(((TSFlags >> X86II::VEXShift) & X86II::EVEX) &&
+ assert((TSFlags & X86II::EncodingMask) >> X86II::EncodingShift == X86II::EVEX &&
"Compressed 8-bit displacement is only valid for EVEX inst.");
unsigned CD8E = (TSFlags >> X86II::EVEX_CD8EShift) & X86II::EVEX_CD8EMask;
@@ -198,7 +219,7 @@ static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
if (Value & MemObjMask) // Unaligned offset
return false;
- Value /= MemObjSize;
+ Value /= (int)MemObjSize;
bool Ret = (Value == (signed char)Value);
if (Ret)
@@ -212,6 +233,12 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
unsigned Size = X86II::getSizeOfImm(TSFlags);
bool isPCRel = X86II::isImmPCRel(TSFlags);
+ if (X86II::isImmSigned(TSFlags)) {
+ switch (Size) {
+ default: llvm_unreachable("Unsupported signed fixup size!");
+ case 4: return MCFixupKind(X86::reloc_signed_4byte);
+ }
+ }
return MCFixup::getKindForSize(Size, isPCRel);
}
@@ -245,20 +272,6 @@ static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) {
}
#endif
-/// Is16BitMemOperand - Return true if the specified instruction has
-/// a 16-bit memory operand. Op specifies the operand # of the memoperand.
-static bool Is16BitMemOperand(const MCInst &MI, unsigned Op) {
- const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
- const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
-
- if ((BaseReg.getReg() != 0 &&
- X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
- (IndexReg.getReg() != 0 &&
- X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
- return true;
- return false;
-}
-
/// StartsWithGlobalOffsetTable - Check if this expression starts with
/// _GLOBAL_OFFSET_TABLE_ and if it is of the form
/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF
@@ -366,17 +379,20 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
unsigned RegOpcodeField,
uint64_t TSFlags, unsigned &CurByte,
raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const{
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const{
const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp);
const MCOperand &Base = MI.getOperand(Op+X86::AddrBaseReg);
const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt);
const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
unsigned BaseReg = Base.getReg();
- bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
+ unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
+ X86II::EncodingShift;
+ bool HasEVEX = (Encoding == X86II::EVEX);
// Handle %rip relative addressing.
if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode
- assert(is64BitMode() && "Rip-relative addressing requires 64-bit mode");
+ assert(is64BitMode(STI) && "Rip-relative addressing requires 64-bit mode");
assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
@@ -402,6 +418,66 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U;
+ // 16-bit addressing forms of the ModR/M byte have a different encoding for
+ // the R/M field and are far more limited in which registers can be used.
+ if (Is16BitMemOperand(MI, Op, STI)) {
+ if (BaseReg) {
+ // For 32-bit addressing, the row and column values in Table 2-2 are
+ // basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with
+ // some special cases. And GetX86RegNum reflects that numbering.
+ // For 16-bit addressing it's more fun, as shown in the SDM Vol 2A,
+ // Table 2-1 "16-Bit Addressing Forms with the ModR/M byte". We can only
+ // use SI/DI/BP/BX, which have "row" values 4-7 in no particular order,
+ // while values 0-3 indicate the allowed combinations (base+index) of
+ // those: 0 for BX+SI, 1 for BX+DI, 2 for BP+SI, 3 for BP+DI.
+ //
+      // R16Table[] is a lookup from the normal RegNo to the row values from
+      // Table 2-1 for 16-bit addressing modes, where zero means disallowed.
+ static const unsigned R16Table[] = { 0, 0, 0, 7, 0, 6, 4, 5 };
+ unsigned RMfield = R16Table[BaseRegNo];
+
+ assert(RMfield && "invalid 16-bit base register");
+
+ if (IndexReg.getReg()) {
+ unsigned IndexReg16 = R16Table[GetX86RegNum(IndexReg)];
+
+ assert(IndexReg16 && "invalid 16-bit index register");
+ // We must have one of SI/DI (4,5), and one of BP/BX (6,7).
+ assert(((IndexReg16 ^ RMfield) & 2) &&
+ "invalid 16-bit base/index register combination");
+ assert(Scale.getImm() == 1 &&
+ "invalid scale for 16-bit memory reference");
+
+ // Allow base/index to appear in either order (although GAS doesn't).
+ if (IndexReg16 & 2)
+ RMfield = (RMfield & 1) | ((7 - IndexReg16) << 1);
+ else
+ RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1);
+ }
+
+ if (Disp.isImm() && isDisp8(Disp.getImm())) {
+ if (Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+ // There is no displacement; just the register.
+ EmitByte(ModRMByte(0, RegOpcodeField, RMfield), CurByte, OS);
+ return;
+ }
+ // Use the [REG]+disp8 form, including for [BP] which cannot be encoded.
+ EmitByte(ModRMByte(1, RegOpcodeField, RMfield), CurByte, OS);
+ EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ return;
+ }
+ // This is the [REG]+disp16 case.
+ EmitByte(ModRMByte(2, RegOpcodeField, RMfield), CurByte, OS);
+ } else {
+ // There is no BaseReg; this is the plain [disp16] case.
+ EmitByte(ModRMByte(0, RegOpcodeField, 6), CurByte, OS);
+ }
+
+ // Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases.
+ EmitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups);
+ return;
+ }
+
// Determine whether a SIB byte is needed.
// If no BaseReg, issue a RIP relative instruction only if the MCE can
// resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
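
The 16-bit addressing block added in the hunk above folds a [base+index] pair into the 3-bit R/M value of SDM Table 2-1 via R16Table. A worked standalone example of just that computation; register numbers use the usual AX=0 ... DI=7 ordering, and the function is an illustrative rewrite, not the emitter code itself:

#include <cassert>
#include <cstdio>

// Row values from SDM Vol 2A Table 2-1, indexed by the normal register
// number; zero marks registers that cannot appear in a 16-bit address.
static const unsigned R16Table[] = { 0, 0, 0, 7, 0, 6, 4, 5 };

// Compute the 3-bit R/M field for a 16-bit [base+index] memory operand.
static unsigned rm16(unsigned BaseRegNo, unsigned IndexRegNo) {
  unsigned RMfield = R16Table[BaseRegNo];
  unsigned IndexReg16 = R16Table[IndexRegNo];
  assert(RMfield && IndexReg16 && "invalid 16-bit base/index register");
  assert(((IndexReg16 ^ RMfield) & 2) && "need one of SI/DI and one of BP/BX");
  if (IndexReg16 & 2)  // index is BP/BX: low bit comes from the SI/DI base
    RMfield = (RMfield & 1) | ((7 - IndexReg16) << 1);
  else                 // index is SI/DI: low bit comes from the index
    RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1);
  return RMfield;      // 0=[BX+SI] 1=[BX+DI] 2=[BP+SI] 3=[BP+DI]
}

int main() {
  std::printf("[BX+SI] -> R/M %u\n", rm16(3, 6)); // 0
  std::printf("[BP+DI] -> R/M %u\n", rm16(5, 7)); // 3
  std::printf("[SI+BX] -> R/M %u\n", rm16(6, 3)); // 0, order-independent
  return 0;
}
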
@@ -415,7 +491,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
BaseRegNo != N86::ESP &&
// If there is no base register and we're in 64-bit mode, we need a SIB
// byte to emit an addr that is just 'disp32' (the non-RIP relative form).
- (!is64BitMode() || BaseReg != 0)) {
+ (!is64BitMode(STI) || BaseReg != 0)) {
if (BaseReg == 0) { // [disp32] in X86-32 mode
EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
@@ -530,11 +606,13 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
int MemOperand, const MCInst &MI,
const MCInstrDesc &Desc,
raw_ostream &OS) const {
- bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
- bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+ unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
+ X86II::EncodingShift;
+ bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+ bool HasEVEX_RC = (TSFlags >> X86II::VEXShift) & X86II::EVEX_RC;
  // VEX_R: opcode extension equivalent to REX.R in
// 1's complement (inverted) form
@@ -563,9 +641,6 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// opcode extension, or ignored, depending on the opcode byte)
unsigned char VEX_W = 0;
- // XOP: Use XOP prefix byte 0x8f instead of VEX.
- bool XOP = false;
-
// VEX_5M (VEX m-mmmmm field):
//
// 0b00000: Reserved for future use
@@ -576,7 +651,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 0b01000: XOP map select - 08h instructions with imm byte
// 0b01001: XOP map select - 09h instructions with no imm byte
// 0b01010: XOP map select - 0Ah instructions with imm dword
- unsigned char VEX_5M = 0x1;
+ unsigned char VEX_5M = 0;
// VEX_4V (VEX vvvv field): a register specifier
// (in 1's complement form) or 1111 if unused.
@@ -610,91 +685,53 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// EVEX_b
unsigned char EVEX_b = 0;
+ // EVEX_rc
+ unsigned char EVEX_rc = 0;
+
// EVEX_aaa
unsigned char EVEX_aaa = 0;
- // Encode the operand size opcode prefix as needed.
- if (TSFlags & X86II::OpSize)
- VEX_PP = 0x01;
+ bool EncodeRC = false;
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
VEX_W = 1;
- if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
- XOP = true;
-
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
VEX_L = 1;
- if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2))
+ if (((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2))
EVEX_L2 = 1;
if (HasEVEX_K && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_Z))
EVEX_z = 1;
- if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_B))
+ if (((TSFlags >> X86II::VEXShift) & X86II::EVEX_B))
EVEX_b = 1;
- switch (TSFlags & X86II::Op0Mask) {
- default: llvm_unreachable("Invalid prefix!");
- case X86II::T8: // 0F 38
- VEX_5M = 0x2;
- break;
- case X86II::TA: // 0F 3A
- VEX_5M = 0x3;
- break;
- case X86II::T8XS: // F3 0F 38
- VEX_PP = 0x2;
- VEX_5M = 0x2;
- break;
- case X86II::T8XD: // F2 0F 38
- VEX_PP = 0x3;
- VEX_5M = 0x2;
- break;
- case X86II::TAXD: // F2 0F 3A
- VEX_PP = 0x3;
- VEX_5M = 0x3;
- break;
- case X86II::XS: // F3 0F
- VEX_PP = 0x2;
- break;
- case X86II::XD: // F2 0F
- VEX_PP = 0x3;
- break;
- case X86II::XOP8:
- VEX_5M = 0x8;
- break;
- case X86II::XOP9:
- VEX_5M = 0x9;
- break;
- case X86II::XOPA:
- VEX_5M = 0xA;
- break;
- case X86II::TB: // VEX_5M/VEX_PP already correct
- break;
+ switch (TSFlags & X86II::OpPrefixMask) {
+ default: break; // VEX_PP already correct
+ case X86II::PD: VEX_PP = 0x1; break; // 66
+ case X86II::XS: VEX_PP = 0x2; break; // F3
+ case X86II::XD: VEX_PP = 0x3; break; // F2
}
+ switch (TSFlags & X86II::OpMapMask) {
+ default: llvm_unreachable("Invalid prefix!");
+ case X86II::TB: VEX_5M = 0x1; break; // 0F
+ case X86II::T8: VEX_5M = 0x2; break; // 0F 38
+ case X86II::TA: VEX_5M = 0x3; break; // 0F 3A
+ case X86II::XOP8: VEX_5M = 0x8; break;
+ case X86II::XOP9: VEX_5M = 0x9; break;
+ case X86II::XOPA: VEX_5M = 0xA; break;
+ }
// Classify VEX_B, VEX_4V, VEX_R, VEX_X
unsigned NumOps = Desc.getNumOperands();
- unsigned CurOp = 0;
- if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
- ++CurOp;
- else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
- Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
- // Special case for AVX-512 GATHER with 2 TIED_TO operands
- // Skip the first 2 operands: dst, mask_wb
- CurOp += 2;
- else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
- Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
- // Special case for GATHER with 2 TIED_TO operands
- // Skip the first 2 operands: dst, mask_wb
- CurOp += 2;
- else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
- // SCATTER
- ++CurOp;
+ unsigned CurOp = X86II::getOperandBias(Desc);
switch (TSFlags & X86II::FormMask) {
- case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
+ default: llvm_unreachable("Unexpected form in EmitVEXOpcodePrefix!");
+ case X86II::RawFrm:
+ break;
case X86II::MRMDestMem: {
// MRMDestMem instructions forms:
// MemAddr, src1(ModR/M)
@@ -707,7 +744,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
X86::AddrIndexReg).getReg()))
VEX_X = 0x0;
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand +
+ if (X86II::is32ExtendedReg(MI.getOperand(MemOperand +
X86::AddrIndexReg).getReg()))
EVEX_V2 = 0x0;
@@ -718,7 +755,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (HasVEX_4V) {
VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
EVEX_V2 = 0x0;
CurOp++;
}
@@ -727,7 +764,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (MO.isReg()) {
if (X86II::isX86_64ExtendedReg(MO.getReg()))
VEX_R = 0x0;
- if (HasEVEX && X86II::is32ExtendedReg(MO.getReg()))
+ if (X86II::is32ExtendedReg(MO.getReg()))
EVEX_R2 = 0x0;
}
break;
@@ -744,7 +781,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_R = 0x0;
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
EVEX_R2 = 0x0;
CurOp++;
@@ -753,7 +790,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (HasVEX_4V) {
VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
EVEX_V2 = 0x0;
CurOp++;
}
@@ -764,8 +801,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (X86II::isX86_64ExtendedReg(
MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
VEX_X = 0x0;
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand +
- X86::AddrIndexReg).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(MemOperand +
+ X86::AddrIndexReg).getReg()))
EVEX_V2 = 0x0;
if (HasVEX_4VOp3)
@@ -785,7 +822,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// src1(VEX_4V), MemAddr
if (HasVEX_4V) {
VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
EVEX_V2 = 0x0;
CurOp++;
}
@@ -812,7 +849,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_R = 0x0;
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
EVEX_R2 = 0x0;
CurOp++;
@@ -821,7 +858,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (HasVEX_4V) {
VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
EVEX_V2 = 0x0;
CurOp++;
}
@@ -831,11 +868,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_B = 0x0;
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_X = 0x0;
CurOp++;
if (HasVEX_4VOp3)
- VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+ if (EVEX_b) {
+ if (HasEVEX_RC) {
+ unsigned RcOperand = NumOps-1;
+ assert(RcOperand >= CurOp);
+ EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3;
+ }
+ EncodeRC = true;
+ }
break;
case X86II::MRMDestReg:
// MRMDestReg instructions forms:
@@ -844,7 +889,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// dst(ModR/M), src1(VEX_4V), src2(ModR/M)
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_B = 0x0;
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_X = 0x0;
CurOp++;
@@ -853,15 +898,17 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (HasVEX_4V) {
VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
EVEX_V2 = 0x0;
CurOp++;
}
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_R = 0x0;
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
EVEX_R2 = 0x0;
+ if (EVEX_b)
+ EncodeRC = true;
break;
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
@@ -871,26 +918,21 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// dst(VEX_4V), src(ModR/M), imm8
if (HasVEX_4V) {
VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
EVEX_V2 = 0x0;
CurOp++;
- }
+ }
if (HasEVEX_K)
EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_B = 0x0;
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_X = 0x0;
break;
- default: // RawFrm
- break;
}
- // Emit segment override opcode prefix as needed.
- EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
-
- if (!HasEVEX) {
+ if (Encoding == X86II::VEX || Encoding == X86II::XOP) {
// VEX opcode prefix can have 2 or 3 bytes
//
// 3 bytes:
@@ -902,19 +944,25 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// | C5h | | R | vvvv | L | pp |
// +-----+ +-------------------+
//
+ // XOP uses a similar prefix:
+ // +-----+ +--------------+ +-------------------+
+ // | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
- if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
+ // Can we use the 2 byte VEX prefix?
+ if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
EmitByte(0xC5, CurByte, OS);
EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
return;
}
// 3 byte VEX prefix
- EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS);
+ EmitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS);
EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
} else {
+ assert(Encoding == X86II::EVEX && "unknown encoding!");
// EVEX opcode prefix can have 4 bytes
//
// +-----+ +--------------+ +-------------------+ +------------------------+
@@ -935,12 +983,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
(VEX_4V << 3) |
(EVEX_U << 2) |
VEX_PP, CurByte, OS);
- EmitByte((EVEX_z << 7) |
- (EVEX_L2 << 6) |
- (VEX_L << 5) |
- (EVEX_b << 4) |
- (EVEX_V2 << 3) |
- EVEX_aaa, CurByte, OS);
+ if (EncodeRC)
+ EmitByte((EVEX_z << 7) |
+ (EVEX_rc << 5) |
+ (EVEX_b << 4) |
+ (EVEX_V2 << 3) |
+ EVEX_aaa, CurByte, OS);
+ else
+ EmitByte((EVEX_z << 7) |
+ (EVEX_L2 << 6) |
+ (VEX_L << 5) |
+ (EVEX_b << 4) |
+ (EVEX_V2 << 3) |
+ EVEX_aaa, CurByte, OS);
}
}
@@ -974,7 +1029,6 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
}
switch (TSFlags & X86II::FormMask) {
- case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
case X86II::MRMSrcReg:
if (MI.getOperand(0).isReg() &&
X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
@@ -1002,6 +1056,7 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
}
break;
}
+ case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
@@ -1039,33 +1094,20 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
}
/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
-void X86MCCodeEmitter::EmitSegmentOverridePrefix(uint64_t TSFlags,
- unsigned &CurByte, int MemOperand,
- const MCInst &MI,
- raw_ostream &OS) const {
- switch (TSFlags & X86II::SegOvrMask) {
- default: llvm_unreachable("Invalid segment!");
- case 0:
- // No segment override, check for explicit one on memory operand.
- if (MemOperand != -1) { // If the instruction has a memory operand.
- switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
- default: llvm_unreachable("Unknown segment register!");
- case 0: break;
- case X86::CS: EmitByte(0x2E, CurByte, OS); break;
- case X86::SS: EmitByte(0x36, CurByte, OS); break;
- case X86::DS: EmitByte(0x3E, CurByte, OS); break;
- case X86::ES: EmitByte(0x26, CurByte, OS); break;
- case X86::FS: EmitByte(0x64, CurByte, OS); break;
- case X86::GS: EmitByte(0x65, CurByte, OS); break;
- }
- }
- break;
- case X86II::FS:
- EmitByte(0x64, CurByte, OS);
- break;
- case X86II::GS:
- EmitByte(0x65, CurByte, OS);
- break;
+void X86MCCodeEmitter::EmitSegmentOverridePrefix(unsigned &CurByte,
+ unsigned SegOperand,
+ const MCInst &MI,
+ raw_ostream &OS) const {
+ // Check for explicit segment override on memory operand.
+ switch (MI.getOperand(SegOperand).getReg()) {
+ default: llvm_unreachable("Unknown segment register!");
+ case 0: break;
+ case X86::CS: EmitByte(0x2E, CurByte, OS); break;
+ case X86::SS: EmitByte(0x36, CurByte, OS); break;
+ case X86::DS: EmitByte(0x3E, CurByte, OS); break;
+ case X86::ES: EmitByte(0x26, CurByte, OS); break;
+ case X86::FS: EmitByte(0x64, CurByte, OS); break;
+ case X86::GS: EmitByte(0x65, CurByte, OS); break;
}
}
@@ -1076,118 +1118,56 @@ void X86MCCodeEmitter::EmitSegmentOverridePrefix(uint64_t TSFlags,
void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
int MemOperand, const MCInst &MI,
const MCInstrDesc &Desc,
+ const MCSubtargetInfo &STI,
raw_ostream &OS) const {
- // Emit the lock opcode prefix as needed.
- if (TSFlags & X86II::LOCK)
- EmitByte(0xF0, CurByte, OS);
-
- // Emit segment override opcode prefix as needed.
- EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
-
- // Emit the repeat opcode prefix as needed.
- if ((TSFlags & X86II::Op0Mask) == X86II::REP)
- EmitByte(0xF3, CurByte, OS);
-
- // Emit the address size opcode prefix as needed.
- bool need_address_override;
- if (TSFlags & X86II::AdSize) {
- need_address_override = true;
- } else if (MemOperand == -1) {
- need_address_override = false;
- } else if (is64BitMode()) {
- assert(!Is16BitMemOperand(MI, MemOperand));
- need_address_override = Is32BitMemOperand(MI, MemOperand);
- } else if (is32BitMode()) {
- assert(!Is64BitMemOperand(MI, MemOperand));
- need_address_override = Is16BitMemOperand(MI, MemOperand);
- } else {
- need_address_override = false;
- }
-
- if (need_address_override)
- EmitByte(0x67, CurByte, OS);
-
// Emit the operand size opcode prefix as needed.
- if (TSFlags & X86II::OpSize)
+ unsigned char OpSize = (TSFlags & X86II::OpSizeMask) >> X86II::OpSizeShift;
+ if (OpSize == (is16BitMode(STI) ? X86II::OpSize32 : X86II::OpSize16))
EmitByte(0x66, CurByte, OS);
- bool Need0FPrefix = false;
- switch (TSFlags & X86II::Op0Mask) {
- default: llvm_unreachable("Invalid prefix!");
- case 0: break; // No prefix!
- case X86II::REP: break; // already handled.
- case X86II::TB: // Two-byte opcode prefix
- case X86II::T8: // 0F 38
- case X86II::TA: // 0F 3A
- case X86II::A6: // 0F A6
- case X86II::A7: // 0F A7
- Need0FPrefix = true;
- break;
- case X86II::T8XS: // F3 0F 38
- EmitByte(0xF3, CurByte, OS);
- Need0FPrefix = true;
- break;
- case X86II::T8XD: // F2 0F 38
- EmitByte(0xF2, CurByte, OS);
- Need0FPrefix = true;
- break;
- case X86II::TAXD: // F2 0F 3A
- EmitByte(0xF2, CurByte, OS);
- Need0FPrefix = true;
+ switch (TSFlags & X86II::OpPrefixMask) {
+ case X86II::PD: // 66
+ EmitByte(0x66, CurByte, OS);
break;
- case X86II::XS: // F3 0F
+ case X86II::XS: // F3
EmitByte(0xF3, CurByte, OS);
- Need0FPrefix = true;
break;
- case X86II::XD: // F2 0F
+ case X86II::XD: // F2
EmitByte(0xF2, CurByte, OS);
- Need0FPrefix = true;
break;
- case X86II::D8: EmitByte(0xD8, CurByte, OS); break;
- case X86II::D9: EmitByte(0xD9, CurByte, OS); break;
- case X86II::DA: EmitByte(0xDA, CurByte, OS); break;
- case X86II::DB: EmitByte(0xDB, CurByte, OS); break;
- case X86II::DC: EmitByte(0xDC, CurByte, OS); break;
- case X86II::DD: EmitByte(0xDD, CurByte, OS); break;
- case X86II::DE: EmitByte(0xDE, CurByte, OS); break;
- case X86II::DF: EmitByte(0xDF, CurByte, OS); break;
}
// Handle REX prefix.
// FIXME: Can this come before F2 etc to simplify emission?
- if (is64BitMode()) {
+ if (is64BitMode(STI)) {
if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc))
EmitByte(0x40 | REX, CurByte, OS);
}
// 0x0F escape code must be emitted just before the opcode.
- if (Need0FPrefix)
+ switch (TSFlags & X86II::OpMapMask) {
+ case X86II::TB: // Two-byte opcode map
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
EmitByte(0x0F, CurByte, OS);
+ break;
+ }
- // FIXME: Pull this up into previous switch if REX can be moved earlier.
- switch (TSFlags & X86II::Op0Mask) {
- case X86II::T8XS: // F3 0F 38
- case X86II::T8XD: // F2 0F 38
+ switch (TSFlags & X86II::OpMapMask) {
case X86II::T8: // 0F 38
EmitByte(0x38, CurByte, OS);
break;
- case X86II::TAXD: // F2 0F 3A
case X86II::TA: // 0F 3A
EmitByte(0x3A, CurByte, OS);
break;
- case X86II::A6: // 0F A6
- EmitByte(0xA6, CurByte, OS);
- break;
- case X86II::A7: // 0F A7
- EmitByte(0xA7, CurByte, OS);
- break;
}
}
void X86MCCodeEmitter::
EncodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups) const {
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
unsigned Opcode = MI.getOpcode();
const MCInstrDesc &Desc = MCII.get(Opcode);
uint64_t TSFlags = Desc.TSFlags;
@@ -1202,8 +1182,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
// Keep track of the current byte being emitted.
unsigned CurByte = 0;
- // Is this instruction encoded using the AVX VEX prefix?
- bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX;
+ // Encoding type for this instruction.
+ unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
+ X86II::EncodingShift;
// It uses the VEX.VVVV field?
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
@@ -1212,15 +1193,58 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
const unsigned MemOp4_I8IMMOperand = 2;
// It uses the EVEX.aaa field?
- bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
- bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
-
+ bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+ bool HasEVEX_RC = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_RC);
+
// Determine where the memory operand starts, if present.
int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
if (MemoryOperand != -1) MemoryOperand += CurOp;
- if (!HasVEXPrefix)
- EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+ // Emit the lock opcode prefix as needed.
+ if (TSFlags & X86II::LOCK)
+ EmitByte(0xF0, CurByte, OS);
+
+ // Emit segment override opcode prefix as needed.
+ if (MemoryOperand >= 0)
+ EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg,
+ MI, OS);
+
+ // Emit the repeat opcode prefix as needed.
+ if (TSFlags & X86II::REP)
+ EmitByte(0xF3, CurByte, OS);
+
+ // Emit the address size opcode prefix as needed.
+ bool need_address_override;
+ // The AdSize prefix is only for 32-bit and 64-bit modes. Hm, perhaps we
+ // should introduce an AdSize16 bit instead of having seven special cases?
+ if ((!is16BitMode(STI) && TSFlags & X86II::AdSize) ||
+ (is16BitMode(STI) && (MI.getOpcode() == X86::JECXZ_32 ||
+ MI.getOpcode() == X86::MOV8o8a ||
+ MI.getOpcode() == X86::MOV16o16a ||
+ MI.getOpcode() == X86::MOV32o32a ||
+ MI.getOpcode() == X86::MOV8ao8 ||
+ MI.getOpcode() == X86::MOV16ao16 ||
+ MI.getOpcode() == X86::MOV32ao32))) {
+ need_address_override = true;
+ } else if (MemoryOperand < 0) {
+ need_address_override = false;
+ } else if (is64BitMode(STI)) {
+ assert(!Is16BitMemOperand(MI, MemoryOperand, STI));
+ need_address_override = Is32BitMemOperand(MI, MemoryOperand);
+ } else if (is32BitMode(STI)) {
+ assert(!Is64BitMemOperand(MI, MemoryOperand));
+ need_address_override = Is16BitMemOperand(MI, MemoryOperand, STI);
+ } else {
+ assert(is16BitMode(STI));
+ assert(!Is64BitMemOperand(MI, MemoryOperand));
+ need_address_override = !Is16BitMemOperand(MI, MemoryOperand, STI);
+ }
+
+ if (need_address_override)
+ EmitByte(0x67, CurByte, OS);
+
+ if (Encoding == 0)
+ EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS);
else
EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
@@ -1231,15 +1255,62 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
unsigned SrcRegNum = 0;
switch (TSFlags & X86II::FormMask) {
- case X86II::MRMInitReg:
- llvm_unreachable("FIXME: Remove this form when the JIT moves to MCCodeEmitter!");
default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n";
llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!");
case X86II::Pseudo:
llvm_unreachable("Pseudo instruction shouldn't be emitted");
+ case X86II::RawFrmDstSrc: {
+ unsigned siReg = MI.getOperand(1).getReg();
+ assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) ||
+ (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) ||
+ (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) &&
+ "SI and DI register sizes do not match");
+ // Emit segment override opcode prefix as needed (not for %ds).
+ if (MI.getOperand(2).getReg() != X86::DS)
+ EmitSegmentOverridePrefix(CurByte, 2, MI, OS);
+ // Emit AdSize prefix as needed.
+ if ((!is32BitMode(STI) && siReg == X86::ESI) ||
+ (is32BitMode(STI) && siReg == X86::SI))
+ EmitByte(0x67, CurByte, OS);
+ CurOp += 3; // Consume operands.
+ EmitByte(BaseOpcode, CurByte, OS);
+ break;
+ }
+ case X86II::RawFrmSrc: {
+ unsigned siReg = MI.getOperand(0).getReg();
+ // Emit segment override opcode prefix as needed (not for %ds).
+ if (MI.getOperand(1).getReg() != X86::DS)
+ EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
+ // Emit AdSize prefix as needed.
+ if ((!is32BitMode(STI) && siReg == X86::ESI) ||
+ (is32BitMode(STI) && siReg == X86::SI))
+ EmitByte(0x67, CurByte, OS);
+ CurOp += 2; // Consume operands.
+ EmitByte(BaseOpcode, CurByte, OS);
+ break;
+ }
+ case X86II::RawFrmDst: {
+ unsigned siReg = MI.getOperand(0).getReg();
+ // Emit AdSize prefix as needed.
+ if ((!is32BitMode(STI) && siReg == X86::EDI) ||
+ (is32BitMode(STI) && siReg == X86::DI))
+ EmitByte(0x67, CurByte, OS);
+ ++CurOp; // Consume operand.
+ EmitByte(BaseOpcode, CurByte, OS);
+ break;
+ }
case X86II::RawFrm:
EmitByte(BaseOpcode, CurByte, OS);
break;
+ case X86II::RawFrmMemOffs:
+ // Emit segment override opcode prefix as needed.
+ EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+ CurByte, OS, Fixups);
+ ++CurOp; // skip segment operand
+ break;
case X86II::RawFrmImm8:
EmitByte(BaseOpcode, CurByte, OS);
EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
@@ -1288,7 +1359,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitMemModRMByte(MI, CurOp,
GetX86RegNum(MI.getOperand(SrcRegNum)),
- TSFlags, CurByte, OS, Fixups);
+ TSFlags, CurByte, OS, Fixups, STI);
CurOp = SrcRegNum + 1;
break;
@@ -1312,6 +1383,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
if (HasVEX_4VOp3)
++CurOp;
+ // do not count the rounding control operand
+ if (HasEVEX_RC)
+ NumOps--;
break;
case X86II::MRMSrcMem: {
@@ -1333,49 +1407,65 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitByte(BaseOpcode, CurByte, OS);
EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
- TSFlags, CurByte, OS, Fixups);
+ TSFlags, CurByte, OS, Fixups, STI);
CurOp += AddrOperands + 1;
if (HasVEX_4VOp3)
++CurOp;
break;
}
+ case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
- case X86II::MRM6r: case X86II::MRM7r:
+ case X86II::MRM6r: case X86II::MRM7r: {
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
+ uint64_t Form = TSFlags & X86II::FormMask;
EmitRegModRMByte(MI.getOperand(CurOp++),
- (TSFlags & X86II::FormMask)-X86II::MRM0r,
+ (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r,
CurByte, OS);
break;
+ }
+
+ case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
- case X86II::MRM6m: case X86II::MRM7m:
+ case X86II::MRM6m: case X86II::MRM7m: {
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
- EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m,
- TSFlags, CurByte, OS, Fixups);
+ uint64_t Form = TSFlags & X86II::FormMask;
+ EmitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 0 : Form-X86II::MRM0m,
+ TSFlags, CurByte, OS, Fixups, STI);
CurOp += X86::AddrNumOperands;
break;
- case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3:
- case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9:
- case X86II::MRM_CA: case X86II::MRM_CB: case X86II::MRM_D0:
- case X86II::MRM_D1: case X86II::MRM_D4: case X86II::MRM_D5:
- case X86II::MRM_D6: case X86II::MRM_D8: case X86II::MRM_D9:
- case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC:
- case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF:
- case X86II::MRM_E8: case X86II::MRM_F0: case X86II::MRM_F8:
- case X86II::MRM_F9:
+ }
+ case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+ case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8:
+ case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+ case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4:
+ case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8:
+ case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB:
+ case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE:
+ case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1:
+ case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4:
+ case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9:
+ case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+ case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0:
+ case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3:
+ case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6:
+ case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9:
+ case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC:
+ case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF:
EmitByte(BaseOpcode, CurByte, OS);
unsigned char MRM;
switch (TSFlags & X86II::FormMask) {
default: llvm_unreachable("Invalid Form");
+ case X86II::MRM_C0: MRM = 0xC0; break;
case X86II::MRM_C1: MRM = 0xC1; break;
case X86II::MRM_C2: MRM = 0xC2; break;
case X86II::MRM_C3: MRM = 0xC3; break;
@@ -1397,10 +1487,35 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM_DD: MRM = 0xDD; break;
case X86II::MRM_DE: MRM = 0xDE; break;
case X86II::MRM_DF: MRM = 0xDF; break;
+ case X86II::MRM_E0: MRM = 0xE0; break;
+ case X86II::MRM_E1: MRM = 0xE1; break;
+ case X86II::MRM_E2: MRM = 0xE2; break;
+ case X86II::MRM_E3: MRM = 0xE3; break;
+ case X86II::MRM_E4: MRM = 0xE4; break;
+ case X86II::MRM_E5: MRM = 0xE5; break;
case X86II::MRM_E8: MRM = 0xE8; break;
+ case X86II::MRM_E9: MRM = 0xE9; break;
+ case X86II::MRM_EA: MRM = 0xEA; break;
+ case X86II::MRM_EB: MRM = 0xEB; break;
+ case X86II::MRM_EC: MRM = 0xEC; break;
+ case X86II::MRM_ED: MRM = 0xED; break;
+ case X86II::MRM_EE: MRM = 0xEE; break;
case X86II::MRM_F0: MRM = 0xF0; break;
+ case X86II::MRM_F1: MRM = 0xF1; break;
+ case X86II::MRM_F2: MRM = 0xF2; break;
+ case X86II::MRM_F3: MRM = 0xF3; break;
+ case X86II::MRM_F4: MRM = 0xF4; break;
+ case X86II::MRM_F5: MRM = 0xF5; break;
+ case X86II::MRM_F6: MRM = 0xF6; break;
+ case X86II::MRM_F7: MRM = 0xF7; break;
case X86II::MRM_F8: MRM = 0xF8; break;
case X86II::MRM_F9: MRM = 0xF9; break;
+ case X86II::MRM_FA: MRM = 0xFA; break;
+ case X86II::MRM_FB: MRM = 0xFB; break;
+ case X86II::MRM_FC: MRM = 0xFC; break;
+ case X86II::MRM_FD: MRM = 0xFD; break;
+ case X86II::MRM_FE: MRM = 0xFE; break;
+ case X86II::MRM_FF: MRM = 0xFF; break;
}
EmitByte(MRM, CurByte, OS);
break;
@@ -1432,17 +1547,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
EmitImmediate(MCOperand::CreateImm(RegNum), MI.getLoc(), 1, FK_Data_1,
CurByte, OS, Fixups);
} else {
- unsigned FixupKind;
- // FIXME: Is there a better way to know that we need a signed relocation?
- if (MI.getOpcode() == X86::ADD64ri32 ||
- MI.getOpcode() == X86::MOV64ri32 ||
- MI.getOpcode() == X86::MOV64mi32 ||
- MI.getOpcode() == X86::PUSH64i32)
- FixupKind = X86::reloc_signed_4byte;
- else
- FixupKind = getImmFixupKind(TSFlags);
EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
- X86II::getSizeOfImm(TSFlags), MCFixupKind(FixupKind),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
CurByte, OS, Fixups);
}
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 1cbdafd..09fdb9c 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -47,9 +47,12 @@ std::string X86_MC::ParseX86Triple(StringRef TT) {
Triple TheTriple(TT);
std::string FS;
if (TheTriple.getArch() == Triple::x86_64)
- FS = "+64bit-mode";
+ FS = "+64bit-mode,-32bit-mode,-16bit-mode";
+ else if (TheTriple.getEnvironment() != Triple::CODE16)
+ FS = "-64bit-mode,+32bit-mode,-16bit-mode";
else
- FS = "-64bit-mode";
+ FS = "-64bit-mode,-32bit-mode,+16bit-mode";
+
return FS;
}
@@ -202,8 +205,7 @@ unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) {
if (TheTriple.isOSDarwin())
return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic;
- if (TheTriple.getOS() == Triple::MinGW32 ||
- TheTriple.getOS() == Triple::Cygwin)
+ if (TheTriple.isOSCygMing())
// Unsupported by now, just quick fallback
return DWARFFlavour::X86_32_Generic;
return DWARFFlavour::X86_32_Generic;
@@ -268,17 +270,17 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
bool is64Bit = TheTriple.getArch() == Triple::x86_64;
MCAsmInfo *MAI;
- if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) {
+ if (TheTriple.isOSBinFormatMachO()) {
if (is64Bit)
MAI = new X86_64MCAsmInfoDarwin(TheTriple);
else
MAI = new X86MCAsmInfoDarwin(TheTriple);
- } else if (TheTriple.getEnvironment() == Triple::ELF) {
+ } else if (TheTriple.isOSBinFormatELF()) {
// Force the use of an ELF container.
MAI = new X86ELFMCAsmInfo(TheTriple);
- } else if (TheTriple.getOS() == Triple::Win32) {
+ } else if (TheTriple.isWindowsMSVCEnvironment()) {
MAI = new X86MCAsmInfoMicrosoft(TheTriple);
- } else if (TheTriple.getOS() == Triple::MinGW32 || TheTriple.getOS() == Triple::Cygwin) {
+ } else if (TheTriple.isOSCygMing()) {
MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
} else {
// The default is ELF.
@@ -358,17 +360,18 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
MCContext &Ctx, MCAsmBackend &MAB,
raw_ostream &_OS,
MCCodeEmitter *_Emitter,
+ const MCSubtargetInfo &STI,
bool RelaxAll,
bool NoExecStack) {
Triple TheTriple(TT);
- if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
+ if (TheTriple.isOSBinFormatMachO())
return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll);
- if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+ if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll);
- return createELFStreamer(Ctx, 0, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+ return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
}
static MCInstPrinter *createX86MCInstPrinter(const Target &T,
@@ -387,7 +390,7 @@ static MCInstPrinter *createX86MCInstPrinter(const Target &T,
static MCRelocationInfo *createX86MCRelocationInfo(StringRef TT,
MCContext &Ctx) {
Triple TheTriple(TT);
- if (TheTriple.isEnvironmentMachO() && TheTriple.getArch() == Triple::x86_64)
+ if (TheTriple.isOSBinFormatMachO() && TheTriple.getArch() == Triple::x86_64)
return createX86_64MachORelocationInfo(Ctx);
else if (TheTriple.isOSBinFormatELF())
return createX86_64ELFRelocationInfo(Ctx);
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
index 209b1d0..f2023e3 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
@@ -11,8 +11,8 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/Object/MachO.h"
using namespace llvm;
@@ -24,7 +24,7 @@ class X86_64MachORelocationInfo : public MCRelocationInfo {
public:
X86_64MachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
- const MCExpr *createExprForRelocation(RelocationRef Rel) {
+ const MCExpr *createExprForRelocation(RelocationRef Rel) override {
const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObjectFile());
uint64_t RelType; Rel.getType(RelType);
@@ -72,9 +72,9 @@ public:
break;
case X86_64_RELOC_SUBTRACTOR:
{
- RelocationRef RelNext;
- Obj->getRelocationNext(Rel.getRawDataRefImpl(), RelNext);
- any_relocation_info RENext = Obj->getRelocation(RelNext.getRawDataRefImpl());
+ Rel.moveNext();
+ any_relocation_info RENext =
+ Obj->getRelocation(Rel.getRawDataRefImpl());
// X86_64_SUBTRACTOR must be followed by a relocation of type
// X86_64_RELOC_UNSIGNED.
@@ -86,7 +86,7 @@ public:
const MCExpr *LHS = MCSymbolRefExpr::Create(Sym, Ctx);
- symbol_iterator RSymI = RelNext.getSymbol();
+ symbol_iterator RSymI = Rel.getSymbol();
uint64_t RSymAddr;
RSymI->getAddress(RSymAddr);
StringRef RSymName;
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index eb7c0b1..1a35ced 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -63,7 +63,7 @@ public:
void RecordRelocation(MachObjectWriter *Writer,
const MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
- MCValue Target, uint64_t &FixedValue) {
+ MCValue Target, uint64_t &FixedValue) override {
if (Writer->is64Bit())
RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
FixedValue);
@@ -230,7 +230,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
if (Symbol->isInSection()) {
const MCSectionMachO &Section = static_cast<const MCSectionMachO&>(
Fragment->getParent()->getSection());
- if (Section.hasAttribute(MCSectionMachO::S_ATTR_DEBUG))
+ if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
Base = 0;
}
@@ -362,6 +362,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
MCValue Target,
unsigned Log2Size,
uint64_t &FixedValue) {
+ uint64_t OriginalFixedValue = FixedValue;
uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
unsigned Type = MachO::GENERIC_RELOC_VANILLA;
@@ -431,8 +432,10 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
// symbol, things can go badly.
//
// Required for 'as' compatibility.
- if (FixupOffset > 0xffffff)
+ if (FixupOffset > 0xffffff) {
+ FixedValue = OriginalFixedValue;
return false;
+ }
}
MachO::any_relocation_info MRE;
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 6da4142..ffc9e8d 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -29,9 +29,8 @@ namespace {
X86WinCOFFObjectWriter(bool Is64Bit_);
virtual ~X86WinCOFFObjectWriter();
- virtual unsigned getRelocType(const MCValue &Target,
- const MCFixup &Fixup,
- bool IsCrossSection) const LLVM_OVERRIDE;
+ unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsCrossSection) const override;
};
}
@@ -65,6 +64,9 @@ unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
if (Is64Bit)
return COFF::IMAGE_REL_AMD64_ADDR64;
llvm_unreachable("unsupported relocation type");
+ case FK_SecRel_2:
+ return Is64Bit ? COFF::IMAGE_REL_AMD64_SECTION
+ : COFF::IMAGE_REL_I386_SECTION;
case FK_SecRel_4:
return Is64Bit ? COFF::IMAGE_REL_AMD64_SECREL : COFF::IMAGE_REL_I386_SECREL;
default:
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index adfa7fa..71329b0 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -494,11 +494,6 @@ is memory.
//===---------------------------------------------------------------------===//
-SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
-sitting between the truncate and the extract.
-
-//===---------------------------------------------------------------------===//
-
INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 simultaneously. Currently we only use it for simple
insertions.
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index b4285a0..52d3c01 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -123,7 +123,7 @@ flags.
The instruction selector sometimes misses folding a load into a compare. The
pattern is written as (cmp reg, (load p)). Because the compare isn't
commutative, it is not matched with the load on both sides. The dag combiner
-should be made smart enough to cannonicalize the load into the RHS of a compare
+should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
//===---------------------------------------------------------------------===//
@@ -1444,54 +1444,6 @@ it would be nice to produce "into" someday.
//===---------------------------------------------------------------------===//
-This code:
-
-void vec_mpys1(int y[], const int x[], int scaler) {
-int i;
-for (i = 0; i < 150; i++)
- y[i] += (((long long)scaler * (long long)x[i]) >> 31);
-}
-
-Compiles to this loop with GCC 3.x:
-
-.L5:
- movl %ebx, %eax
- imull (%edi,%ecx,4)
- shrdl $31, %edx, %eax
- addl %eax, (%esi,%ecx,4)
- incl %ecx
- cmpl $149, %ecx
- jle .L5
-
-llvm-gcc compiles it to the much uglier:
-
-LBB1_1: ## bb1
- movl 24(%esp), %eax
- movl (%eax,%edi,4), %ebx
- movl %ebx, %ebp
- imull %esi, %ebp
- movl %ebx, %eax
- mull %ecx
- addl %ebp, %edx
- sarl $31, %ebx
- imull %ecx, %ebx
- addl %edx, %ebx
- shldl $1, %eax, %ebx
- movl 20(%esp), %eax
- addl %ebx, (%eax,%edi,4)
- incl %edi
- cmpl $150, %edi
- jne LBB1_1 ## bb1
-
-The issue is that we hoist the cast of "scaler" to long long outside of the
-loop, the value comes into the loop as two values, and
-RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
-constructed BUILD_PAIR which represents the cast value.
-
-This can be handled by making CodeGenPrepare sink the cast.
-
-//===---------------------------------------------------------------------===//
-
Test instructions can be eliminated by using EFLAGS values from arithmetic
instructions. This is currently not done for mul, and, or, xor, neg, shl,
sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
diff --git a/lib/Target/X86/TargetInfo/Android.mk b/lib/Target/X86/TargetInfo/Android.mk
index 1c53475..1a6c902 100644
--- a/lib/Target/X86/TargetInfo/Android.mk
+++ b/lib/Target/X86/TargetInfo/Android.mk
@@ -32,6 +32,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
# For the device
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
include $(CLEAR_TBLGEN_VARS)
@@ -51,3 +52,4 @@ LOCAL_MODULE_TAGS := optional
include $(LLVM_DEVICE_BUILD_MK)
include $(LLVM_TBLGEN_RULES_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Target/X86/TargetInfo/CMakeLists.txt b/lib/Target/X86/TargetInfo/CMakeLists.txt
index b1d0b9f..1d8a8c1 100644
--- a/lib/Target/X86/TargetInfo/CMakeLists.txt
+++ b/lib/Target/X86/TargetInfo/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMX86Info
X86TargetInfo.cpp
)
-
-add_dependencies(LLVMX86Info X86CommonTableGen)
diff --git a/lib/Target/X86/TargetInfo/LLVMBuild.txt b/lib/Target/X86/TargetInfo/LLVMBuild.txt
index 3c64a22..6a52ea6 100644
--- a/lib/Target/X86/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/X86/TargetInfo/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = X86Info
parent = X86
-required_libraries = MC Support Target
+required_libraries = Support
add_to_library_groups = X86
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
index 815d235..1ea8798 100644
--- a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
+++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -7,8 +7,7 @@
//
//===----------------------------------------------------------------------===//
-#include "X86.h"
-#include "llvm/IR/Module.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/X86/Utils/Android.mk b/lib/Target/X86/Utils/Android.mk
index d9dd670..e0c4797 100644
--- a/lib/Target/X86/Utils/Android.mk
+++ b/lib/Target/X86/Utils/Android.mk
@@ -5,6 +5,7 @@ x86_utils_SRC_FILES := \
# For the device
# =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
LOCAL_SRC_FILES := $(x86_utils_SRC_FILES)
@@ -17,6 +18,7 @@ LOCAL_MODULE_TAGS := optional
include $(LLVM_DEVICE_BUILD_MK)
include $(BUILD_STATIC_LIBRARY)
+endif
# For the host
# =====================================================
diff --git a/lib/Target/X86/Utils/CMakeLists.txt b/lib/Target/X86/Utils/CMakeLists.txt
index 2e72c34..b269746 100644
--- a/lib/Target/X86/Utils/CMakeLists.txt
+++ b/lib/Target/X86/Utils/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMX86Utils
X86ShuffleDecode.cpp
)
-
-add_dependencies(LLVMX86Utils X86CommonTableGen)
diff --git a/lib/Target/X86/Utils/LLVMBuild.txt b/lib/Target/X86/Utils/LLVMBuild.txt
index de0a30f..fdb886f 100644
--- a/lib/Target/X86/Utils/LLVMBuild.txt
+++ b/lib/Target/X86/Utils/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = X86Utils
parent = X86
-required_libraries = Core Support
+required_libraries = Support
add_to_library_groups = X86
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index bbd4904..5f2441c 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "X86ShuffleDecode.h"
+#include "llvm/CodeGen/MachineValueType.h"
//===----------------------------------------------------------------------===//
// Vector Mask Decoding
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 017ab32..9e75b6b 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -16,13 +16,14 @@
#define X86_SHUFFLE_DECODE_H
#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/ValueTypes.h"
//===----------------------------------------------------------------------===//
// Vector Mask Decoding
//===----------------------------------------------------------------------===//
namespace llvm {
+class MVT;
+
enum {
SM_SentinelZero = -1
};
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 947002f..18e6845 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -15,14 +15,12 @@
#ifndef TARGET_X86_H
#define TARGET_X86_H
-#include "MCTargetDesc/X86BaseInfo.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CodeGen.h"
namespace llvm {
class FunctionPass;
+class ImmutablePass;
class JITCodeEmitter;
class X86TargetMachine;
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 65c5552..78edcf0 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -22,6 +22,10 @@ include "llvm/Target/Target.td"
def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
"64-bit mode (x86_64)">;
+def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true",
+ "32-bit mode (80386)">;
+def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
+ "16-bit mode (i8086)">;
//===----------------------------------------------------------------------===//
// X86 Subtarget features
@@ -73,6 +77,8 @@ def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
[Feature64Bit]>;
def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
"Bit testing of memory is slow">;
+def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
+ "SHLD instruction is slow">;
def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
"IsUAMemFast", "true",
"Fast unaligned memory access">;
@@ -221,7 +227,7 @@ def : ProcessorModel<"slm", SLMModel, [ProcIntelSLM,
FeaturePCLMUL, FeatureAES,
FeatureCallRegIndirect,
FeaturePRFCHW,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureFastUAMem]>;
// "Arrandale" along with corei3 and corei5
def : ProcessorModel<"corei7", SandyBridgeModel,
[FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
@@ -268,46 +274,53 @@ def : ProcessorModel<"knl", HaswellModel,
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
def : Proc<"k6-3", [Feature3DNow]>;
-def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"amdfam10", [FeatureSSE4A,
Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
- FeaturePOPCNT, FeatureSlowBTMem]>;
+ FeaturePOPCNT, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
// Bobcat
def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
- FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT]>;
+ FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT,
+ FeatureSlowSHLD]>;
// Jaguar
def : Proc<"btver2", [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
FeatureBMI, FeatureF16C, FeatureMOVBE,
- FeatureLZCNT, FeaturePOPCNT]>;
+ FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
// Bulldozer
def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
- FeatureLZCNT, FeaturePOPCNT]>;
+ FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
// Piledriver
def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
FeatureF16C, FeatureLZCNT,
- FeaturePOPCNT, FeatureBMI, FeatureTBM,
- FeatureFMA]>;
+ FeaturePOPCNT, FeatureBMI, FeatureTBM,
+ FeatureFMA, FeatureSlowSHLD]>;
// Steamroller
def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
@@ -387,12 +400,10 @@ def IntelAsmParserVariant : AsmParserVariant {
def ATTAsmWriter : AsmWriter {
string AsmWriterClassName = "ATTInstPrinter";
int Variant = 0;
- bit isMCAsmWriter = 1;
}
def IntelAsmWriter : AsmWriter {
string AsmWriterClassName = "IntelInstPrinter";
int Variant = 1;
- bit isMCAsmWriter = 1;
}
def X86 : Target {
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 1258441..fb66acc 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -14,18 +14,17 @@
#include "X86AsmPrinter.h"
#include "InstPrinter/X86ATTInstPrinter.h"
-#include "X86.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86COFFMachineModuleInfo.h"
+#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
-#include "X86TargetMachine.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/Assembly/Writer.h"
-#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/DebugInfo.h"
-#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -38,8 +37,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetOptions.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -51,7 +48,7 @@ using namespace llvm;
bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SetupMachineFunction(MF);
- if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) {
+ if (Subtarget->isTargetCOFF()) {
bool Intrn = MF.getFunction()->hasInternalLinkage();
OutStreamer.BeginCOFFSymbolDef(CurrentFnSym);
OutStreamer.EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC
@@ -74,56 +71,55 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
/// printSymbolOperand - Print a raw symbol reference operand. This handles
/// jump tables, constant pools, global address and external symbols, all of
/// which print to a label with various suffixes for relocation types etc.
-void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
- raw_ostream &O) {
+static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
+ raw_ostream &O) {
switch (MO.getType()) {
default: llvm_unreachable("unknown symbol type!");
- case MachineOperand::MO_JumpTableIndex:
- O << *GetJTISymbol(MO.getIndex());
- break;
case MachineOperand::MO_ConstantPoolIndex:
- O << *GetCPISymbol(MO.getIndex());
- printOffset(MO.getOffset(), O);
+ O << *P.GetCPISymbol(MO.getIndex());
+ P.printOffset(MO.getOffset(), O);
break;
case MachineOperand::MO_GlobalAddress: {
const GlobalValue *GV = MO.getGlobal();
MCSymbol *GVSym;
if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB)
- GVSym = GetSymbolWithGlobalValueBase(GV, "$stub");
+ GVSym = P.getSymbolWithGlobalValueBase(GV, "$stub");
else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
- GVSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
else
- GVSym = getSymbol(GV);
+ GVSym = P.getSymbol(GV);
// Handle dllimport linkage.
if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
- GVSym = OutContext.GetOrCreateSymbol(Twine("__imp_") + GVSym->getName());
+ GVSym =
+ P.OutContext.GetOrCreateSymbol(Twine("__imp_") + GVSym->getName());
if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
- MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
MachineModuleInfoImpl::StubValueTy &StubSym =
- MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+ P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
if (StubSym.getPointer() == 0)
StubSym = MachineModuleInfoImpl::
- StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
+ StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
} else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
- MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
MachineModuleInfoImpl::StubValueTy &StubSym =
- MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym);
+ P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(
+ Sym);
if (StubSym.getPointer() == 0)
StubSym = MachineModuleInfoImpl::
- StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
+ StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
} else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
- MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub");
+ MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$stub");
MachineModuleInfoImpl::StubValueTy &StubSym =
- MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+ P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
if (StubSym.getPointer() == 0)
StubSym = MachineModuleInfoImpl::
- StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
+ StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
}
// If the name begins with a dollar-sign, enclose it in parens. We do this
@@ -132,36 +128,7 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
O << *GVSym;
else
O << '(' << *GVSym << ')';
- printOffset(MO.getOffset(), O);
- break;
- }
- case MachineOperand::MO_ExternalSymbol: {
- const MCSymbol *SymToPrint;
- if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
- SmallString<128> TempNameStr;
- TempNameStr += StringRef(MO.getSymbolName());
- TempNameStr += StringRef("$stub");
-
- MCSymbol *Sym = GetExternalSymbolSymbol(TempNameStr.str());
- MachineModuleInfoImpl::StubValueTy &StubSym =
- MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
- if (StubSym.getPointer() == 0) {
- TempNameStr.erase(TempNameStr.end()-5, TempNameStr.end());
- StubSym = MachineModuleInfoImpl::
- StubValueTy(OutContext.GetOrCreateSymbol(TempNameStr.str()),
- true);
- }
- SymToPrint = StubSym.getPointer();
- } else {
- SymToPrint = GetExternalSymbolSymbol(MO.getSymbolName());
- }
-
- // If the name begins with a dollar-sign, enclose it in parens. We do this
- // to avoid having it look like an integer immediate to the assembler.
- if (SymToPrint->getName()[0] != '$')
- O << *SymToPrint;
- else
- O << '(' << *SymToPrint << '(';
+ P.printOffset(MO.getOffset(), O);
break;
}
}
@@ -177,12 +144,12 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
// These affect the name of the symbol, not any suffix.
break;
case X86II::MO_GOT_ABSOLUTE_ADDRESS:
- O << " + [.-" << *MF->getPICBaseSymbol() << ']';
+ O << " + [.-" << *P.MF->getPICBaseSymbol() << ']';
break;
case X86II::MO_PIC_BASE_OFFSET:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
- O << '-' << *MF->getPICBaseSymbol();
+ O << '-' << *P.MF->getPICBaseSymbol();
break;
case X86II::MO_TLSGD: O << "@TLSGD"; break;
case X86II::MO_TLSLD: O << "@TLSLD"; break;
@@ -199,41 +166,40 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
case X86II::MO_PLT: O << "@PLT"; break;
case X86II::MO_TLVP: O << "@TLVP"; break;
case X86II::MO_TLVP_PIC_BASE:
- O << "@TLVP" << '-' << *MF->getPICBaseSymbol();
+ O << "@TLVP" << '-' << *P.MF->getPICBaseSymbol();
break;
case X86II::MO_SECREL: O << "@SECREL32"; break;
}
}
+static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned OpNo, raw_ostream &O,
+ const char *Modifier = 0, unsigned AsmVariant = 0);
+
/// printPCRelImm - This is used to print an immediate value that ends up
/// being encoded as a pc-relative value. These print slightly differently, for
/// example, a $ is not emitted.
-void X86AsmPrinter::printPCRelImm(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O) {
+static void printPCRelImm(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned OpNo, raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(OpNo);
switch (MO.getType()) {
default: llvm_unreachable("Unknown pcrel immediate operand");
case MachineOperand::MO_Register:
// pc-relativeness was handled when computing the value in the reg.
- printOperand(MI, OpNo, O);
+ printOperand(P, MI, OpNo, O);
return;
case MachineOperand::MO_Immediate:
O << MO.getImm();
return;
- case MachineOperand::MO_MachineBasicBlock:
- O << *MO.getMBB()->getSymbol();
- return;
case MachineOperand::MO_GlobalAddress:
- case MachineOperand::MO_ExternalSymbol:
- printSymbolOperand(MO, O);
+ printSymbolOperand(P, MO, O);
return;
}
}
-
-void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O, const char *Modifier,
- unsigned AsmVariant) {
+static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned OpNo, raw_ostream &O, const char *Modifier,
+ unsigned AsmVariant) {
const MachineOperand &MO = MI->getOperand(OpNo);
switch (MO.getType()) {
default: llvm_unreachable("unknown operand type!");
@@ -256,22 +222,20 @@ void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
O << MO.getImm();
return;
- case MachineOperand::MO_JumpTableIndex:
- case MachineOperand::MO_ConstantPoolIndex:
- case MachineOperand::MO_GlobalAddress:
- case MachineOperand::MO_ExternalSymbol: {
+ case MachineOperand::MO_GlobalAddress: {
if (AsmVariant == 0) O << '$';
- printSymbolOperand(MO, O);
+ printSymbolOperand(P, MO, O);
break;
}
}
}
-void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
- raw_ostream &O, const char *Modifier) {
- const MachineOperand &BaseReg = MI->getOperand(Op);
- const MachineOperand &IndexReg = MI->getOperand(Op+2);
- const MachineOperand &DispSpec = MI->getOperand(Op+3);
+static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned Op, raw_ostream &O,
+ const char *Modifier = NULL) {
+ const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
+ const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+ const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
// If we really don't want to print out (rip), don't.
bool HasBaseReg = BaseReg.getReg() != 0;
@@ -282,14 +246,18 @@ void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
// HasParenPart - True if we will print out the () part of the mem ref.
bool HasParenPart = IndexReg.getReg() || HasBaseReg;
- if (DispSpec.isImm()) {
+ switch (DispSpec.getType()) {
+ default:
+ llvm_unreachable("unknown operand type!");
+ case MachineOperand::MO_Immediate: {
int DispVal = DispSpec.getImm();
if (DispVal || !HasParenPart)
O << DispVal;
- } else {
- assert(DispSpec.isGlobal() || DispSpec.isCPI() ||
- DispSpec.isJTI() || DispSpec.isSymbol());
- printSymbolOperand(MI->getOperand(Op+3), O);
+ break;
+ }
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ConstantPoolIndex:
+ printSymbolOperand(P, DispSpec, O);
}
if (Modifier && strcmp(Modifier, "H") == 0)
@@ -301,12 +269,12 @@ void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
O << '(';
if (HasBaseReg)
- printOperand(MI, Op, O, Modifier);
+ printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier);
if (IndexReg.getReg()) {
O << ',';
- printOperand(MI, Op+2, O, Modifier);
- unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier);
+ unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
if (ScaleVal != 1)
O << ',' << ScaleVal;
}
@@ -314,29 +282,31 @@ void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
}
}
-void X86AsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
- raw_ostream &O, const char *Modifier) {
+static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned Op, raw_ostream &O,
+ const char *Modifier = NULL) {
assert(isMem(MI, Op) && "Invalid memory reference!");
- const MachineOperand &Segment = MI->getOperand(Op+4);
+ const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg);
if (Segment.getReg()) {
- printOperand(MI, Op+4, O, Modifier);
+ printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier);
O << ':';
}
- printLeaMemReference(MI, Op, O, Modifier);
+ printLeaMemReference(P, MI, Op, O, Modifier);
}
-void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op,
- raw_ostream &O, const char *Modifier,
- unsigned AsmVariant){
- const MachineOperand &BaseReg = MI->getOperand(Op);
- unsigned ScaleVal = MI->getOperand(Op+1).getImm();
- const MachineOperand &IndexReg = MI->getOperand(Op+2);
- const MachineOperand &DispSpec = MI->getOperand(Op+3);
- const MachineOperand &SegReg = MI->getOperand(Op+4);
+static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+ unsigned Op, raw_ostream &O,
+ const char *Modifier = NULL,
+ unsigned AsmVariant = 1) {
+ const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
+ unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+ const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+ const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+ const MachineOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg);
// If this has a segment register, print it.
if (SegReg.getReg()) {
- printOperand(MI, Op+4, O, Modifier, AsmVariant);
+ printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier, AsmVariant);
O << ':';
}
@@ -344,7 +314,7 @@ void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op,
bool NeedPlus = false;
if (BaseReg.getReg()) {
- printOperand(MI, Op, O, Modifier, AsmVariant);
+ printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier, AsmVariant);
NeedPlus = true;
}
@@ -352,13 +322,13 @@ void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op,
if (NeedPlus) O << " + ";
if (ScaleVal != 1)
O << ScaleVal << '*';
- printOperand(MI, Op+2, O, Modifier, AsmVariant);
+ printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier, AsmVariant);
NeedPlus = true;
}
if (!DispSpec.isImm()) {
if (NeedPlus) O << " + ";
- printOperand(MI, Op+3, O, Modifier, AsmVariant);
+ printOperand(P, MI, Op+X86::AddrDisp, O, Modifier, AsmVariant);
} else {
int64_t DispVal = DispSpec.getImm();
if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
@@ -376,8 +346,8 @@ void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op,
O << ']';
}
-bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
- raw_ostream &O) {
+static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
+ char Mode, raw_ostream &O) {
unsigned Reg = MO.getReg();
switch (Mode) {
default: return true; // Unknown mode.
@@ -393,9 +363,11 @@ bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
case 'k': // Print SImode register
Reg = getX86SubSuperRegister(Reg, MVT::i32);
break;
- case 'q': // Print DImode register
- // FIXME: gcc will actually print e instead of r for 32-bit.
- Reg = getX86SubSuperRegister(Reg, MVT::i64);
+ case 'q':
+ // Print 64-bit register names if 64-bit integer registers are available.
+ // Otherwise, print 32-bit register names.
+ MVT::SimpleValueType Ty = P.getSubtarget().is64Bit() ? MVT::i64 : MVT::i32;
+ Reg = getX86SubSuperRegister(Reg, Ty);
break;
}
@@ -419,37 +391,50 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
// See if this is a generic print operand
return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
case 'a': // This is an address. Currently only 'i' and 'r' are expected.
- if (MO.isImm()) {
+ switch (MO.getType()) {
+ default:
+ return true;
+ case MachineOperand::MO_Immediate:
O << MO.getImm();
return false;
- }
- if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol()) {
- printSymbolOperand(MO, O);
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ExternalSymbol:
+ llvm_unreachable("unexpected operand type!");
+ case MachineOperand::MO_GlobalAddress:
+ printSymbolOperand(*this, MO, O);
if (Subtarget->isPICStyleRIPRel())
O << "(%rip)";
return false;
- }
- if (MO.isReg()) {
+ case MachineOperand::MO_Register:
O << '(';
- printOperand(MI, OpNo, O);
+ printOperand(*this, MI, OpNo, O);
O << ')';
return false;
}
- return true;
case 'c': // Don't print "$" before a global var name or constant.
- if (MO.isImm())
+ switch (MO.getType()) {
+ default:
+ printOperand(*this, MI, OpNo, O);
+ break;
+ case MachineOperand::MO_Immediate:
O << MO.getImm();
- else if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol())
- printSymbolOperand(MO, O);
- else
- printOperand(MI, OpNo, O);
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ExternalSymbol:
+ llvm_unreachable("unexpected operand type!");
+ case MachineOperand::MO_GlobalAddress:
+ printSymbolOperand(*this, MO, O);
+ break;
+ }
return false;
case 'A': // Print '*' before a register (it must be a register)
if (MO.isReg()) {
O << '*';
- printOperand(MI, OpNo, O);
+ printOperand(*this, MI, OpNo, O);
return false;
}
return true;
@@ -460,12 +445,12 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 'k': // Print SImode register
case 'q': // Print DImode register
if (MO.isReg())
- return printAsmMRegister(MO, ExtraCode[0], O);
- printOperand(MI, OpNo, O);
+ return printAsmMRegister(*this, MO, ExtraCode[0], O);
+ printOperand(*this, MI, OpNo, O);
return false;
case 'P': // This is the operand of a call, treat specially.
- printPCRelImm(MI, OpNo, O);
+ printPCRelImm(*this, MI, OpNo, O);
return false;
case 'n': // Negate the immediate or print a '-' before the operand.
@@ -479,7 +464,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
}
}
- printOperand(MI, OpNo, O, /*Modifier*/ 0, AsmVariant);
+ printOperand(*this, MI, OpNo, O, /*Modifier*/ 0, AsmVariant);
return false;
}
@@ -488,7 +473,7 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
const char *ExtraCode,
raw_ostream &O) {
if (AsmVariant) {
- printIntelMemReference(MI, OpNo, O);
+ printIntelMemReference(*this, MI, OpNo, O);
return false;
}
@@ -505,19 +490,19 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
// These only apply to registers, ignore on mem.
break;
case 'H':
- printMemReference(MI, OpNo, O, "H");
+ printMemReference(*this, MI, OpNo, O, "H");
return false;
case 'P': // Don't print @PLT, but do print as memory.
- printMemReference(MI, OpNo, O, "no-rip");
+ printMemReference(*this, MI, OpNo, O, "no-rip");
return false;
}
}
- printMemReference(MI, OpNo, O);
+ printMemReference(*this, MI, OpNo, O);
return false;
}
void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
- if (Subtarget->isTargetEnvMacho())
+ if (Subtarget->isTargetMacho())
OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
if (Subtarget->isTargetCOFF()) {
@@ -544,7 +529,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
- if (Subtarget->isTargetEnvMacho()) {
+ if (Subtarget->isTargetMacho()) {
// All darwin targets use mach-o.
MachineModuleInfoMachO &MMIMacho =
MMI->getObjFileInfo<MachineModuleInfoMachO>();
@@ -556,9 +541,9 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
if (!Stubs.empty()) {
const MCSection *TheSection =
OutContext.getMachOSection("__IMPORT", "__jump_table",
- MCSectionMachO::S_SYMBOL_STUBS |
- MCSectionMachO::S_ATTR_SELF_MODIFYING_CODE |
- MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ MachO::S_SYMBOL_STUBS |
+ MachO::S_ATTR_SELF_MODIFYING_CODE |
+ MachO::S_ATTR_PURE_INSTRUCTIONS,
5, SectionKind::getMetadata());
OutStreamer.SwitchSection(TheSection);
@@ -582,7 +567,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
if (!Stubs.empty()) {
const MCSection *TheSection =
OutContext.getMachOSection("__IMPORT", "__pointers",
- MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS,
+ MachO::S_NON_LAZY_SYMBOL_POINTERS,
SectionKind::getMetadata());
OutStreamer.SwitchSection(TheSection);
@@ -638,14 +623,13 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
- if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing() &&
- MMI->usesVAFloatArgument()) {
+ if (Subtarget->isTargetKnownWindowsMSVC() && MMI->usesVAFloatArgument()) {
StringRef SymbolName = Subtarget->is64Bit() ? "_fltused" : "__fltused";
MCSymbol *S = MMI->getContext().GetOrCreateSymbol(SymbolName);
OutStreamer.EmitSymbolAttribute(S, MCSA_Global);
}
- if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) {
+ if (Subtarget->isTargetCOFF()) {
X86COFFMachineModuleInfo &COFFMMI =
MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
@@ -664,29 +648,44 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
// Necessary for dllexport support
std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals;
- const TargetLoweringObjectFileCOFF &TLOFCOFF =
- static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
-
for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
- if (I->hasDLLExportLinkage())
+ if (I->hasDLLExportStorageClass())
DLLExportedFns.push_back(getSymbol(I));
for (Module::const_global_iterator I = M.global_begin(),
E = M.global_end(); I != E; ++I)
- if (I->hasDLLExportLinkage())
+ if (I->hasDLLExportStorageClass())
DLLExportedGlobals.push_back(getSymbol(I));
+ for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E; ++I) {
+ const GlobalValue *GV = I;
+ if (!GV->hasDLLExportStorageClass())
+ continue;
+
+ while (const GlobalAlias *A = dyn_cast<GlobalAlias>(GV))
+ GV = A->getAliasedGlobal();
+
+ if (isa<Function>(GV))
+ DLLExportedFns.push_back(getSymbol(I));
+ else if (isa<GlobalVariable>(GV))
+ DLLExportedGlobals.push_back(getSymbol(I));
+ }
+
// Output linker support code for dllexported globals on windows.
if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) {
+ const TargetLoweringObjectFileCOFF &TLOFCOFF =
+ static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
+
OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection());
SmallString<128> name;
for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i) {
- if (Subtarget->isTargetWindows())
+ if (Subtarget->isTargetKnownWindowsMSVC())
name = " /EXPORT:";
else
name = " -export:";
name += DLLExportedGlobals[i]->getName();
- if (Subtarget->isTargetWindows())
+ if (Subtarget->isTargetKnownWindowsMSVC())
name += ",DATA";
else
name += ",data";
@@ -694,7 +693,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
}
for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) {
- if (Subtarget->isTargetWindows())
+ if (Subtarget->isTargetKnownWindowsMSVC())
name = " /EXPORT:";
else
name = " -export:";
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 24a768b..3308cc2 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -10,14 +10,10 @@
#ifndef X86ASMPRINTER_H
#define X86ASMPRINTER_H
-#include "X86.h"
-#include "X86MachineFunctionInfo.h"
-#include "X86TargetMachine.h"
+#include "X86Subtarget.h"
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGen/StackMaps.h"
-#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -27,59 +23,32 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
const X86Subtarget *Subtarget;
StackMaps SM;
- // Parses operands of PATCHPOINT and STACKMAP to produce stack map Location
- // structures. Returns a result location and an iterator to the operand
- // immediately following the operands consumed.
- //
- // This method is implemented in X86MCInstLower.cpp.
- static std::pair<StackMaps::Location, MachineInstr::const_mop_iterator>
- stackmapOperandParser(MachineInstr::const_mop_iterator MOI,
- MachineInstr::const_mop_iterator MOE,
- const TargetMachine &TM);
-
public:
explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
- : AsmPrinter(TM, Streamer), SM(*this, stackmapOperandParser) {
+ : AsmPrinter(TM, Streamer), SM(*this) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
}
- virtual const char *getPassName() const LLVM_OVERRIDE {
+ const char *getPassName() const override {
return "X86 Assembly / Object Emitter";
}
const X86Subtarget &getSubtarget() const { return *Subtarget; }
- virtual void EmitStartOfAsmFile(Module &M) LLVM_OVERRIDE;
-
- virtual void EmitEndOfAsmFile(Module &M) LLVM_OVERRIDE;
-
- virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE;
-
- void printSymbolOperand(const MachineOperand &MO, raw_ostream &O);
-
- // These methods are used by the tablegen'erated instruction printer.
- void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
- const char *Modifier = 0, unsigned AsmVariant = 0);
- void printPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+ void EmitStartOfAsmFile(Module &M) override;
- bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
- virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) LLVM_OVERRIDE;
- virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) LLVM_OVERRIDE;
+ void EmitEndOfAsmFile(Module &M) override;
- void printMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
- const char *Modifier=NULL);
- void printLeaMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
- const char *Modifier=NULL);
+ void EmitInstruction(const MachineInstr *MI) override;
- void printIntelMemReference(const MachineInstr *MI, unsigned Op,
- raw_ostream &O, const char *Modifier=NULL,
- unsigned AsmVariant = 1);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
- virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE;
+ bool runOnMachineFunction(MachineFunction &F) override;
};
} // end namespace llvm
diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h
index e76f9fd..040da35 100644
--- a/lib/Target/X86/X86CallingConv.h
+++ b/lib/Target/X86/X86CallingConv.h
@@ -29,6 +29,33 @@ inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
return false;
}
+inline bool CC_X86_CDeclMethod_SRet(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // Swap the order of the first two parameters if the first parameter is sret.
+ if (ArgFlags.isSRet()) {
+ assert(ValNo == 0);
+ assert(ValVT == MVT::i32);
+ State.AllocateStack(8, 4);
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, 4, LocVT, LocInfo));
+
+ // Indicate that we need to swap the order of the first and second
+ // parameters by "allocating" register zero. There are no register
+ // parameters with cdecl methods, so we can use this to communicate to the
+ // next call.
+ State.AllocateReg(1);
+ return true;
+ } else if (ValNo == 1 && State.isAllocated(1)) {
+ assert(ValVT == MVT::i32 && "non-i32-sized this param unsupported");
+ // Stack was already allocated while processing sret.
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, 0, LocVT, LocInfo));
+ return true;
+ }
+
+ // All other args use the C calling convention.
+ return false;
+}
+
} // End llvm namespace
#endif
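A minimal sketch of the layout produced by CC_X86_CDeclMethod_SRet above (the struct and method are hypothetical; the offsets follow from the code):

    struct Big { int a, b, c; };
    struct S { Big get(); };  // a cdecl method whose result is returned via sret
    // Argument-area layout chosen by the custom function:
    //   offset 0: this pointer  (ValNo 1, placed in the slot left by the swap)
    //   offset 4: sret pointer  (ValNo 0, getCustomMem(..., /*Offset=*/4, ...))
    // AllocateStack(8, 4) reserves both 4-byte slots up front.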
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index a78b5c0..1cfd827 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -357,9 +357,16 @@ def CC_X86_64_WebKit_JS : CallingConv<[
// Promote i8/i16 arguments to i32.
CCIfType<[i8, i16], CCPromoteToType<i32>>,
- // Integer/FP values are always stored in stack slots that are 8 bytes in size
- // and 8-byte aligned.
- CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+  // Only the first integer argument is passed in a register.
+ CCIfType<[i32], CCAssignToReg<[EAX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX]>>,
+
+  // The remaining integer arguments are passed on the stack. 32-bit integer
+  // and floating-point arguments are aligned to 4 bytes and stored in 4-byte
+  // slots. 64-bit integer and floating-point arguments are aligned to 8 bytes
+  // and stored in 8-byte stack slots.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64], CCAssignToStack<8, 8>>
]>;
// No explicit register is specified for the AnyReg calling convention. The
@@ -453,20 +460,45 @@ def CC_X86_32_FastCall : CallingConv<[
CCDelegateTo<CC_X86_32_Common>
]>;
-def CC_X86_32_ThisCall : CallingConv<[
+def CC_X86_32_ThisCall_Common : CallingConv<[
+ // The first integer argument is passed in ECX
+ CCIfType<[i32], CCAssignToReg<[ECX]>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_ThisCall_Mingw : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ CCDelegateTo<CC_X86_32_ThisCall_Common>
+]>;
+
+def CC_X86_32_ThisCall_Win : CallingConv<[
// Promote i8/i16 arguments to i32.
CCIfType<[i8, i16], CCPromoteToType<i32>>,
// Pass sret arguments indirectly through stack.
CCIfSRet<CCAssignToStack<4, 4>>,
- // The first integer argument is passed in ECX
- CCIfType<[i32], CCAssignToReg<[ECX]>>,
+ CCDelegateTo<CC_X86_32_ThisCall_Common>
+]>;
+
+def CC_X86_CDeclMethod : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ CCCustom<"CC_X86_CDeclMethod_SRet">,
- // Otherwise, same as everything else.
CCDelegateTo<CC_X86_32_Common>
]>;
+def CC_X86_32_ThisCall : CallingConv<[
+ CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>,
+ CCDelegateTo<CC_X86_32_ThisCall_Win>
+]>;
+
def CC_X86_32_FastCC : CallingConv<[
// Handles byval parameters. Note that we can't rely on the delegation
// to CC_X86_32_Common for this because that happens after code that
@@ -551,6 +583,7 @@ def CC_Intel_OCL_BI : CallingConv<[
def CC_X86_32 : CallingConv<[
CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
+ CCIfCC<"CallingConv::X86_CDeclMethod", CCDelegateTo<CC_X86_CDeclMethod>>,
CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
@@ -597,10 +630,26 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
(sequence "XMM%u", 6, 15))>;
-def CSR_MostRegs_64 : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
+// All GPRs - except r11
+def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI,
+ R8, R9, R10, RSP)>;
+
+// All registers - except r11
+def CSR_64_RT_AllRegs : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
+ (sequence "XMM%u", 0, 15))>;
+def CSR_64_RT_AllRegs_AVX : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
+ (sequence "YMM%u", 0, 15))>;
+
+def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
R11, R12, R13, R14, R15, RBP,
(sequence "XMM%u", 0, 15))>;
+def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX, RSP,
+ (sequence "XMM%u", 16, 31))>;
+def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, RSP,
+ (sequence "YMM%u", 0, 31)),
+ (sequence "XMM%u", 0, 15))>;
+
// Standard C + YMM6-15
def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
R13, R14, R15,
@@ -618,6 +667,6 @@ def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
(sequence "YMM%u", 8, 15))>;
-def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add CSR_64,
+def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15,
(sequence "ZMM%u", 16, 31),
K4, K5, K6, K7)>;
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index 14385ed..f6c4c2e 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -56,9 +56,9 @@ namespace {
MCE(mce), PICBaseOffset(0), Is64BitMode(false),
IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
- bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "X86 Machine Code Emitter";
}
@@ -76,7 +76,7 @@ namespace {
void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc);
- void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
AU.addRequired<MachineModuleInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -186,10 +186,6 @@ static unsigned determineREX(const MachineInstr &MI) {
}
switch (Desc.TSFlags & X86II::FormMask) {
- case X86II::MRMInitReg:
- if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
- REX |= (1 << 0) | (1 << 2);
- break;
case X86II::MRMSrcReg: {
if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
REX |= 1 << 2;
@@ -216,6 +212,7 @@ static unsigned determineREX(const MachineInstr &MI) {
}
break;
}
+ case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
@@ -658,67 +655,20 @@ void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags,
int MemOperand,
const MachineInstr &MI,
const MCInstrDesc *Desc) const {
- // Emit the lock opcode prefix as needed.
- if (Desc->TSFlags & X86II::LOCK)
- MCE.emitByte(0xF0);
-
- // Emit segment override opcode prefix as needed.
- emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
-
- // Emit the repeat opcode prefix as needed.
- if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP)
- MCE.emitByte(0xF3);
-
- // Emit the address size opcode prefix as needed.
- bool need_address_override;
- if (TSFlags & X86II::AdSize) {
- need_address_override = true;
- } else if (MemOperand == -1) {
- need_address_override = false;
- } else if (Is64BitMode) {
- assert(!Is16BitMemOperand(MI, MemOperand));
- need_address_override = Is32BitMemOperand(MI, MemOperand);
- } else {
- assert(!Is64BitMemOperand(MI, MemOperand));
- need_address_override = Is16BitMemOperand(MI, MemOperand);
- }
-
- if (need_address_override)
- MCE.emitByte(0x67);
-
// Emit the operand size opcode prefix as needed.
- if (TSFlags & X86II::OpSize)
+ if (((TSFlags & X86II::OpSizeMask) >> X86II::OpSizeShift) == X86II::OpSize16)
MCE.emitByte(0x66);
- bool Need0FPrefix = false;
- switch (Desc->TSFlags & X86II::Op0Mask) {
- case X86II::TB: // Two-byte opcode prefix
- case X86II::T8: // 0F 38
- case X86II::TA: // 0F 3A
- case X86II::A6: // 0F A6
- case X86II::A7: // 0F A7
- Need0FPrefix = true;
- break;
- case X86II::REP: break; // already handled.
- case X86II::T8XS: // F3 0F 38
- case X86II::XS: // F3 0F
- MCE.emitByte(0xF3);
- Need0FPrefix = true;
- break;
- case X86II::T8XD: // F2 0F 38
- case X86II::TAXD: // F2 0F 3A
- case X86II::XD: // F2 0F
- MCE.emitByte(0xF2);
- Need0FPrefix = true;
- break;
- case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
- case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
- MCE.emitByte(0xD8+
- (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
- >> X86II::Op0Shift));
- break; // Two-byte opcode prefix
- default: llvm_unreachable("Invalid prefix!");
- case 0: break; // No prefix!
+ switch (Desc->TSFlags & X86II::OpPrefixMask) {
+ case X86II::PD: // 66
+ MCE.emitByte(0x66);
+ break;
+ case X86II::XS: // F3
+ MCE.emitByte(0xF3);
+ break;
+ case X86II::XD: // F2
+ MCE.emitByte(0xF2);
+ break;
}
// Handle REX prefix.
@@ -728,25 +678,21 @@ void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags,
}
// 0x0F escape code must be emitted just before the opcode.
- if (Need0FPrefix)
+ switch (Desc->TSFlags & X86II::OpMapMask) {
+ case X86II::TB: // Two-byte opcode map
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
MCE.emitByte(0x0F);
+ break;
+ }
- switch (Desc->TSFlags & X86II::Op0Mask) {
- case X86II::T8XD: // F2 0F 38
- case X86II::T8XS: // F3 0F 38
- case X86II::T8: // 0F 38
- MCE.emitByte(0x38);
- break;
- case X86II::TAXD: // F2 0F 38
- case X86II::TA: // 0F 3A
- MCE.emitByte(0x3A);
- break;
- case X86II::A6: // 0F A6
- MCE.emitByte(0xA6);
- break;
- case X86II::A7: // 0F A7
- MCE.emitByte(0xA7);
- break;
+ switch (Desc->TSFlags & X86II::OpMapMask) {
+ case X86II::T8: // 0F 38
+ MCE.emitByte(0x38);
+ break;
+ case X86II::TA: // 0F 3A
+ MCE.emitByte(0x3A);
+ break;
}
}
@@ -778,29 +724,19 @@ template<class CodeEmitter>
void Emitter<CodeEmitter>::emitSegmentOverridePrefix(uint64_t TSFlags,
int MemOperand,
const MachineInstr &MI) const {
- switch (TSFlags & X86II::SegOvrMask) {
- default: llvm_unreachable("Invalid segment!");
- case 0:
- // No segment override, check for explicit one on memory operand.
- if (MemOperand != -1) { // If the instruction has a memory operand.
- switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
- default: llvm_unreachable("Unknown segment register!");
- case 0: break;
- case X86::CS: MCE.emitByte(0x2E); break;
- case X86::SS: MCE.emitByte(0x36); break;
- case X86::DS: MCE.emitByte(0x3E); break;
- case X86::ES: MCE.emitByte(0x26); break;
- case X86::FS: MCE.emitByte(0x64); break;
- case X86::GS: MCE.emitByte(0x65); break;
- }
- }
- break;
- case X86II::FS:
- MCE.emitByte(0x64);
- break;
- case X86II::GS:
- MCE.emitByte(0x65);
- break;
+ if (MemOperand < 0)
+ return; // No memory operand
+
+ // Check for explicit segment override on memory operand.
+ switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
+ default: llvm_unreachable("Unknown segment register!");
+ case 0: break;
+ case X86::CS: MCE.emitByte(0x2E); break;
+ case X86::SS: MCE.emitByte(0x36); break;
+ case X86::DS: MCE.emitByte(0x3E); break;
+ case X86::ES: MCE.emitByte(0x26); break;
+ case X86::FS: MCE.emitByte(0x64); break;
+ case X86::GS: MCE.emitByte(0x65); break;
}
}
@@ -809,6 +745,8 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
int MemOperand,
const MachineInstr &MI,
const MCInstrDesc *Desc) const {
+ unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
+ X86II::EncodingShift;
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
@@ -839,9 +777,6 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
// opcode extension, or ignored, depending on the opcode byte)
unsigned char VEX_W = 0;
- // XOP: Use XOP prefix byte 0x8f instead of VEX.
- bool XOP = false;
-
// VEX_5M (VEX m-mmmmm field):
//
// 0b00000: Reserved for future use
@@ -852,7 +787,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
// 0b01000: XOP map select - 08h instructions with imm byte
// 0b01001: XOP map select - 09h instructions with no imm byte
// 0b01010: XOP map select - 0Ah instructions with imm dword
- unsigned char VEX_5M = 0x1;
+ unsigned char VEX_5M = 0;
// VEX_4V (VEX vvvv field): a register specifier
// (in 1's complement form) or 1111 if unused.
@@ -875,58 +810,28 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
//
unsigned char VEX_PP = 0;
- // Encode the operand size opcode prefix as needed.
- if (TSFlags & X86II::OpSize)
- VEX_PP = 0x01;
-
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
VEX_W = 1;
- if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
- XOP = true;
-
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
VEX_L = 1;
- switch (TSFlags & X86II::Op0Mask) {
- default: llvm_unreachable("Invalid prefix!");
- case X86II::T8: // 0F 38
- VEX_5M = 0x2;
- break;
- case X86II::TA: // 0F 3A
- VEX_5M = 0x3;
- break;
- case X86II::T8XS: // F3 0F 38
- VEX_PP = 0x2;
- VEX_5M = 0x2;
- break;
- case X86II::T8XD: // F2 0F 38
- VEX_PP = 0x3;
- VEX_5M = 0x2;
- break;
- case X86II::TAXD: // F2 0F 3A
- VEX_PP = 0x3;
- VEX_5M = 0x3;
- break;
- case X86II::XS: // F3 0F
- VEX_PP = 0x2;
- break;
- case X86II::XD: // F2 0F
- VEX_PP = 0x3;
- break;
- case X86II::XOP8:
- VEX_5M = 0x8;
- break;
- case X86II::XOP9:
- VEX_5M = 0x9;
- break;
- case X86II::XOPA:
- VEX_5M = 0xA;
- break;
- case X86II::TB: // VEX_5M/VEX_PP already correct
- break;
+ switch (TSFlags & X86II::OpPrefixMask) {
+ default: break; // VEX_PP already correct
+ case X86II::PD: VEX_PP = 0x1; break; // 66
+ case X86II::XS: VEX_PP = 0x2; break; // F3
+ case X86II::XD: VEX_PP = 0x3; break; // F2
}
+ switch (TSFlags & X86II::OpMapMask) {
+ default: llvm_unreachable("Invalid prefix!");
+ case X86II::TB: VEX_5M = 0x1; break; // 0F
+ case X86II::T8: VEX_5M = 0x2; break; // 0F 38
+ case X86II::TA: VEX_5M = 0x3; break; // 0F 3A
+ case X86II::XOP8: VEX_5M = 0x8; break;
+ case X86II::XOP9: VEX_5M = 0x9; break;
+ case X86II::XOPA: VEX_5M = 0xA; break;
+ }
// Classify VEX_B, VEX_4V, VEX_R, VEX_X
unsigned NumOps = Desc->getNumOperands();
@@ -941,17 +846,8 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
}
switch (TSFlags & X86II::FormMask) {
- case X86II::MRMInitReg:
- // Duplicate register.
- if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
- VEX_R = 0x0;
-
- if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
- VEX_B = 0x0;
- if (HasVEX_4VOp3)
- VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ default: llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!");
+ case X86II::RawFrm:
break;
case X86II::MRMDestMem: {
// MRMDestMem instructions forms:
@@ -1069,8 +965,6 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_B = 0x0;
break;
- default: // RawFrm
- break;
}
// Emit segment override opcode prefix as needed.
@@ -1087,16 +981,21 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
// | C5h | | R | vvvv | L | pp |
// +-----+ +-------------------+
//
+ // XOP uses a similar prefix:
+ // +-----+ +--------------+ +-------------------+
+ // | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp |
+ // +-----+ +--------------+ +-------------------+
unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
- if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
+ // Can this use the 2 byte VEX prefix?
+ if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
MCE.emitByte(0xC5);
MCE.emitByte(LastByte | (VEX_R << 7));
return;
}
// 3 byte VEX prefix
- MCE.emitByte(XOP ? 0x8F : 0xC4);
+ MCE.emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4);
MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M);
MCE.emitByte(LastByte | (VEX_W << 7));
}
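A worked example of the two-byte form shown above (field values chosen for illustration): with VEX_PP = 0x1, VEX_L = 0, VEX_4V = 0xF (unused) and VEX_R = 1, LastByte = 0x1 | (0 << 2) | (0xF << 3) = 0x79, so the emitted prefix bytes are C5 F9.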
@@ -1146,8 +1045,10 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
uint64_t TSFlags = Desc->TSFlags;
- // Is this instruction encoded using the AVX VEX prefix?
- bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX;
+ // Encoding type for this instruction.
+ unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
+ X86II::EncodingShift;
+
// It uses the VEX.VVVV field?
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
@@ -1158,7 +1059,35 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
if (MemoryOperand != -1) MemoryOperand += CurOp;
- if (!HasVEXPrefix)
+ // Emit the lock opcode prefix as needed.
+ if (Desc->TSFlags & X86II::LOCK)
+ MCE.emitByte(0xF0);
+
+ // Emit segment override opcode prefix as needed.
+ emitSegmentOverridePrefix(TSFlags, MemoryOperand, MI);
+
+ // Emit the repeat opcode prefix as needed.
+ if (Desc->TSFlags & X86II::REP)
+ MCE.emitByte(0xF3);
+
+ // Emit the address size opcode prefix as needed.
+ bool need_address_override;
+ if (TSFlags & X86II::AdSize) {
+ need_address_override = true;
+ } else if (MemoryOperand < 0) {
+ need_address_override = false;
+ } else if (Is64BitMode) {
+ assert(!Is16BitMemOperand(MI, MemoryOperand));
+ need_address_override = Is32BitMemOperand(MI, MemoryOperand);
+ } else {
+ assert(!Is64BitMemOperand(MI, MemoryOperand));
+ need_address_override = Is16BitMemOperand(MI, MemoryOperand);
+ }
+
+ if (need_address_override)
+ MCE.emitByte(0x67);
+
+ if (Encoding == 0)
emitOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
else
emitVEXOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
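For instance (prefix bytes only; opcode omitted): a LOCK-prefixed instruction whose memory operand uses the FS segment and a 32-bit address in 64-bit mode now gets F0, then 64, then 67, before emitOpcodePrefix (or emitVEXOpcodePrefix) adds the remaining operand-size, mandatory-prefix, and REX/VEX bytes.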
@@ -1186,7 +1115,8 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
if (MI.getOperand(0).getSymbolName()[0])
report_fatal_error("JIT does not support inline asm!");
break;
- case TargetOpcode::PROLOG_LABEL:
+ case TargetOpcode::CFI_INSTRUCTION:
+ break;
case TargetOpcode::GC_LABEL:
case TargetOpcode::EH_LABEL:
MCE.emitLabel(MI.getOperand(0).getMCSymbol());
@@ -1353,6 +1283,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
break;
}
+ case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
@@ -1360,8 +1291,9 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
MCE.emitByte(BaseOpcode);
+ uint64_t Form = (Desc->TSFlags & X86II::FormMask);
emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
- (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+ (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r);
if (CurOp == NumOps)
break;
@@ -1390,6 +1322,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
break;
}
+ case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
@@ -1401,7 +1334,8 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0;
MCE.emitByte(BaseOpcode);
- emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m,
+ uint64_t Form = (Desc->TSFlags & X86II::FormMask);
+ emitMemModRMByte(MI, CurOp, (Form==X86II::MRMXm) ? 0 : Form - X86II::MRM0m,
PCAdj);
CurOp += X86::AddrNumOperands;
@@ -1432,41 +1366,81 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
break;
}
- case X86II::MRMInitReg:
+ case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+ case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8:
+ case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+ case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4:
+ case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8:
+ case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB:
+ case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE:
+ case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1:
+ case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4:
+ case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9:
+ case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+ case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0:
+ case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3:
+ case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6:
+ case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9:
+ case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC:
+ case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF:
MCE.emitByte(BaseOpcode);
- // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
- emitRegModRMByte(MI.getOperand(CurOp).getReg(),
- getX86RegNum(MI.getOperand(CurOp).getReg()));
- ++CurOp;
- break;
- case X86II::MRM_C1:
- MCE.emitByte(BaseOpcode);
- MCE.emitByte(0xC1);
- break;
- case X86II::MRM_C8:
- MCE.emitByte(BaseOpcode);
- MCE.emitByte(0xC8);
- break;
- case X86II::MRM_C9:
- MCE.emitByte(BaseOpcode);
- MCE.emitByte(0xC9);
- break;
- case X86II::MRM_CA:
- MCE.emitByte(BaseOpcode);
- MCE.emitByte(0xCA);
- break;
- case X86II::MRM_CB:
- MCE.emitByte(BaseOpcode);
- MCE.emitByte(0xCB);
- break;
- case X86II::MRM_E8:
- MCE.emitByte(BaseOpcode);
- MCE.emitByte(0xE8);
- break;
- case X86II::MRM_F0:
- MCE.emitByte(BaseOpcode);
- MCE.emitByte(0xF0);
+ unsigned char MRM;
+ switch (TSFlags & X86II::FormMask) {
+ default: llvm_unreachable("Invalid Form");
+ case X86II::MRM_C0: MRM = 0xC0; break;
+ case X86II::MRM_C1: MRM = 0xC1; break;
+ case X86II::MRM_C2: MRM = 0xC2; break;
+ case X86II::MRM_C3: MRM = 0xC3; break;
+ case X86II::MRM_C4: MRM = 0xC4; break;
+ case X86II::MRM_C8: MRM = 0xC8; break;
+ case X86II::MRM_C9: MRM = 0xC9; break;
+ case X86II::MRM_CA: MRM = 0xCA; break;
+ case X86II::MRM_CB: MRM = 0xCB; break;
+ case X86II::MRM_D0: MRM = 0xD0; break;
+ case X86II::MRM_D1: MRM = 0xD1; break;
+ case X86II::MRM_D4: MRM = 0xD4; break;
+ case X86II::MRM_D5: MRM = 0xD5; break;
+ case X86II::MRM_D6: MRM = 0xD6; break;
+ case X86II::MRM_D8: MRM = 0xD8; break;
+ case X86II::MRM_D9: MRM = 0xD9; break;
+ case X86II::MRM_DA: MRM = 0xDA; break;
+ case X86II::MRM_DB: MRM = 0xDB; break;
+ case X86II::MRM_DC: MRM = 0xDC; break;
+ case X86II::MRM_DD: MRM = 0xDD; break;
+ case X86II::MRM_DE: MRM = 0xDE; break;
+ case X86II::MRM_DF: MRM = 0xDF; break;
+ case X86II::MRM_E0: MRM = 0xE0; break;
+ case X86II::MRM_E1: MRM = 0xE1; break;
+ case X86II::MRM_E2: MRM = 0xE2; break;
+ case X86II::MRM_E3: MRM = 0xE3; break;
+ case X86II::MRM_E4: MRM = 0xE4; break;
+ case X86II::MRM_E5: MRM = 0xE5; break;
+ case X86II::MRM_E8: MRM = 0xE8; break;
+ case X86II::MRM_E9: MRM = 0xE9; break;
+ case X86II::MRM_EA: MRM = 0xEA; break;
+ case X86II::MRM_EB: MRM = 0xEB; break;
+ case X86II::MRM_EC: MRM = 0xEC; break;
+ case X86II::MRM_ED: MRM = 0xED; break;
+ case X86II::MRM_EE: MRM = 0xEE; break;
+ case X86II::MRM_F0: MRM = 0xF0; break;
+ case X86II::MRM_F1: MRM = 0xF1; break;
+ case X86II::MRM_F2: MRM = 0xF2; break;
+ case X86II::MRM_F3: MRM = 0xF3; break;
+ case X86II::MRM_F4: MRM = 0xF4; break;
+ case X86II::MRM_F5: MRM = 0xF5; break;
+ case X86II::MRM_F6: MRM = 0xF6; break;
+ case X86II::MRM_F7: MRM = 0xF7; break;
+ case X86II::MRM_F8: MRM = 0xF8; break;
+ case X86II::MRM_F9: MRM = 0xF9; break;
+ case X86II::MRM_FA: MRM = 0xFA; break;
+ case X86II::MRM_FB: MRM = 0xFB; break;
+ case X86II::MRM_FC: MRM = 0xFC; break;
+ case X86II::MRM_FD: MRM = 0xFD; break;
+ case X86II::MRM_FE: MRM = 0xFE; break;
+ case X86II::MRM_FF: MRM = 0xFF; break;
+ }
+ MCE.emitByte(MRM);
break;
}
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 97f96ab..1aab1ea 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -15,8 +15,8 @@
#include "X86.h"
#include "X86CallingConv.h"
-#include "X86ISelLowering.h"
#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
@@ -26,22 +26,22 @@
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
-#include "llvm/Support/CallSite.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
namespace {
-class X86FastISel : public FastISel {
+class X86FastISel final : public FastISel {
/// Subtarget - Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
@@ -62,16 +62,16 @@ public:
X86ScalarSSEf32 = Subtarget->hasSSE1();
}
- virtual bool TargetSelectInstruction(const Instruction *I);
+ bool TargetSelectInstruction(const Instruction *I) override;
/// \brief The specified machine instr operand is a vreg, and that
/// vreg is being provided by the specified load instruction. If possible,
/// try to fold the load as an operand to the instruction, returning true if
/// possible.
- virtual bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
- const LoadInst *LI);
+ bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+ const LoadInst *LI) override;
- virtual bool FastLowerArguments();
+ bool FastLowerArguments() override;
#include "X86GenFastISel.inc"
@@ -128,11 +128,11 @@ private:
bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
- unsigned TargetMaterializeConstant(const Constant *C);
+ unsigned TargetMaterializeConstant(const Constant *C) override;
- unsigned TargetMaterializeAlloca(const AllocaInst *C);
+ unsigned TargetMaterializeAlloca(const AllocaInst *C) override;
- unsigned TargetMaterializeFloatZero(const ConstantFP *CF);
+ unsigned TargetMaterializeFloatZero(const ConstantFP *CF) override;
/// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
/// computed in an SSE register, not on the X87 floating point stack.
@@ -229,7 +229,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
ResultReg = createResultReg(RC);
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DL, TII.get(Opc), ResultReg), AM);
+ DbgLoc, TII.get(Opc), ResultReg), AM);
return true;
}
@@ -248,7 +248,7 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg,
case MVT::i1: {
// Mask out all but lowest bit.
unsigned AndResult = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1);
ValReg = AndResult;
}
@@ -289,7 +289,7 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg,
}
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DL, TII.get(Opc)), AM).addReg(ValReg);
+ DbgLoc, TII.get(Opc)), AM).addReg(ValReg);
return true;
}
@@ -297,7 +297,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
const X86AddressMode &AM, bool Aligned) {
// Handle 'null' like i32/i64 0.
if (isa<ConstantPointerNull>(Val))
- Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext()));
+ Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
// If this is a store of a simple constant, fold the constant into the store.
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
@@ -318,7 +318,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
if (Opc) {
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DL, TII.get(Opc)), AM)
+ DbgLoc, TII.get(Opc)), AM)
.addImm(Signed ? (uint64_t) CI->getSExtValue() :
CI->getZExtValue());
return true;
@@ -363,7 +363,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
// it works...).
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
if (const GlobalVariable *GVar =
- dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false)))
+ dyn_cast_or_null<GlobalVariable>(GA->getAliasedGlobal()))
if (GVar->isThreadLocal())
return false;
@@ -428,7 +428,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
LoadReg = createResultReg(RC);
MachineInstrBuilder LoadMI =
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
addFullAddress(LoadMI, StubAM);
// Ok, back to normal mode.
@@ -547,14 +547,14 @@ redo_gep:
i != e; ++i, ++GTI) {
const Value *Op = *i;
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
- const StructLayout *SL = TD.getStructLayout(STy);
+ const StructLayout *SL = DL.getStructLayout(STy);
Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
continue;
}
      // An array/variable index is always of the form i*S where S is the
// constant scale size. See if we can push the scale into immediates.
- uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+ uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
for (;;) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
@@ -696,8 +696,8 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
(AM.Base.Reg != 0 || AM.IndexReg != 0))
return false;
- // Can't handle DLLImport.
- if (GV->hasDLLImportLinkage())
+  // Can't handle DLLImport.
+ if (GV->hasDLLImportStorageClass())
return false;
// Can't handle TLS.
@@ -750,7 +750,7 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
return false;
unsigned SABIAlignment =
- TD.getABITypeAlignment(S->getValueOperand()->getType());
+ DL.getABITypeAlignment(S->getValueOperand()->getType());
bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment;
MVT VT;
@@ -864,7 +864,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
// Avoid a cross-class copy. This is very unlikely.
if (!SrcRC->contains(DstReg))
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
DstReg).addReg(SrcReg);
// Add register to return instruction.
@@ -876,19 +876,19 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
// a virtual register in the entry block, so now we copy the value out
// and into %rax. We also do the same with %eax for Win32.
if (F.hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+ (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
unsigned Reg = X86MFInfo->getSRetReturnReg();
assert(Reg &&
"SRetReturnReg should have been set in LowerFormalArguments()!");
unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
RetReg).addReg(Reg);
RetRegs.push_back(RetReg);
}
// Now emit the RET.
MachineInstrBuilder MIB =
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
MIB.addReg(RetRegs[i], RegState::Implicit);
return true;
@@ -961,14 +961,14 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
// Handle 'null' like i32/i64 0.
if (isa<ConstantPointerNull>(Op1))
- Op1 = Constant::getNullValue(TD.getIntPtrType(Op0->getContext()));
+ Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
// We have two options: compare with register or immediate. If the RHS of
// the compare is an immediate that we can fold into this compare, use
// CMPri, otherwise use CMPrr.
if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareImmOpc))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareImmOpc))
.addReg(Op0Reg)
.addImm(Op1C->getSExtValue());
return true;
@@ -980,7 +980,7 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
unsigned Op1Reg = getRegForValue(Op1);
if (Op1Reg == 0) return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareOpc))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareOpc))
.addReg(Op0Reg)
.addReg(Op1Reg);
@@ -1004,10 +1004,10 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
unsigned EReg = createResultReg(&X86::GR8RegClass);
unsigned NPReg = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETEr), EReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETEr), EReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::SETNPr), NPReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg);
UpdateValueMap(I, ResultReg);
return true;
@@ -1018,9 +1018,9 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
unsigned NEReg = createResultReg(&X86::GR8RegClass);
unsigned PReg = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETNEr), NEReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETPr), PReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::OR8rr),ResultReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETNEr), NEReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETPr), PReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::OR8rr),ResultReg)
.addReg(PReg).addReg(NEReg);
UpdateValueMap(I, ResultReg);
return true;
@@ -1060,7 +1060,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
if (!X86FastEmitCompare(Op0, Op1, VT))
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(SetCCOpc), ResultReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SetCCOpc), ResultReg);
UpdateValueMap(I, ResultReg);
return true;
}
@@ -1097,11 +1097,11 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
}
unsigned Result32 = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovInst), Result32)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
.addReg(ResultReg);
ResultReg = createResultReg(&X86::GR64RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::SUBREG_TO_REG),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
ResultReg)
.addImm(0).addReg(Result32).addImm(X86::sub_32bit);
} else if (DstVT != MVT::i8) {
@@ -1181,17 +1181,17 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
if (!X86FastEmitCompare(Op0, Op1, VT))
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BranchOpc))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
.addMBB(TrueMBB);
if (Predicate == CmpInst::FCMP_UNE) {
// X86 requires a second branch to handle UNE (and OEQ,
// which is mapped to UNE above).
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JP_4))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_4))
.addMBB(TrueMBB);
}
- FastEmitBranch(FalseMBB, DL);
+ FastEmitBranch(FalseMBB, DbgLoc);
FuncInfo.MBB->addSuccessor(TrueMBB);
return true;
}
@@ -1212,7 +1212,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
if (TestOpc) {
unsigned OpReg = getRegForValue(TI->getOperand(0));
if (OpReg == 0) return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TestOpc))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
.addReg(OpReg).addImm(1);
unsigned JmpOpc = X86::JNE_4;
@@ -1221,9 +1221,9 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
JmpOpc = X86::JE_4;
}
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(JmpOpc))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
.addMBB(TrueMBB);
- FastEmitBranch(FalseMBB, DL);
+ FastEmitBranch(FalseMBB, DbgLoc);
FuncInfo.MBB->addSuccessor(TrueMBB);
return true;
}
@@ -1236,11 +1236,11 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
unsigned OpReg = getRegForValue(BI->getCondition());
if (OpReg == 0) return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8ri))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(OpReg).addImm(1);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JNE_4))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4))
.addMBB(TrueMBB);
- FastEmitBranch(FalseMBB, DL);
+ FastEmitBranch(FalseMBB, DbgLoc);
FuncInfo.MBB->addSuccessor(TrueMBB);
return true;
}
@@ -1297,18 +1297,18 @@ bool X86FastISel::X86SelectShift(const Instruction *I) {
unsigned Op1Reg = getRegForValue(I->getOperand(1));
if (Op1Reg == 0) return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
CReg).addReg(Op1Reg);
// The shift instruction uses X86::CL. If we defined a super-register
// of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
if (CReg != X86::CL)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::KILL), X86::CL)
.addReg(CReg, RegState::Kill);
unsigned ResultReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpReg), ResultReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
.addReg(Op0Reg);
UpdateValueMap(I, ResultReg);
return true;
@@ -1409,38 +1409,38 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
return false;
// Move op0 into low-order input register.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
// Zero-extend or sign-extend into high-order input register.
if (OpEntry.OpSignExtend) {
if (OpEntry.IsOpSigned)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(OpEntry.OpSignExtend));
else {
unsigned Zero32 = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::MOV32r0), Zero32);
// Copy the zero into the appropriate sub/super/identical physical
// register. Unfortunately the operations needed are not uniform enough to
// fit neatly into the table above.
if (VT.SimpleTy == MVT::i16) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), TypeEntry.HighInReg)
.addReg(Zero32, 0, X86::sub_16bit);
} else if (VT.SimpleTy == MVT::i32) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), TypeEntry.HighInReg)
.addReg(Zero32);
} else if (VT.SimpleTy == MVT::i64) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
.addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
}
}
}
// Generate the DIV/IDIV instruction.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
// For i8 remainder, we can't reference AH directly, as we'll end
// up with bogus copies like %R9B = COPY %AH. Reference AX
@@ -1456,11 +1456,11 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), SourceSuperReg).addReg(X86::AX);
// Shift AX right by 8 bits instead of using AH.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SHR16ri),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
ResultSuperReg).addReg(SourceSuperReg).addImm(8);
// Now reference the 8-bit subreg of the result.
@@ -1470,7 +1470,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
// Copy the result out of the physreg if we haven't already.
if (!ResultReg) {
ResultReg = createResultReg(TypeEntry.RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Copy), ResultReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
.addReg(OpEntry.DivRemResultReg);
}
UpdateValueMap(I, ResultReg);
@@ -1508,10 +1508,15 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) {
unsigned Op2Reg = getRegForValue(I->getOperand(2));
if (Op2Reg == 0) return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8rr))
- .addReg(Op0Reg).addReg(Op0Reg);
+  // Selects operate on i1; however, Op0Reg is 8 bits wide and may contain
+  // garbage. Only the least significant bit is guaranteed to be accurate, so
+  // reading more than the LSB may observe a non-zero value even when the LSB
+  // is zero. Therefore, truncate Op0Reg to i1 for the select by performing a
+  // TEST against 1.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+ .addReg(Op0Reg).addImm(1);
unsigned ResultReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(Op1Reg).addReg(Op2Reg);
UpdateValueMap(I, ResultReg);
return true;
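A concrete case for the comment above (the register contents are hypothetical): if the 8-bit register holding the i1 condition contains 0xFE, its LSB is 0. TEST8rr reg,reg computes 0xFE & 0xFE = 0xFE and leaves ZF clear even though the condition is false, while TEST8ri reg,1 computes 0xFE & 1 = 0 and sets ZF, matching the actual i1 value.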
@@ -1526,7 +1531,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) {
unsigned OpReg = getRegForValue(V);
if (OpReg == 0) return false;
unsigned ResultReg = createResultReg(&X86::FR64RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::CVTSS2SDrr), ResultReg)
.addReg(OpReg);
UpdateValueMap(I, ResultReg);
@@ -1545,7 +1550,7 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
unsigned OpReg = getRegForValue(V);
if (OpReg == 0) return false;
unsigned ResultReg = createResultReg(&X86::FR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::CVTSD2SSrr), ResultReg)
.addReg(OpReg);
UpdateValueMap(I, ResultReg);
@@ -1585,7 +1590,7 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
(const TargetRegisterClass*)&X86::GR16_ABCDRegClass :
(const TargetRegisterClass*)&X86::GR32_ABCDRegClass;
unsigned CopyReg = createResultReg(CopyRC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
CopyReg).addReg(InputReg);
InputReg = CopyReg;
}
@@ -1696,6 +1701,8 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
const Value *Op1 = I.getArgOperand(0); // The guard's value.
const AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));
+ MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
+
// Grab the frame index.
X86AddressMode AM;
if (!X86SelectAddress(Slot, AM)) return false;
@@ -1711,12 +1718,12 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
// FIXME may need to add RegState::Debug to any registers produced,
// although ESP/EBP should be the only ones at the moment.
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II), AM).
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM).
addImm(0).addMetadata(DI->getVariable());
return true;
}
case Intrinsic::trap: {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TRAP));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
return true;
}
case Intrinsic::sadd_with_overflow:
@@ -1753,13 +1760,14 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
     // The call to CreateRegs builds two sequential registers, to store both
     // of the returned values.
unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpC), ResultReg)
.addReg(Reg1).addReg(Reg2);
unsigned Opc = X86::SETBr;
if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
Opc = X86::SETOr;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg+1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg + 1);
UpdateValueMap(&I, ResultReg, 2);
return true;
@@ -1833,7 +1841,8 @@ bool X86FastISel::FastLowerArguments() {
// Without this, EmitLiveInCopies may eliminate the livein if its only
// use is a bitcast (which isn't turned into an instruction).
unsigned ResultReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY),
ResultReg).addReg(DstReg, getKillRegState(true));
UpdateValueMap(I, ResultReg);
}
@@ -1863,7 +1872,7 @@ static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget,
const ImmutableCallSite &CS) {
if (Subtarget.is64Bit())
return 0;
- if (Subtarget.isTargetWindows())
+ if (Subtarget.getTargetTriple().isOSMSVCRT())
return 0;
CallingConv::ID CC = CS.getCallingConv();
if (CC == CallingConv::Fast || CC == CallingConv::GHC)
@@ -1903,6 +1912,10 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
if (isVarArg && isWin64)
return false;
+ // Don't know about inalloca yet.
+ if (CS.hasInAllocaArgument())
+ return false;
+
// Fast-isel doesn't know about callee-pop yet.
if (X86::isCalleePop(CC, Subtarget->is64Bit(), isVarArg,
TM.Options.GuaranteedTailCallOpt))
@@ -1958,7 +1971,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) {
PointerType *Ty = cast<PointerType>(ArgVal->getType());
Type *ElementTy = Ty->getElementType();
- unsigned FrameSize = TD.getTypeAllocSize(ElementTy);
+ unsigned FrameSize = DL.getTypeAllocSize(ElementTy);
unsigned FrameAlign = CS.getParamAlignment(AttrInd);
if (!FrameAlign)
FrameAlign = TLI.getByValTypeAlignment(ElementTy);
@@ -2015,7 +2028,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
return false;
if (ArgVT == MVT::x86mmx)
return false;
- unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+ unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
Flags.setOrigAlign(OriginalAlignment);
Args.push_back(ArgReg);
@@ -2040,7 +2053,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackDown))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
.addImm(NumBytes);
// Process argument: walk the register/memloc assignments, inserting
@@ -2104,11 +2117,13 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
// support this.
return false;
+ case CCValAssign::FPExt:
+ llvm_unreachable("Unexpected loc info!");
}
if (VA.isRegLoc()) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
- VA.getLocReg()).addReg(Arg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
RegArgs.push_back(VA.getLocReg());
} else {
unsigned LocMemOffset = VA.getLocMemOffset();
@@ -2142,8 +2157,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// GOT pointer.
if (Subtarget->isPICStyleGOT()) {
unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
- X86::EBX).addReg(Base);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
}
if (Subtarget->is64Bit() && isVarArg && !isWin64) {
@@ -2153,7 +2168,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::MOV8ri),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
X86::AL).addImm(NumXMMRegs);
}
@@ -2166,7 +2181,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
CallOpc = X86::CALL64r;
else
CallOpc = X86::CALL32r;
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc))
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
.addReg(CalleeOp);
} else {
@@ -2200,7 +2215,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
}
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc));
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
if (MemIntName)
MIB.addExternalSymbol(MemIntName, OpFlags);
else
@@ -2225,7 +2240,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Issue CALLSEQ_END
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
const unsigned NumBytesCallee = computeBytesPoppedByCallee(*Subtarget, CS);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp))
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
.addImm(NumBytes).addImm(NumBytesCallee);
// Build info for return calling conv lowering code.
@@ -2271,10 +2286,11 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
CopyVT = MVT::f80;
CopyReg = createResultReg(&X86::RFP80RegClass);
}
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::FpPOP_RETVAL),
- CopyReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(X86::FpPOP_RETVAL), CopyReg);
} else {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY),
CopyReg).addReg(RVLocs[i].getLocReg());
UsedRegs.push_back(RVLocs[i].getLocReg());
}
@@ -2287,11 +2303,11 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
unsigned MemSize = ResVT.getSizeInBits()/8;
int FI = MFI.CreateStackObject(MemSize, MemSize, false);
- addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc)), FI)
.addReg(CopyReg);
Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
- addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg + i), FI);
}
}
@@ -2426,7 +2442,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
unsigned ResultReg = createResultReg(RC);
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg), AM);
return ResultReg;
}
@@ -2434,10 +2450,10 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
}
// MachineConstantPool wants an explicit alignment.
- unsigned Align = TD.getPrefTypeAlignment(C->getType());
+ unsigned Align = DL.getPrefTypeAlignment(C->getType());
if (Align == 0) {
// Alignment of vector types. FIXME!
- Align = TD.getTypeAllocSize(C->getType());
+ Align = DL.getTypeAllocSize(C->getType());
}
// x86-32 PIC requires a PIC base register for constant pools.
@@ -2457,7 +2473,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
// Create the load from the constant pool.
unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align);
unsigned ResultReg = createResultReg(RC);
- addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg),
MCPOffset, PICBase, OpFlag);
@@ -2474,6 +2490,7 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) {
// X86SelectAddress, and TargetMaterializeAlloca.
if (!FuncInfo.StaticAllocaMap.count(C))
return 0;
+ assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
X86AddressMode AM;
if (!X86SelectAddress(C, AM))
@@ -2481,7 +2498,7 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) {
unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
unsigned ResultReg = createResultReg(RC);
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg), AM);
return ResultReg;
}
@@ -2520,7 +2537,7 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
}
unsigned ResultReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
return ResultReg;
}
@@ -2533,7 +2550,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const X86InstrInfo &XII = (const X86InstrInfo&)TII;
- unsigned Size = TD.getTypeAllocSize(LI->getType());
+ unsigned Size = DL.getTypeAllocSize(LI->getType());
unsigned Alignment = LI->getAlignment();
SmallVector<MachineOperand, 8> AddrOps;
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 38a8351..c2c234b 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -39,7 +39,7 @@ namespace {
/// where appropriate.
bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
- virtual const char *getPassName() const { return "X86 Atom LEA Fixup";}
+ const char *getPassName() const override { return "X86 Atom LEA Fixup";}
/// \brief Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
@@ -80,7 +80,7 @@ namespace {
/// \brief Loop over all of the basic blocks,
/// replacing instructions by equivalent LEA instructions
/// if needed and when possible.
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
private:
MachineFunction *MF;
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 48470da..7955ade 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -59,7 +59,7 @@ namespace {
memset(RegMap, 0, sizeof(RegMap));
}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<EdgeBundles>();
AU.addPreservedID(MachineLoopInfoID);
@@ -67,9 +67,9 @@ namespace {
MachineFunctionPass::getAnalysisUsage(AU);
}
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- virtual const char *getPassName() const { return "X86 FP Stackifier"; }
+ const char *getPassName() const override { return "X86 FP Stackifier"; }
private:
const TargetInstrInfo *TII; // Machine instruction info.
@@ -432,7 +432,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
MachineInstr *PrevMI = 0;
if (I != BB.begin())
- PrevMI = prior(I);
+ PrevMI = std::prev(I);
++NumFP; // Keep track of # of pseudo instrs
DEBUG(dbgs() << "\nFPInst:\t" << *MI);
@@ -475,10 +475,10 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
} else {
MachineBasicBlock::iterator Start = I;
// Rewind to first instruction newly inserted.
- while (Start != BB.begin() && prior(Start) != PrevI) --Start;
+ while (Start != BB.begin() && std::prev(Start) != PrevI) --Start;
dbgs() << "Inserted instructions:\n\t";
Start->print(dbgs(), &MF.getTarget());
- while (++Start != llvm::next(I)) {}
+ while (++Start != std::next(I)) {}
}
dumpStack();
);
@@ -905,7 +905,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Kill registers by popping.
if (Kills && I != MBB->begin()) {
- MachineBasicBlock::iterator I2 = llvm::prior(I);
+ MachineBasicBlock::iterator I2 = std::prev(I);
while (StackTop) {
unsigned KReg = getStackEntry(0);
if (!(Kills & (1 << KReg)))
@@ -1671,8 +1671,10 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
break;
}
- case X86::RET:
- case X86::RETI:
+ case X86::RETQ:
+ case X86::RETL:
+ case X86::RETIL:
+ case X86::RETIQ:
// If RET has an FP register use operand, pass the first one in ST(0) and
// the second one in ST(1).
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index a06ba9d..f0ad4d1 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -50,7 +50,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
RegInfo->needsStackRealignment(MF) ||
MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken() || MF.hasMSInlineAsm() ||
+ MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() ||
MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
MMI.callsUnwindInit() || MMI.callsEHReturn());
}
@@ -107,8 +107,10 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
unsigned Opc = MBBI->getOpcode();
switch (Opc) {
default: return 0;
- case X86::RET:
- case X86::RETI:
+ case X86::RETL:
+ case X86::RETQ:
+ case X86::RETIL:
+ case X86::RETIQ:
case X86::TCRETURNdi:
case X86::TCRETURNri:
case X86::TCRETURNmi:
@@ -205,7 +207,7 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
unsigned StackPtr, uint64_t *NumBytes = NULL) {
if (MBBI == MBB.begin()) return;
- MachineBasicBlock::iterator PI = prior(MBBI);
+ MachineBasicBlock::iterator PI = std::prev(MBBI);
unsigned Opc = PI->getOpcode();
if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
@@ -233,7 +235,7 @@ void mergeSPUpdatesDown(MachineBasicBlock &MBB,
if (MBBI == MBB.end()) return;
- MachineBasicBlock::iterator NI = llvm::next(MBBI);
+ MachineBasicBlock::iterator NI = std::next(MBBI);
if (NI == MBB.end()) return;
unsigned Opc = NI->getOpcode();
@@ -266,8 +268,8 @@ static int mergeSPUpdates(MachineBasicBlock &MBB,
(!doMergeWithPrevious && MBBI == MBB.end()))
return 0;
- MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI;
- MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : llvm::next(MBBI);
+ MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
+ MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : std::next(MBBI);
unsigned Opc = PI->getOpcode();
int Offset = 0;
@@ -302,12 +304,14 @@ static bool isEAXLiveIn(MachineFunction &MF) {
return false;
}
-void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
- MCSymbol *Label,
- unsigned FramePtr) const {
+void X86FrameLowering::emitCalleeSavedFrameMoves(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ unsigned FramePtr) const {
+ MachineFunction &MF = *MBB.getParent();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+ const X86InstrInfo &TII = *TM.getInstrInfo();
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
@@ -361,7 +365,9 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
continue;
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
- MMI.addFrameInst(MCCFIInstruction::createOffset(Label, DwarfReg, Offset));
+ unsigned CFIIndex =
+ MMI.addFrameInst(MCCFIInstruction::createOffset(0, DwarfReg, Offset));
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
}
}
@@ -373,8 +379,9 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
static bool usesTheStack(const MachineFunction &MF) {
const MachineRegisterInfo &MRI = MF.getRegInfo();
- for (MachineRegisterInfo::reg_iterator ri = MRI.reg_begin(X86::EFLAGS),
- re = MRI.reg_end(); ri != re; ++ri)
+ for (MachineRegisterInfo::reg_instr_iterator
+ ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end();
+ ri != re; ++ri)
if (ri->isCopy())
return true;
@@ -501,19 +508,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (needsFrameMoves) {
// Mark the place where EBP/RBP was saved.
- MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
- .addSym(FrameLabel);
-
// Define the current CFA rule to use the provided offset.
assert(StackSize);
- MMI.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(FrameLabel, 2 * stackGrowth));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(0, 2 * stackGrowth));
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
// Change the rule for the FramePtr to be an "offset" rule.
unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
- MMI.addFrameInst(MCCFIInstruction::createOffset(FrameLabel, DwarfFramePtr,
- 2 * stackGrowth));
+ CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createOffset(0, DwarfFramePtr, 2 * stackGrowth));
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
// Update EBP with the new base value.
@@ -524,18 +531,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (needsFrameMoves) {
// Mark effective beginning of when frame pointer becomes valid.
- MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
- .addSym(FrameLabel);
-
// Define the current CFA to use the EBP/RBP register.
unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
- MMI.addFrameInst(
- MCCFIInstruction::createDefCfaRegister(FrameLabel, DwarfFramePtr));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(0, DwarfFramePtr));
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
// Mark the FramePtr as live-in in every block except the entry.
- for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+ for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
I != E; ++I)
I->addLiveIn(FramePtr);
} else {
@@ -555,13 +560,12 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (!HasFP && needsFrameMoves) {
// Mark callee-saved push instruction.
- MCSymbol *Label = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label);
-
// Define the current CFA rule to use the provided offset.
assert(StackSize);
- MMI.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(Label, StackOffset));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
StackOffset += stackGrowth;
}
}
@@ -606,16 +610,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// responsible for adjusting the stack pointer. Touching the stack at 4K
// increments is necessary to ensure that the guard pages used by the OS
// virtual memory manager are allocated in correct sequence.
- if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetEnvMacho()) {
+ if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetMacho()) {
const char *StackProbeSymbol;
- bool isSPUpdateNeeded = false;
if (Is64Bit) {
- if (STI.isTargetCygMing())
- StackProbeSymbol = "___chkstk";
- else {
+ if (STI.isTargetCygMing()) {
+ StackProbeSymbol = "___chkstk_ms";
+ } else {
StackProbeSymbol = "__chkstk";
- isSPUpdateNeeded = true;
}
} else if (STI.isTargetCygMing())
StackProbeSymbol = "_alloca";
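
As an aside, the probe-symbol selection rewritten above can be summarized as follows; a rough sketch, not part of the patch, and the 32-bit non-CygMing name is an assumption taken from elsewhere in the file rather than from this hunk:

// Rough sketch: which stack-probe routine the prologue calls once the frame
// is at least 4096 bytes on a Windows target.
const char *stackProbeSymbol(bool Is64Bit, bool IsCygMing) {
  if (Is64Bit)
    return IsCygMing ? "___chkstk_ms" : "__chkstk"; // neither adjusts %rsp itself
  // 32-bit: cygwin/mingw calls _alloca; the MSVC-style name ("_chkstk") is an
  // assumption, it is not shown in the hunk above.
  return IsCygMing ? "_alloca" : "_chkstk";
}
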
@@ -657,15 +659,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
- // MSVC x64's __chkstk does not adjust %rsp itself.
- // It also does not clobber %rax so we can reuse it when adjusting %rsp.
- if (isSPUpdateNeeded) {
+ if (Is64Bit) {
+ // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
+ // themselves. It also does not clobber %rax so we can reuse it when
+ // adjusting %rsp.
BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr)
.addReg(StackPtr)
.addReg(X86::RAX)
.setMIFlag(MachineInstr::FrameSetup);
}
-
if (isEAXAlive) {
// Restore EAX
MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
@@ -692,20 +694,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) {
// Mark end of stack pointer adjustment.
- MCSymbol *Label = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
- .addSym(Label);
-
if (!HasFP && NumBytes) {
// Define the current CFA rule to use the provided offset.
assert(StackSize);
- MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(
- Label, -StackSize + stackGrowth));
+ unsigned CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(0, -StackSize + stackGrowth));
+
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
}
// Emit DWARF info specifying the offsets of the callee-saved registers.
if (PushedRegs)
- emitCalleeSavedFrameMoves(MF, Label, HasFP ? FramePtr : StackPtr);
+ emitCalleeSavedFrameMoves(MBB, MBBI, DL, HasFP ? FramePtr : StackPtr);
}
}
@@ -730,8 +731,10 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
switch (RetOpcode) {
default:
llvm_unreachable("Can only insert epilog into returning blocks");
- case X86::RET:
- case X86::RETI:
+ case X86::RETQ:
+ case X86::RETL:
+ case X86::RETIL:
+ case X86::RETIQ:
case X86::TCRETURNdi:
case X86::TCRETURNri:
case X86::TCRETURNmi:
@@ -781,7 +784,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// Skip the callee-saved pop instructions.
while (MBBI != MBB.begin()) {
- MachineBasicBlock::iterator PI = prior(MBBI);
+ MachineBasicBlock::iterator PI = std::prev(MBBI);
unsigned Opc = PI->getOpcode();
if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE &&
@@ -883,12 +886,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
addReg(JumpTarget.getReg(), RegState::Kill);
}
- MachineInstr *NewMI = prior(MBBI);
+ MachineInstr *NewMI = std::prev(MBBI);
NewMI->copyImplicitOps(MF, MBBI);
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
- } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) &&
+ } else if ((RetOpcode == X86::RETQ || RetOpcode == X86::RETL ||
+ RetOpcode == X86::RETIQ || RetOpcode == X86::RETIL) &&
(X86FI->getTCReturnAddrDelta() < 0)) {
// Add the return addr area delta back since we are not tail calling.
int delta = -1*X86FI->getTCReturnAddrDelta();
@@ -1163,7 +1167,7 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
if (MF.getFunction()->isVarArg())
report_fatal_error("Segmented stacks do not support vararg functions.");
if (!STI.isTargetLinux() && !STI.isTargetDarwin() &&
- !STI.isTargetWin32() && !STI.isTargetFreeBSD())
+ !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD())
report_fatal_error("Segmented stacks not supported on this platform.");
MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
@@ -1207,6 +1211,9 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
} else if (STI.isTargetDarwin()) {
TlsReg = X86::GS;
TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
+ } else if (STI.isTargetWin64()) {
+ TlsReg = X86::GS;
+ TlsOffset = 0x28; // pvArbitrary, reserved for application use
} else if (STI.isTargetFreeBSD()) {
TlsReg = X86::FS;
TlsOffset = 0x18;
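
A compact view of the per-target TLS slot consulted by the segmented-stack check touched above; a sketch that lists only the cases visible in this hunk (not part of the patch):

#include <cstdint>

struct TlsSlot { const char *Seg; uint32_t Offset; };

TlsSlot stackLimitSlot(bool IsDarwin, bool IsWin64, bool IsFreeBSD) {
  if (IsDarwin)  return {"gs", 0x60 + 90 * 8}; // steals pthread TLS slot 90
  if (IsWin64)   return {"gs", 0x28};          // TEB slot reserved for application use
  if (IsFreeBSD) return {"fs", 0x18};
  return {nullptr, 0};                         // other targets handled elsewhere
}
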
@@ -1244,7 +1251,7 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
.addImm(1).addReg(0).addImm(-StackSize).addReg(0);
- if (STI.isTargetLinux() || STI.isTargetWin32()) {
+ if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64()) {
BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
.addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
} else if (STI.isTargetDarwin()) {
@@ -1552,7 +1559,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// sure we restore the stack pointer immediately after the call, there may
// be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
MachineBasicBlock::iterator B = MBB.begin();
- while (I != B && !llvm::prior(I)->isCall())
+ while (I != B && !std::prev(I)->isCall())
--I;
MBB.insert(I, New);
}
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 3d3b011..f0db8cb 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -15,7 +15,6 @@
#define X86_FRAMELOWERING_H
#include "X86Subtarget.h"
-#include "llvm/MC/MCDwarf.h"
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
@@ -34,41 +33,42 @@ public:
TM(tm), STI(sti) {
}
- void emitCalleeSavedFrameMoves(MachineFunction &MF, MCSymbol *Label,
+ void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
unsigned FramePtr) const;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
- void emitPrologue(MachineFunction &MF) const;
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ void emitPrologue(MachineFunction &MF) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- void adjustForSegmentedStacks(MachineFunction &MF) const;
+ void adjustForSegmentedStacks(MachineFunction &MF) const override;
- void adjustForHiPEPrologue(MachineFunction &MF) const;
+ void adjustForHiPEPrologue(MachineFunction &MF) const override;
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS = NULL) const;
+ RegScavenger *RS = NULL) const override;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const;
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const override;
- bool hasFP(const MachineFunction &MF) const;
- bool hasReservedCallFrame(const MachineFunction &MF) const;
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
- int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+ int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const;
+ unsigned &FrameReg) const override;
void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const;
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 36d1690..3e45adb 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -141,7 +141,7 @@ namespace {
/// ISel - X86 specific code to select X86 machine instructions for
/// SelectionDAG operations.
///
- class X86DAGToDAGISel : public SelectionDAGISel {
+ class X86DAGToDAGISel final : public SelectionDAGISel {
/// Subtarget - Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
@@ -156,15 +156,15 @@ namespace {
Subtarget(&tm.getSubtarget<X86Subtarget>()),
OptForSize(false) {}
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "X86 DAG->DAG Instruction Selection";
}
- virtual void EmitFunctionEntryCode();
+ void EmitFunctionEntryCode() override;
- virtual bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const;
+ bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
- virtual void PreprocessISelDAG();
+ void PreprocessISelDAG() override;
inline bool immSext8(SDNode *N) const {
return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue());
@@ -181,7 +181,7 @@ namespace {
#include "X86GenDAGISel.inc"
private:
- SDNode *Select(SDNode *N);
+ SDNode *Select(SDNode *N) override;
SDNode *SelectGather(SDNode *N, unsigned Opc);
SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT);
@@ -219,9 +219,9 @@ namespace {
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
- virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
- char ConstraintCode,
- std::vector<SDValue> &OutOps);
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps) override;
void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
@@ -344,7 +344,7 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
// addl %gs:0, %eax
// if the block also has an access to a second TLS address this will save
// a load.
- // FIXME: This is probably also true for non TLS addresses.
+ // FIXME: This is probably also true for non-TLS addresses.
if (Op1.getOpcode() == X86ISD::Wrapper) {
SDValue Val = Op1.getOperand(0);
if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 081c558..2a35061 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15,9 +15,9 @@
#define DEBUG_TYPE "x86-isel"
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
-#include "X86.h"
#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallSet.h"
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -39,12 +40,10 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
@@ -180,7 +179,7 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
bool is64Bit = Subtarget->is64Bit();
- if (Subtarget->isTargetEnvMacho()) {
+ if (Subtarget->isTargetMacho()) {
if (is64Bit)
return new X86_64MachoTargetObjectFile();
return new TargetLoweringObjectFileMachO();
@@ -190,7 +189,9 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
return new X86LinuxTargetObjectFile();
if (Subtarget->isTargetELF())
return new TargetLoweringObjectFileELF();
- if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+ if (Subtarget->isTargetKnownWindowsMSVC())
+ return new X86WindowsTargetObjectFile();
+ if (Subtarget->isTargetCOFF())
return new TargetLoweringObjectFileCOFF();
llvm_unreachable("unknown subtarget type");
}
@@ -249,7 +250,7 @@ void X86TargetLowering::resetOperationActions() {
addBypassSlowDiv(64, 16);
}
- if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
+ if (Subtarget->isTargetKnownWindowsMSVC()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
@@ -274,7 +275,7 @@ void X86TargetLowering::resetOperationActions() {
// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
setUseUnderscoreSetJmp(false);
setUseUnderscoreLongJmp(false);
- } else if (Subtarget->isTargetMingw()) {
+ } else if (Subtarget->isTargetWindowsGNU()) {
// MS runtime is weird: it exports _setjmp, but longjmp!
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(false);
@@ -504,7 +505,9 @@ void X86TargetLowering::resetOperationActions() {
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
- setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
+
+ if (!Subtarget->hasMOVBE())
+ setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// These should be promoted to a larger select which is supported.
setOperationAction(ISD::SELECT , MVT::i1 , Promote);
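
The BSWAP change in this hunk keeps i16 byte swaps legal only when the target has MOVBE; otherwise they are still expanded. The operation itself is trivial, shown here as a standalone sketch (not part of the patch):

#include <cstdint>

uint16_t bswap16(uint16_t x) {
  return (uint16_t)((x << 8) | (x >> 8)); // exchange the two bytes
}
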
@@ -632,7 +635,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho())
+ if (Subtarget->isOSWindows() && !Subtarget->isTargetMacho())
setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
MVT::i64 : MVT::i32, Custom);
else if (TM.Options.EnableSegmentedStacks)
@@ -1151,9 +1154,12 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
setOperationAction(ISD::FABS, MVT::v4f64, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
-
+ // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
+ // even though v8i16 is a legal type.
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
@@ -1306,9 +1312,15 @@ void X86TargetLowering::resetOperationActions() {
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+ addRegisterClass(MVT::i1, &X86::VK1RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SETCC, MVT::i1, Custom);
+ setOperationAction(ISD::XOR, MVT::i1, Legal);
+ setOperationAction(ISD::OR, MVT::i1, Legal);
+ setOperationAction(ISD::AND, MVT::i1, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal);
setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
@@ -1352,11 +1364,12 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
- setOperationAction(ISD::TRUNCATE, MVT::i1, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
@@ -1370,12 +1383,15 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
@@ -1548,14 +1564,13 @@ void X86TargetLowering::resetOperationActions() {
EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
if (!VT.isVector())
- return MVT::i8;
+ return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
- const TargetMachine &TM = getTargetMachine();
- if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512())
+ if (Subtarget->hasAVX512())
switch(VT.getVectorNumElements()) {
case 8: return MVT::v8i1;
case 16: return MVT::v16i1;
- }
+ }
return VT.changeVectorElementTypeToInteger();
}
@@ -1661,7 +1676,9 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
}
bool
-X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+ unsigned,
+ bool *Fast) const {
if (Fast)
*Fast = Subtarget->isUnalignedMemAccessFast();
return true;
@@ -1832,6 +1849,9 @@ X86TargetLowering::LowerReturn(SDValue Chain,
else if (VA.getLocInfo() == CCValAssign::BCvt)
ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
+ assert(VA.getLocInfo() != CCValAssign::FPExt &&
+ "Unexpected FP-extend for return value.");
+
// If this is x86-64, and we disabled SSE, we can't return FP values,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
@@ -1886,7 +1906,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+ (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
unsigned Reg = FuncInfo->getSRetReturnReg();
@@ -2175,7 +2195,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
MachineFrameInfo *MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget->is64Bit();
- bool IsWindows = Subtarget->isTargetWindows();
bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
@@ -2222,6 +2241,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
RC = &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
+ else if (RegVT == MVT::i1)
+ RC = &X86::VK1RegClass;
else if (RegVT == MVT::v8i1)
RC = &X86::VK8RegClass;
else if (RegVT == MVT::v16i1)
@@ -2270,7 +2291,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// Save the argument into a virtual register so that we can access it
// from the return points.
if (MF.getFunction()->hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+ (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
@@ -2420,7 +2441,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
- if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+ if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+ !Subtarget->getTargetTriple().isOSMSVCRT() &&
argsAreStructReturn(Ins) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
@@ -2509,7 +2531,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
- bool IsWindows = Subtarget->isTargetWindows();
StructReturnType SR = callIsStructReturn(Outs);
bool IsSibcall = false;
@@ -2570,9 +2591,21 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
X86Info->setTCReturnAddrDelta(FPDiff);
}
+ unsigned NumBytesToPush = NumBytes;
+ unsigned NumBytesToPop = NumBytes;
+
+ // If we have an inalloca argument, all stack space has already been allocated
+ // for us and will be right at the top of the stack. We don't support multiple
+ // arguments passed in memory when using inalloca.
+ if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
+ NumBytesToPush = 0;
+ assert(ArgLocs.back().getLocMemOffset() == 0 &&
+ "an inalloca argument must be the only memory argument");
+ }
+
if (!IsSibcall)
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(
+ Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
@@ -2589,10 +2622,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const X86RegisterInfo *RegInfo =
static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ // Skip inalloca arguments, they have already been written.
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ if (Flags.isInAlloca())
+ continue;
+
CCValAssign &VA = ArgLocs[i];
EVT RegVT = VA.getLocVT();
SDValue Arg = OutVals[i];
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
bool isByVal = Flags.isByVal();
// Promote the value if needed.
@@ -2787,7 +2824,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// We should use extra load for direct calls to dllimported functions in
// non-JIT mode.
const GlobalValue *GV = G->getGlobal();
- if (!GV->hasDLLImportLinkage()) {
+ if (!GV->hasDLLImportStorageClass()) {
unsigned char OpFlags = 0;
bool ExtraLoad = false;
unsigned WrapperKind = ISD::DELETED_NODE;
@@ -2859,8 +2896,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 8> Ops;
if (!IsSibcall && isTailCall) {
- Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), InFlag, dl);
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytesToPop, true),
+ DAG.getIntPtrConstant(0, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
@@ -2899,25 +2937,26 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
- unsigned NumBytesForCalleeToPush;
+ unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
getTargetMachine().Options.GuaranteedTailCallOpt))
- NumBytesForCalleeToPush = NumBytes; // Callee pops everything
- else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+ NumBytesForCalleeToPop = NumBytes; // Callee pops everything
+ else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+ !Subtarget->getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
// For MSVC Win32 targets, the caller pops the hidden struct pointer.
- NumBytesForCalleeToPush = 4;
+ NumBytesForCalleeToPop = 4;
else
- NumBytesForCalleeToPush = 0; // Callee pops nothing.
+ NumBytesForCalleeToPop = 0; // Callee pops nothing.
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
- DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+ DAG.getIntPtrConstant(NumBytesToPop, true),
+ DAG.getIntPtrConstant(NumBytesForCalleeToPop,
true),
InFlag, dl);
InFlag = Chain.getValue(1);
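
The rename from NumBytesForCalleeToPush to NumBytesForCalleeToPop above does not change the decision logic; a standalone sketch of that logic for reference (parameter names are mine, not the patch's):

// Sketch: bytes popped by the callee at CALLSEQ_END, mirroring the branches above.
unsigned calleePopBytes(bool CalleePopConv, bool Is64Bit, bool IsTailCallConv,
                        bool IsMSVCRT, bool IsStackStructReturn,
                        unsigned NumBytes) {
  if (CalleePopConv)
    return NumBytes;  // callee pops everything (stdcall-style / guaranteed tail call)
  if (!Is64Bit && !IsTailCallConv && !IsMSVCRT && IsStackStructReturn)
    return 4;         // callee pops the hidden sret pointer (Darwin/Linux/MinGW x86-32)
  return 0;           // callee pops nothing
}
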
@@ -3092,9 +3131,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (isCalleeStructRet || isCallerStructRet)
return false;
- // An stdcall caller is expected to clean up its arguments; the callee
- // isn't going to do that.
- if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
+ // An stdcall/thiscall caller is expected to clean up its arguments; the
+ // callee isn't going to do that.
+ // FIXME: this is more restrictive than needed. We could produce a tailcall
+ // when the stack adjustment matches. For example, with a thiscall that takes
+ // only one argument.
+ if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
+ CallerCC == CallingConv::X86_ThisCall))
return false;
// Do not sibcall optimize vararg calls unless all arguments are passed via
@@ -3415,6 +3458,24 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
}
}
+/// \brief Return true if the condition is an unsigned comparison operation.
+static bool isX86CCUnsigned(unsigned X86CC) {
+ switch (X86CC) {
+ default: llvm_unreachable("Invalid integer condition!");
+ case X86::COND_E: return true;
+ case X86::COND_G: return false;
+ case X86::COND_GE: return false;
+ case X86::COND_L: return false;
+ case X86::COND_LE: return false;
+ case X86::COND_NE: return true;
+ case X86::COND_B: return true;
+ case X86::COND_A: return true;
+ case X86::COND_BE: return true;
+ case X86::COND_AE: return true;
+ }
+ llvm_unreachable("covered switch fell through?!");
+}
+
/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
/// specific condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
@@ -3533,6 +3594,18 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
return false;
}
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0 || BitSize > 64)
+ return false;
+ return true;
+}
+
/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
@@ -4208,7 +4281,7 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned LaneSize = NumElts/NumLanes;
// 2 or 4 elements in one lane
-
+
SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
for (unsigned l = 0; l != NumElts; l += LaneSize) {
for (unsigned i = 0; i != LaneSize; ++i) {
@@ -4745,6 +4818,13 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
+ } else if (VT.getScalarType() == MVT::i1) {
+ assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+ Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+ Ops, VT.getVectorNumElements());
} else
llvm_unreachable("Unexpected vector type");
@@ -5727,16 +5807,20 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
bool AllContants = true;
uint64_t Immediate = 0;
+ int NonConstIdx = -1;
+ bool IsSplat = true;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (In.getOpcode() == ISD::UNDEF)
continue;
if (!isa<ConstantSDNode>(In)) {
AllContants = false;
- break;
+ NonConstIdx = idx;
}
- if (cast<ConstantSDNode>(In)->getZExtValue())
+ else if (cast<ConstantSDNode>(In)->getZExtValue())
Immediate |= (1ULL << idx);
+ if (In != Op.getOperand(0))
+ IsSplat = false;
}
if (AllContants) {
@@ -5746,63 +5830,19 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(0));
}
- // Splat vector (with undefs)
- SDValue In = Op.getOperand(0);
- for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) {
- if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF)
- llvm_unreachable("Unsupported predicate operation");
- }
-
- SDValue EFLAGS, X86CC;
- if (In.getOpcode() == ISD::SETCC) {
- SDValue Op0 = In.getOperand(0);
- SDValue Op1 = In.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get();
- bool isFP = Op1.getValueType().isFloatingPoint();
- unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
-
- assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation");
-
- X86CC = DAG.getConstant(X86CCVal, MVT::i8);
- EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG);
- EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
- } else if (In.getOpcode() == X86ISD::SETCC) {
- X86CC = In.getOperand(0);
- EFLAGS = In.getOperand(1);
- } else {
- // The algorithm:
- // Bit1 = In & 0x1
- // if (Bit1 != 0)
- // ZF = 0
- // else
- // ZF = 1
- // if (ZF == 0)
- // res = allOnes ### CMOVNE -1, %res
- // else
- // res = allZero
- MVT InVT = In.getSimpleValueType();
- SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT));
- EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG);
- X86CC = DAG.getConstant(X86::COND_NE, MVT::i8);
- }
-
- if (VT == MVT::v16i1) {
- SDValue Cst1 = DAG.getConstant(-1, MVT::i16);
- SDValue Cst0 = DAG.getConstant(0, MVT::i16);
- SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16,
- Cst0, Cst1, X86CC, EFLAGS);
- return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
- }
-
- if (VT == MVT::v8i1) {
- SDValue Cst1 = DAG.getConstant(-1, MVT::i32);
- SDValue Cst0 = DAG.getConstant(0, MVT::i32);
- SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32,
- Cst0, Cst1, X86CC, EFLAGS);
- CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp);
- return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
- }
- llvm_unreachable("Unsupported predicate operation");
+ if (!IsSplat && (NonConstIdx != 0))
+ llvm_unreachable("Unsupported BUILD_VECTOR operation");
+ MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
+ SDValue Select;
+ if (IsSplat)
+ Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
+ DAG.getConstant(-1, SelectVT),
+ DAG.getConstant(0, SelectVT));
+ else
+ Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
+ DAG.getConstant((Immediate | 1), SelectVT),
+ DAG.getConstant(Immediate, SelectVT));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Select);
}
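
In the rewritten vXi1 BUILD_VECTOR lowering above, constant elements are folded into an immediate bitmask and the single non-constant element (index 0) is handled with a SELECT. A standalone sketch of the immediate packing, with element values modeled as ints and -1 standing in for a non-constant operand (not part of the patch):

#include <cstdint>
#include <vector>

uint64_t packMaskImmediate(const std::vector<int> &Elts) {
  uint64_t Immediate = 0;
  for (size_t idx = 0; idx < Elts.size(); ++idx)
    if (Elts[idx] == 1)            // constant true sets its bit
      Immediate |= (1ULL << idx);  // false, undef or non-constant leave it clear
  return Immediate;
}
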
SDValue
@@ -5995,8 +6035,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
- if (VT.is256BitVector()) {
- SmallVector<SDValue, 32> V;
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ SmallVector<SDValue, 64> V;
for (unsigned i = 0; i != NumElems; ++i)
V.push_back(Op.getOperand(i));
@@ -6008,7 +6048,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
NumElems/2);
// Recreate the wider vector with the lower and upper part.
- return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ if (VT.is256BitVector())
+ return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+ return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
@@ -6157,14 +6199,27 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
if(ResVT.is256BitVector())
return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+ if (Op.getNumOperands() == 4) {
+ MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
+ ResVT.getVectorNumElements()/2);
+ SDValue V3 = Op.getOperand(2);
+ SDValue V4 = Op.getOperand(3);
+ return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
+ Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
+ }
return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getNumOperands() == 2);
+ MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
+ assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
+ (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
+ Op.getNumOperands() == 4)));
- // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit vectors
+ // AVX can use the vinsertf128 instruction to create 256-bit vectors
// from two other 128-bit ones.
+
+ // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
return LowerAVXCONCAT_VECTORS(Op, DAG);
}
@@ -6228,6 +6283,59 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
}
+/// In vector type \p VT, return true if the element at index \p InputIdx
+/// falls on a different 128-bit lane than \p OutputIdx.
+static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
+ unsigned OutputIdx) {
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
+}
+
+/// Generate a PSHUFB if possible. Selects elements from \p V1 according to
+/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
+/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
+/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
+/// zero.
+static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
+ SelectionDAG &DAG) {
+ MVT VT = V1.getSimpleValueType();
+ assert(VT.is128BitVector() || VT.is256BitVector());
+
+ MVT EltVT = VT.getVectorElementType();
+ unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
+ unsigned NumElts = VT.getVectorNumElements();
+
+ SmallVector<SDValue, 32> PshufbMask;
+ for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
+ int InputIdx = MaskVals[OutputIdx];
+ unsigned InputByteIdx;
+
+ if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
+ InputByteIdx = 0x80;
+ else {
+ // Cross lane is not allowed.
+ if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
+ return SDValue();
+ InputByteIdx = InputIdx * EltSizeInBytes;
+ // Index is a byte offset within the 128-bit lane.
+ InputByteIdx &= 0xf;
+ }
+
+ for (unsigned j = 0; j < EltSizeInBytes; ++j) {
+ PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
+ if (InputByteIdx != 0x80)
+ ++InputByteIdx;
+ }
+ }
+
+ MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
+ if (ShufVT != VT)
+ V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
+ return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT,
+ PshufbMask.data(), PshufbMask.size()));
+}
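
The new getPSHUFB helper above turns a per-element shuffle mask into a per-byte PSHUFB control mask. A standalone sketch of that computation for a single 128-bit v8i16 input (illustrative only, not part of the patch):

#include <cstdint>
#include <vector>

std::vector<uint8_t> buildPshufbMask(const std::vector<int> &MaskVals) {
  const unsigned EltSizeInBytes = 2;                    // v8i16: 16-bit elements
  const unsigned NumElts = (unsigned)MaskVals.size();   // 8
  std::vector<uint8_t> PshufbMask;
  for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
    int InputIdx = MaskVals[OutputIdx];
    unsigned InputByteIdx;
    if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
      InputByteIdx = 0x80;                              // undef / out of range: zero the byte
    else
      InputByteIdx = (InputIdx * EltSizeInBytes) & 0xf; // byte offset within the lane
    for (unsigned j = 0; j < EltSizeInBytes; ++j) {
      PshufbMask.push_back((uint8_t)InputByteIdx);
      if (InputByteIdx != 0x80)
        ++InputByteIdx;                                 // subsequent bytes of the element
    }
  }
  return PshufbMask;
}
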
+
// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all] pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
@@ -6245,8 +6353,12 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
// Determine if more than 1 of the words in each of the low and high quadwords
// of the result come from the same quadword of one of the two inputs. Undef
// mask values count as coming from any quadword, for better codegen.
+ //
+ // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
+ // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
unsigned LoQuad[] = { 0, 0, 0, 0 };
unsigned HiQuad[] = { 0, 0, 0, 0 };
+ // Indices of quads used.
std::bitset<4> InputQuads;
for (unsigned i = 0; i < 8; ++i) {
unsigned *Quad = i < 4 ? LoQuad : HiQuad;
@@ -6283,7 +6395,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
// For SSSE3, If all 8 words of the result come from only 1 quadword of each
// of the two input vectors, shuffle them into one input vector so only a
- // single pshufb instruction is necessary. If There are more than 2 input
+ // single pshufb instruction is necessary. If there are more than 2 input
// quads, disable the next transformation since it does not help SSSE3.
bool V1Used = InputQuads[0] || InputQuads[1];
bool V2Used = InputQuads[2] || InputQuads[3];
@@ -6375,34 +6487,14 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
// mask, and elements that come from V1 in the V2 mask, so that the two
// results can be OR'd together.
bool TwoInputs = V1Used && V2Used;
- for (unsigned i = 0; i != 8; ++i) {
- int EltIdx = MaskVals[i] * 2;
- int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
- int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
- pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
- }
- V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
- V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, &pshufbMask[0], 16));
+ V1 = getPSHUFB(MaskVals, V1, dl, DAG);
if (!TwoInputs)
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
// Calculate the shuffle mask for the second input, shuffle it, and
// OR it with the first shuffled input.
- pshufbMask.clear();
- for (unsigned i = 0; i != 8; ++i) {
- int EltIdx = MaskVals[i] * 2;
- int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
- int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
- pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
- }
- V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
- V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
- DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, &pshufbMask[0], 16));
+ CommuteVectorShuffleMask(MaskVals, 8);
+ V2 = getPSHUFB(MaskVals, V2, dl, DAG);
V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
}
@@ -6484,6 +6576,25 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
return NewV;
}
+/// \brief v16i16 shuffles
+///
+/// FIXME: We only support generation of a single pshufb currently. We can
+/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
+/// well (e.g 2 x pshufb + 1 x por).
+static SDValue
+LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ SDLoc dl(SVOp);
+
+ if (V2.getOpcode() != ISD::UNDEF)
+ return SDValue();
+
+ SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+ return getPSHUFB(MaskVals, V1, dl, DAG);
+}
+
// v16i8 shuffles - Prefer shuffles in the following order:
// 1. [ssse3] 1 x pshufb
// 2. [ssse3] 2 x pshufb + 1 x por
@@ -6642,22 +6753,7 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
CommuteVectorShuffleMask(MaskVals, 32);
V1 = V2;
}
- SmallVector<SDValue, 32> pshufbMask;
- for (unsigned i = 0; i != 32; i++) {
- int EltIdx = MaskVals[i];
- if (EltIdx < 0 || EltIdx >= 32)
- EltIdx = 0x80;
- else {
- if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
- // Cross lane is not allowed.
- return SDValue();
- EltIdx &= 0xf;
- }
- pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
- }
- return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v32i8, &pshufbMask[0], 32));
+ return getPSHUFB(MaskVals, V1, dl, DAG);
}
/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
@@ -7281,7 +7377,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (V1IsUndef && V2IsUndef)
return DAG.getUNDEF(VT);
- assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
+ // When we create a shuffle node we put the UNDEF node as the second operand,
+ // but in some cases the first operand may be transformed to UNDEF.
+ // In this case we should just commute the node.
+ if (V1IsUndef)
+ return CommuteVectorShuffle(SVOp, DAG);
// Vector shuffle lowering takes 3 steps:
//
@@ -7450,7 +7550,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
CommuteVectorShuffleMask(M, NumElems);
std::swap(V1, V2);
std::swap(V1IsSplat, V2IsSplat);
- Commuted = false;
if (isUNPCKLMask(M, VT, HasInt256))
return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
@@ -7538,8 +7637,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
return DAG.getNode(X86ISD::VPERMV, dl, VT,
DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
- return DAG.getNode(X86ISD::VPERMV3, dl, VT,
- DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2);
+ return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
+ DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
}
//===--------------------------------------------------------------------===//
@@ -7555,6 +7654,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return NewOp;
}
+ if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
if (VT == MVT::v16i8) {
SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
if (NewOp.getNode())
@@ -7641,6 +7746,39 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
return SDValue();
}
+/// Extract one bit from mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0);
+ SDLoc dl(Vec);
+ MVT VecVT = Vec.getSimpleValueType();
+ SDValue Idx = Op.getOperand(1);
+ MVT EltVT = Op.getSimpleValueType();
+
+ assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+
+ // A variable index can't be handled in mask registers;
+ // extend the vector to VR512.
+ if (!isa<ConstantSDNode>(Idx)) {
+ MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
+ SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ ExtVT.getVectorElementType(), Ext, Idx);
+ return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+ }
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ const TargetRegisterClass* rc = getRegClassFor(VecVT);
+ unsigned MaxSift = rc->getSize()*8 - 1;
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ DAG.getConstant(MaxSift - IdxVal, MVT::i8));
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ DAG.getConstant(MaxSift, MVT::i8));
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
+ DAG.getIntPtrConstant(0));
+}
+
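
// Scalar model of the shift pair used by ExtractBitFromMaskVector above:
// VSHLI by (W-1-Idx) followed by VSRLI by (W-1) isolates bit Idx of a W-bit
// mask register. A minimal sketch with W = 16 (a v16i1 mask); names are
// illustrative only.
#include <cassert>
#include <cstdint>

static bool extractMaskBit(uint16_t Mask, unsigned Idx) {
  const unsigned MaxShift = 15;                     // W - 1
  uint16_t V = uint16_t(Mask << (MaxShift - Idx));  // discard the higher bits
  return (V >> MaxShift) & 1;                       // wanted bit is now at W-1
}

int main() {
  assert(extractMaskBit(0x0020, 5));   // bit 5 set
  assert(!extractMaskBit(0x0020, 4));  // bit 4 clear
  return 0;
}
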
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
@@ -7648,6 +7786,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Vec = Op.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
+
+ if (Op.getSimpleValueType() == MVT::i1)
+ return ExtractBitFromMaskVector(Op, DAG);
+
if (!isa<ConstantSDNode>(Idx)) {
if (VecVT.is512BitVector() ||
(VecVT.is256BitVector() && Subtarget->hasInt256() &&
@@ -7657,7 +7799,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
MaskEltVT.getSizeInBits());
-
+
Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
getZeroVector(MaskVT, Subtarget, DAG, dl),
@@ -8352,7 +8494,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
Chain.getValue(1));
}
- if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) {
+ if (Subtarget->isTargetKnownWindowsMSVC() ||
+ Subtarget->isTargetWindowsGNU()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
@@ -8367,7 +8510,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// If GV is an alias then use the aliasee for determining
// thread-localness.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GV = GA->resolveAliasedGlobal(false);
+ GV = GA->getAliasedGlobal();
SDLoc dl(GA);
SDValue Chain = DAG.getEntryNode();
@@ -8380,13 +8523,16 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
: Type::getInt32PtrTy(*DAG.getContext(),
257));
- SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) :
- (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) :
- DAG.getExternalSymbol("_tls_array", getPointerTy()));
+ SDValue TlsArray =
+ Subtarget->is64Bit()
+ ? DAG.getIntPtrConstant(0x58)
+ : (Subtarget->isTargetWindowsGNU()
+ ? DAG.getIntPtrConstant(0x2C)
+ : DAG.getExternalSymbol("_tls_array", getPointerTy()));
- SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
- MachinePointerInfo(Ptr),
- false, false, false, 0);
+ SDValue ThreadPointer =
+ DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
+ MachinePointerInfo(Ptr), false, false, false, 0);
// Load the _tls_index variable
SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
@@ -8422,9 +8568,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
-SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
+static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
@@ -8476,12 +8622,12 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
- EVT SrcVT = Op.getOperand(0).getValueType();
+ MVT SrcVT = Op.getOperand(0).getSimpleValueType();
if (SrcVT.isVector())
return SDValue();
- assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
+ assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
// These are really Legal; return the operand so the caller accepts it as
@@ -8685,15 +8831,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
- EVT SVT = N0.getValueType();
+ MVT SVT = N0.getSimpleValueType();
SDLoc dl(Op);
assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
"Custom UINT_TO_FP is not supported!");
- EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- SVT.getVectorNumElements());
+ MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
}
@@ -8712,8 +8857,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
- EVT SrcVT = N0.getValueType();
- EVT DstVT = Op.getValueType();
+ MVT SrcVT = N0.getSimpleValueType();
+ MVT DstVT = Op.getSimpleValueType();
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG);
if (SrcVT == MVT::i32 && X86ScalarSSEf64)
@@ -8908,7 +9053,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
return SDValue();
if (Subtarget->hasInt256())
- return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In);
+ return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
SDValue Undef = DAG.getUNDEF(InVT);
@@ -8927,9 +9072,9 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
SelectionDAG &DAG) {
- MVT VT = Op->getValueType(0).getSimpleVT();
+ MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
- MVT InVT = In.getValueType().getSimpleVT();
+ MVT InVT = In.getSimpleValueType();
SDLoc DL(Op);
unsigned int NumElts = VT.getVectorNumElements();
if (NumElts != 8 && NumElts != 16)
@@ -8990,9 +9135,21 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
- MVT VT = Op.getSimpleValueType();
+ MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
+
+ if (VT == MVT::i1) {
+ assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
+ "Invalid scalar TRUNCATE operation");
+ if (InVT == MVT::i32)
+ return SDValue();
+ if (InVT.getSizeInBits() == 64)
+ In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::i32, In);
+ else if (InVT.getSizeInBits() < 32)
+ In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
+ }
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
@@ -9008,6 +9165,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;
}
+
SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
SDValue CP = DAG.getConstantPool(C, getPointerTy());
@@ -9031,24 +9189,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(0));
}
- // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(2));
-
OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
-
- // The PSHUFD mask:
- static const int ShufMask1[] = {0, 2, 0, 0};
- SDValue Undef = DAG.getUNDEF(VT);
- OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1);
- OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1);
-
- // The MOVLHPS mask:
- static const int ShufMask2[] = {0, 1, 4, 5};
- return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
+ static const int ShufMask[] = {0, 2, 4, 6};
+ return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
}
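
// Reference for the v4i64 -> v4i32 truncate above: bitcast both halves to
// v4i32 and keep the even (low) 32-bit halves via the {0,2,4,6} shuffle.
// A minimal host-side sketch, assuming little-endian lane order as on x86.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint64_t In[4] = {0x1111222233334444ULL, 0x5555666677778888ULL,
                    0x9999aaaabbbbccccULL, 0xddddeeeeffff0000ULL};
  uint32_t Halves[8];
  std::memcpy(Halves, In, sizeof(In));          // bitcast v4i64 -> v8i32
  static const int ShufMask[] = {0, 2, 4, 6};   // low half of each element
  for (int i = 0; i != 4; ++i)
    assert(Halves[ShufMask[i]] == uint32_t(In[i]));  // equals the truncation
  return 0;
}
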
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
@@ -9115,8 +9263,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
unsigned NumElems = VT.getVectorNumElements();
- EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- NumElems * 2);
+ MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
SmallVector<int, 16> MaskVec(NumElems * 2, -1);
// Prepare truncation shuffle mask
@@ -9131,14 +9278,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
SelectionDAG &DAG) const {
- MVT VT = Op.getSimpleValueType();
- if (VT.isVector()) {
- if (VT == MVT::v8i16)
- return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT,
- DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op),
- MVT::v8i32, Op.getOperand(0)));
- return SDValue();
- }
+ assert(!Op.getSimpleValueType().isVector());
std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
/*IsSigned=*/ true, /*IsReplace=*/ false);
@@ -9186,7 +9326,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
In, DAG.getUNDEF(SVT)));
}
-SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) {
LLVMContext *Context = DAG.getContext();
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
@@ -9204,7 +9344,8 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
APInt(32, ~(1U << 31))));
C = ConstantVector::getSplat(NumElts, C);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
@@ -9220,7 +9361,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}
-SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) {
LLVMContext *Context = DAG.getContext();
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
@@ -9238,7 +9379,8 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
APInt(32, 1U << 31)));
C = ConstantVector::getSplat(NumElts, C);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
@@ -9255,7 +9397,8 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
}
-SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
LLVMContext *Context = DAG.getContext();
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -9291,7 +9434,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
}
Constant *C = ConstantVector::get(CV);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
@@ -9324,7 +9467,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
}
C = ConstantVector::get(CV);
- CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
@@ -9362,6 +9505,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, unsigned> VecInMap;
+ SmallVector<SDValue, 8> VecIns;
EVT VT = MVT::Other;
// Recognize a special case where a vector is casted into wide integer to
@@ -9401,6 +9545,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
VT != VecInMap.begin()->first.getValueType())
return SDValue();
M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+ VecIns.push_back(ExtractedFromVec);
}
M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
}
@@ -9409,14 +9554,12 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
"Not extracted from 128-/256-bit vector.");
unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
- SmallVector<SDValue, 8> VecIns;
for (DenseMap<SDValue, unsigned>::const_iterator
I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
// Quit if not all elements are used.
if (I->second != FullMask)
return SDValue();
- VecIns.push_back(I->first);
}
EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
@@ -9444,6 +9587,11 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
SelectionDAG &DAG) const {
SDLoc dl(Op);
+ if (Op.getValueType() == MVT::i1)
+ // KORTEST instruction should be selected
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, Op.getValueType()));
+
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
@@ -9460,15 +9608,17 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
NeedOF = true;
break;
}
-
// See if we can use the EFLAGS value from the operand instead of
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
- if (Op.getResNo() != 0 || NeedOF || NeedCF)
+ if (Op.getResNo() != 0 || NeedOF || NeedCF) {
// Emit a CMP with 0, which is the TEST pattern.
+ //if (Op.getValueType() == MVT::i1)
+ // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
+ // DAG.getConstant(0, MVT::i1));
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, Op.getValueType()));
-
+ }
unsigned Opcode = 0;
unsigned NumOperands = 0;
@@ -9653,13 +9803,30 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
SelectionDAG &DAG) const {
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
+ SDLoc dl(Op0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
if (C->getAPIntValue() == 0)
return EmitTest(Op0, X86CC, DAG);
- SDLoc dl(Op0);
+ if (Op0.getValueType() == MVT::i1)
+ llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
+ }
+
if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+ // Do the comparison at i32 if it's smaller, except on Atom.
+ // This avoids subregister aliasing issues. Keep the smaller reference
+ // if we're optimizing for size, however, as that'll allow better folding
+ // of memory operations.
+ if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
+ !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::MinSize) &&
+ !Subtarget->isAtom()) {
+ unsigned ExtendOp =
+ isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
+ Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
+ }
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
@@ -9844,38 +10011,75 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
-static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
Op.getValueType().getScalarType() == MVT::i1 &&
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
- SDLoc dl(Op);
-
+ unsigned Opc = 0;
bool Unsigned = false;
+ bool Swap = false;
unsigned SSECC;
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETNE: SSECC = 4; break;
- case ISD::SETEQ: SSECC = 0; break;
- case ISD::SETUGT: Unsigned = true;
- case ISD::SETGT: SSECC = 6; break; // NLE
- case ISD::SETULT: Unsigned = true;
- case ISD::SETLT: SSECC = 1; break;
- case ISD::SETUGE: Unsigned = true;
- case ISD::SETGE: SSECC = 5; break; // NLT
- case ISD::SETULE: Unsigned = true;
+ case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
+ case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
+ case ISD::SETLT: Swap = true; //fall-through
+ case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
+ case ISD::SETULT: SSECC = 1; Unsigned = true; break;
+ case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
+ case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
+ case ISD::SETULE: Unsigned = true; //fall-through
case ISD::SETLE: SSECC = 2; break;
}
- unsigned Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
+
+ if (Swap)
+ std::swap(Op0, Op1);
+ if (Opc)
+ return DAG.getNode(Opc, dl, VT, Op0, Op1);
+ Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(SSECC, MVT::i8));
+}
+
+/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
+/// operand \p Op1. If non-trivial (for example because it's not constant)
+/// return an empty value.
+static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG)
+{
+ BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
+ if (!BV)
+ return SDValue();
+
+ MVT VT = Op1.getSimpleValueType();
+ MVT EVT = VT.getVectorElementType();
+ unsigned n = VT.getVectorNumElements();
+ SmallVector<SDValue, 8> ULTOp1;
+
+ for (unsigned i = 0; i < n; ++i) {
+ ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
+ if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
+ return SDValue();
+
+ // Avoid underflow.
+ APInt Val = Elt->getAPIntValue();
+ if (Val == 0)
+ return SDValue();
+ ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op1), VT, ULTOp1.data(),
+ ULTOp1.size());
}
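
// Sanity check for the rewrite above: for unsigned lanes and a non-zero
// constant C, x u< C is equivalent to x u<= C-1, which is what lets the
// constant operand stay loop-invariant with PSUBUS. Minimal 8-bit sketch.
#include <cassert>

int main() {
  for (unsigned C = 1; C != 256; ++C)
    for (unsigned x = 0; x != 256; ++x)
      assert((x < C) == (x <= C - 1));
  return 0;
}
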
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
@@ -9931,7 +10135,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
if (Subtarget->hasAVX512()) {
if (Op1.getValueType().is512BitVector() ||
(MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
- return LowerIntVSETCC_AVX512(Op, DAG);
+ return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
// In AVX-512 architecture setcc returns mask with i1 elements,
// But there is no compare instruction for i8 and i16 elements.
@@ -9949,40 +10153,75 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// operations may be required for some comparisons.
unsigned Opc;
bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
-
+ bool Subus = false;
+
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETNE: Invert = true;
- case ISD::SETEQ: Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break;
+ case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
case ISD::SETLT: Swap = true;
- case ISD::SETGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; break;
+ case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
case ISD::SETGE: Swap = true;
- case ISD::SETLE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+ case ISD::SETLE: Opc = X86ISD::PCMPGT;
Invert = true; break;
case ISD::SETULT: Swap = true;
- case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+ case ISD::SETUGT: Opc = X86ISD::PCMPGT;
FlipSigns = true; break;
case ISD::SETUGE: Swap = true;
- case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+ case ISD::SETULE: Opc = X86ISD::PCMPGT;
FlipSigns = true; Invert = true; break;
}
-
+
// Special case: Use min/max operations for SETULE/SETUGE
MVT VET = VT.getVectorElementType();
bool hasMinMax =
(Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
|| (Subtarget->hasSSE2() && (VET == MVT::i8));
-
+
if (hasMinMax) {
switch (SetCCOpcode) {
default: break;
case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
}
-
+
if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
}
-
+
+ bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+ if (!MinMax && hasSubus) {
+ // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
+ // Op0 u<= Op1:
+ // t = psubus Op0, Op1
+ // pcmpeq t, <0..0>
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETULT: {
+ // If the comparison is against a constant we can turn this into a
+ // setule. With psubus, setule does not require a swap. This is
+ // beneficial because the constant in the register is no longer
+ // clobbered as the destination, so it can be hoisted out of a loop.
+ // Only do this pre-AVX since vpcmp* is no longer destructive.
+ if (Subtarget->hasAVX())
+ break;
+ SDValue ULEOp1 = ChangeVSETULTtoVSETULE(Op1, DAG);
+ if (ULEOp1.getNode()) {
+ Op1 = ULEOp1;
+ Subus = true; Invert = false; Swap = false;
+ }
+ break;
+ }
+ // Psubus is better than flip-sign because it requires no inversion.
+ case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
+ case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
+ }
+
+ if (Subus) {
+ Opc = X86ISD::SUBUS;
+ FlipSigns = false;
+ }
+ }
+
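
// Scalar reference for the PSUBUS trick above: for unsigned values,
// saturating (a - b) == 0 exactly when a u<= b, so "psubus; pcmpeq 0"
// implements SETULE per element. Minimal 8-bit sketch.
#include <cassert>
#include <cstdint>

static uint8_t subus(uint8_t A, uint8_t B) {  // unsigned saturating subtract
  return A > B ? uint8_t(A - B) : uint8_t(0);
}

int main() {
  for (unsigned A = 0; A != 256; ++A)
    for (unsigned B = 0; B != 256; ++B)
      assert((subus(uint8_t(A), uint8_t(B)) == 0) == (A <= B));
  return 0;
}
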
if (Swap)
std::swap(Op0, Op1);
@@ -10069,10 +10308,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
-
+
if (MinMax)
Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
+ if (Subus)
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
+ getZeroVector(VT, Subtarget, DAG, dl));
+
return Result;
}
@@ -10082,7 +10325,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
- assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
+ assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
+ && "SetCC type must be 8-bit or 1-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDLoc dl(Op);
@@ -10114,13 +10358,25 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
bool Invert = (CC == ISD::SETNE) ^
cast<ConstantSDNode>(Op1)->isNullValue();
- if (!Invert) return Op0;
+ if (!Invert)
+ return Op0;
CCode = X86::GetOppositeBranchCondition(CCode);
- return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(CCode, MVT::i8),
+ Op0.getOperand(1));
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+ return SetCC;
}
}
+ if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
+ (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+ ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
+ return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
+ }
bool isFP = Op1.getSimpleValueType().isFloatingPoint();
unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
@@ -10129,8 +10385,11 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
- return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86CC, MVT::i8), EFLAGS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8), EFLAGS);
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+ return SetCC;
}
// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
@@ -10195,8 +10454,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
if (SSECC != 8) {
- unsigned Opcode = VT == MVT::f32 ? X86ISD::FSETCCss : X86ISD::FSETCCsd;
- SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1,
+ if (Subtarget->hasAVX512()) {
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
+ DAG.getConstant(SSECC, MVT::i8));
+ return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
+ }
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
DAG.getConstant(SSECC, MVT::i8));
SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
@@ -10433,7 +10696,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
return SDValue();
if (Subtarget->hasInt256())
- return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In);
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
// Optimize vectors in AVX mode
// Sign extend v8i16 to v8i32 and
@@ -10462,8 +10725,8 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
VT.getVectorNumElements()/2);
- OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
- OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
+ OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
+ OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
@@ -10576,11 +10839,26 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
unsigned X86Opcode;
unsigned X86Cond;
SDVTList VTs;
+ // Keep this in sync with LowerXALUO, otherwise we might create redundant
+ // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
+ // X86ISD::INC).
switch (CondOpcode) {
case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
- case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+ case ISD::SADDO:
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+ if (C->isOne()) {
+ X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
+ break;
+ }
+ X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
- case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+ case ISD::SSUBO:
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+ if (C->isOne()) {
+ X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
+ break;
+ }
+ X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
default: llvm_unreachable("unexpected overflowing operator");
@@ -10764,11 +11042,11 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
- assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
+ assert((Subtarget->isOSWindows() ||
getTargetMachine().Options.EnableSegmentedStacks) &&
"This should be used only on Windows targets or when segmented stacks "
"are being used");
- assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
+ assert(!Subtarget->isTargetMacho() && "Not implemented");
SDLoc dl(Op);
// Get the inputs.
@@ -10979,14 +11257,15 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
// getTargetVShiftByConstNode - Handle vector element shifts where the shift
// amount is a constant. Takes immediate version of shift as input.
-static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT,
+static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
SDValue SrcOp, uint64_t ShiftAmt,
SelectionDAG &DAG) {
+ MVT ElementType = VT.getVectorElementType();
// Check for ShiftAmt >= element width
- if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) {
+ if (ShiftAmt >= ElementType.getSizeInBits()) {
if (Opc == X86ISD::VSRAI)
- ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1;
+ ShiftAmt = ElementType.getSizeInBits() - 1;
else
return DAG.getConstant(0, VT);
}
@@ -10994,12 +11273,63 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT,
assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
&& "Unknown target vector shift-by-constant node");
+ // Fold this packed vector shift into a build vector if SrcOp is a
+ // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
+ if (VT == SrcOp.getSimpleValueType() &&
+ ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
+ SmallVector<SDValue, 8> Elts;
+ unsigned NumElts = SrcOp->getNumOperands();
+ ConstantSDNode *ND;
+
+ switch(Opc) {
+ default: llvm_unreachable(0);
+ case X86ISD::VSHLI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ SDValue CurrentOp = SrcOp->getOperand(i);
+ if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(CurrentOp);
+ continue;
+ }
+ ND = cast<ConstantSDNode>(CurrentOp);
+ const APInt &C = ND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
+ }
+ break;
+ case X86ISD::VSRLI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ SDValue CurrentOp = SrcOp->getOperand(i);
+ if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(CurrentOp);
+ continue;
+ }
+ ND = cast<ConstantSDNode>(CurrentOp);
+ const APInt &C = ND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
+ }
+ break;
+ case X86ISD::VSRAI:
+ for (unsigned i=0; i!=NumElts; ++i) {
+ SDValue CurrentOp = SrcOp->getOperand(i);
+ if (CurrentOp->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(CurrentOp);
+ continue;
+ }
+ ND = cast<ConstantSDNode>(CurrentOp);
+ const APInt &C = ND->getAPIntValue();
+ Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
+ }
+ break;
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElts);
+ }
+
return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
}
// getTargetVShiftNode - Handle vector element shifts where the shift amount
// may or may not be a constant. Takes immediate version of shift as input.
-static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
+static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
SelectionDAG &DAG) {
assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
@@ -11027,7 +11357,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
// The return type has to be a 128-bit type with the same element
// type as the input type.
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ MVT EltVT = VT.getVectorElementType();
EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
@@ -11210,32 +11540,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_avx2_pmaxu_b:
case Intrinsic::x86_avx2_pmaxu_w:
case Intrinsic::x86_avx2_pmaxu_d:
- case Intrinsic::x86_avx512_pmaxu_d:
- case Intrinsic::x86_avx512_pmaxu_q:
case Intrinsic::x86_sse2_pminu_b:
case Intrinsic::x86_sse41_pminuw:
case Intrinsic::x86_sse41_pminud:
case Intrinsic::x86_avx2_pminu_b:
case Intrinsic::x86_avx2_pminu_w:
case Intrinsic::x86_avx2_pminu_d:
- case Intrinsic::x86_avx512_pminu_d:
- case Intrinsic::x86_avx512_pminu_q:
case Intrinsic::x86_sse41_pmaxsb:
case Intrinsic::x86_sse2_pmaxs_w:
case Intrinsic::x86_sse41_pmaxsd:
case Intrinsic::x86_avx2_pmaxs_b:
case Intrinsic::x86_avx2_pmaxs_w:
case Intrinsic::x86_avx2_pmaxs_d:
- case Intrinsic::x86_avx512_pmaxs_d:
- case Intrinsic::x86_avx512_pmaxs_q:
case Intrinsic::x86_sse41_pminsb:
case Intrinsic::x86_sse2_pmins_w:
case Intrinsic::x86_sse41_pminsd:
case Intrinsic::x86_avx2_pmins_b:
case Intrinsic::x86_avx2_pmins_w:
- case Intrinsic::x86_avx2_pmins_d:
- case Intrinsic::x86_avx512_pmins_d:
- case Intrinsic::x86_avx512_pmins_q: {
+ case Intrinsic::x86_avx2_pmins_d: {
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
@@ -11245,8 +11567,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_avx2_pmaxu_b:
case Intrinsic::x86_avx2_pmaxu_w:
case Intrinsic::x86_avx2_pmaxu_d:
- case Intrinsic::x86_avx512_pmaxu_d:
- case Intrinsic::x86_avx512_pmaxu_q:
Opcode = X86ISD::UMAX;
break;
case Intrinsic::x86_sse2_pminu_b:
@@ -11255,8 +11575,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_avx2_pminu_b:
case Intrinsic::x86_avx2_pminu_w:
case Intrinsic::x86_avx2_pminu_d:
- case Intrinsic::x86_avx512_pminu_d:
- case Intrinsic::x86_avx512_pminu_q:
Opcode = X86ISD::UMIN;
break;
case Intrinsic::x86_sse41_pmaxsb:
@@ -11265,8 +11583,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_avx2_pmaxs_b:
case Intrinsic::x86_avx2_pmaxs_w:
case Intrinsic::x86_avx2_pmaxs_d:
- case Intrinsic::x86_avx512_pmaxs_d:
- case Intrinsic::x86_avx512_pmaxs_q:
Opcode = X86ISD::SMAX;
break;
case Intrinsic::x86_sse41_pminsb:
@@ -11275,8 +11591,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_avx2_pmins_b:
case Intrinsic::x86_avx2_pmins_w:
case Intrinsic::x86_avx2_pmins_d:
- case Intrinsic::x86_avx512_pmins_d:
- case Intrinsic::x86_avx512_pmins_q:
Opcode = X86ISD::SMIN;
break;
}
@@ -11289,14 +11603,10 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_sse2_max_pd:
case Intrinsic::x86_avx_max_ps_256:
case Intrinsic::x86_avx_max_pd_256:
- case Intrinsic::x86_avx512_max_ps_512:
- case Intrinsic::x86_avx512_max_pd_512:
case Intrinsic::x86_sse_min_ps:
case Intrinsic::x86_sse2_min_pd:
case Intrinsic::x86_avx_min_ps_256:
- case Intrinsic::x86_avx_min_pd_256:
- case Intrinsic::x86_avx512_min_ps_512:
- case Intrinsic::x86_avx512_min_pd_512: {
+ case Intrinsic::x86_avx_min_pd_256: {
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
@@ -11304,16 +11614,12 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_sse2_max_pd:
case Intrinsic::x86_avx_max_ps_256:
case Intrinsic::x86_avx_max_pd_256:
- case Intrinsic::x86_avx512_max_ps_512:
- case Intrinsic::x86_avx512_max_pd_512:
Opcode = X86ISD::FMAX;
break;
case Intrinsic::x86_sse_min_ps:
case Intrinsic::x86_sse2_min_pd:
case Intrinsic::x86_avx_min_ps_256:
case Intrinsic::x86_avx_min_pd_256:
- case Intrinsic::x86_avx512_min_ps_512:
- case Intrinsic::x86_avx512_min_pd_512:
Opcode = X86ISD::FMIN;
break;
}
@@ -11459,14 +11765,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
- case Intrinsic::x86_avx512_kortestz:
- case Intrinsic::x86_avx512_kortestc: {
- unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? X86::COND_E: X86::COND_B;
+ case Intrinsic::x86_avx512_kortestz_w:
+ case Intrinsic::x86_avx512_kortestc_w: {
+ unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
SDValue CC = DAG.getConstant(X86CC, MVT::i8);
SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
@@ -11560,7 +11866,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
Opcode = X86ISD::VSRAI;
break;
}
- return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
+ return getTargetVShiftNode(Opcode, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), DAG);
}
@@ -11675,7 +11981,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_fma_vfmaddsub_ps_512:
case Intrinsic::x86_fma_vfmaddsub_pd_512:
case Intrinsic::x86_fma_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_vfmsubadd_pd_512: {
+ case Intrinsic::x86_fma_vfmsubadd_pd_512: {
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
@@ -11743,9 +12049,9 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
- SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
- EVT MaskVT = MVT::getVectorVT(MVT::i1,
- Index.getValueType().getVectorNumElements());
+ SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
+ EVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
@@ -11765,13 +12071,13 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
assert(C && "Invalid scale type");
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
- Index.getValueType().getVectorNumElements());
+ Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
if (Src.getOpcode() == ISD::UNDEF)
- Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
+ Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
@@ -11788,7 +12094,7 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
- Index.getValueType().getVectorNumElements());
+ Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
@@ -11806,7 +12112,7 @@ static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
- Index.getValueType().getVectorNumElements());
+ Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
@@ -11861,15 +12167,15 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
case Intrinsic::x86_avx512_gather_dpi_512: {
unsigned Opc;
switch (IntNo) {
- default: llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
- case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
- case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
- case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
- case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
- case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
+ case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
+ case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
+ case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
+ case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
+ case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
+ case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
+ case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
}
SDValue Chain = Op.getOperand(0);
SDValue Index = Op.getOperand(2);
@@ -11888,23 +12194,23 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
case Intrinsic::x86_avx512_gather_dpq_mask_512: {
unsigned Opc;
switch (IntNo) {
- default: llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::x86_avx512_gather_qps_mask_512:
- Opc = X86::VGATHERQPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpd_mask_512:
- Opc = X86::VGATHERQPDZrm; break;
- case Intrinsic::x86_avx512_gather_dpd_mask_512:
- Opc = X86::VGATHERDPDZrm; break;
- case Intrinsic::x86_avx512_gather_dps_mask_512:
- Opc = X86::VGATHERDPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpi_mask_512:
- Opc = X86::VPGATHERQDZrm; break;
- case Intrinsic::x86_avx512_gather_qpq_mask_512:
- Opc = X86::VPGATHERQQZrm; break;
- case Intrinsic::x86_avx512_gather_dpi_mask_512:
- Opc = X86::VPGATHERDDZrm; break;
- case Intrinsic::x86_avx512_gather_dpq_mask_512:
- Opc = X86::VPGATHERDQZrm; break;
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_avx512_gather_qps_mask_512:
+ Opc = X86::VGATHERQPSZrm; break;
+ case Intrinsic::x86_avx512_gather_qpd_mask_512:
+ Opc = X86::VGATHERQPDZrm; break;
+ case Intrinsic::x86_avx512_gather_dpd_mask_512:
+ Opc = X86::VGATHERDPDZrm; break;
+ case Intrinsic::x86_avx512_gather_dps_mask_512:
+ Opc = X86::VGATHERDPSZrm; break;
+ case Intrinsic::x86_avx512_gather_qpi_mask_512:
+ Opc = X86::VPGATHERQDZrm; break;
+ case Intrinsic::x86_avx512_gather_qpq_mask_512:
+ Opc = X86::VPGATHERQQZrm; break;
+ case Intrinsic::x86_avx512_gather_dpi_mask_512:
+ Opc = X86::VPGATHERDDZrm; break;
+ case Intrinsic::x86_avx512_gather_dpq_mask_512:
+ Opc = X86::VPGATHERDQZrm; break;
}
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
@@ -11926,23 +12232,23 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
case Intrinsic::x86_avx512_scatter_dpi_512: {
unsigned Opc;
switch (IntNo) {
- default: llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::x86_avx512_scatter_qpd_512:
- Opc = X86::VSCATTERQPDZmr; break;
- case Intrinsic::x86_avx512_scatter_qps_512:
- Opc = X86::VSCATTERQPSZmr; break;
- case Intrinsic::x86_avx512_scatter_dpd_512:
- Opc = X86::VSCATTERDPDZmr; break;
- case Intrinsic::x86_avx512_scatter_dps_512:
- Opc = X86::VSCATTERDPSZmr; break;
- case Intrinsic::x86_avx512_scatter_qpi_512:
- Opc = X86::VPSCATTERQDZmr; break;
- case Intrinsic::x86_avx512_scatter_qpq_512:
- Opc = X86::VPSCATTERQQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpq_512:
- Opc = X86::VPSCATTERDQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpi_512:
- Opc = X86::VPSCATTERDDZmr; break;
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_avx512_scatter_qpd_512:
+ Opc = X86::VSCATTERQPDZmr; break;
+ case Intrinsic::x86_avx512_scatter_qps_512:
+ Opc = X86::VSCATTERQPSZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpd_512:
+ Opc = X86::VSCATTERDPDZmr; break;
+ case Intrinsic::x86_avx512_scatter_dps_512:
+ Opc = X86::VSCATTERDPSZmr; break;
+ case Intrinsic::x86_avx512_scatter_qpi_512:
+ Opc = X86::VPSCATTERQDZmr; break;
+ case Intrinsic::x86_avx512_scatter_qpq_512:
+ Opc = X86::VPSCATTERQQZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpq_512:
+ Opc = X86::VPSCATTERDQZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpi_512:
+ Opc = X86::VPSCATTERDDZmr; break;
}
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
@@ -11962,23 +12268,23 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
unsigned Opc;
switch (IntNo) {
- default: llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::x86_avx512_scatter_qpd_mask_512:
- Opc = X86::VSCATTERQPDZmr; break;
- case Intrinsic::x86_avx512_scatter_qps_mask_512:
- Opc = X86::VSCATTERQPSZmr; break;
- case Intrinsic::x86_avx512_scatter_dpd_mask_512:
- Opc = X86::VSCATTERDPDZmr; break;
- case Intrinsic::x86_avx512_scatter_dps_mask_512:
- Opc = X86::VSCATTERDPSZmr; break;
- case Intrinsic::x86_avx512_scatter_qpi_mask_512:
- Opc = X86::VPSCATTERQDZmr; break;
- case Intrinsic::x86_avx512_scatter_qpq_mask_512:
- Opc = X86::VPSCATTERQQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpq_mask_512:
- Opc = X86::VPSCATTERDQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpi_mask_512:
- Opc = X86::VPSCATTERDDZmr; break;
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_avx512_scatter_qpd_mask_512:
+ Opc = X86::VSCATTERQPDZmr; break;
+ case Intrinsic::x86_avx512_scatter_qps_mask_512:
+ Opc = X86::VSCATTERQPSZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpd_mask_512:
+ Opc = X86::VSCATTERDPDZmr; break;
+ case Intrinsic::x86_avx512_scatter_dps_mask_512:
+ Opc = X86::VSCATTERDPSZmr; break;
+ case Intrinsic::x86_avx512_scatter_qpi_mask_512:
+ Opc = X86::VPSCATTERQDZmr; break;
+ case Intrinsic::x86_avx512_scatter_qpq_mask_512:
+ Opc = X86::VPSCATTERQQZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpq_mask_512:
+ Opc = X86::VPSCATTERDQZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpi_mask_512:
+ Opc = X86::VPSCATTERDDZmr; break;
}
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
@@ -12007,6 +12313,9 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
MFI->setReturnAddressIsTaken(true);
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
EVT PtrVT = getPointerTy();
@@ -12277,7 +12586,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
const TargetMachine &TM = MF.getTarget();
const TargetFrameLowering &TFI = *TM.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
@@ -12322,7 +12631,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
}
static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
@@ -12356,7 +12665,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
@@ -12381,7 +12690,7 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
Op = Op.getOperand(0);
@@ -12403,7 +12712,7 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
// ones, and then concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
assert(VT.is256BitVector() && VT.isInteger() &&
"Unsupported value type for operation");
@@ -12421,8 +12730,8 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
@@ -12430,15 +12739,15 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getValueType().is256BitVector() &&
- Op.getValueType().isInteger() &&
+ assert(Op.getSimpleValueType().is256BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
}
static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getValueType().is256BitVector() &&
- Op.getValueType().isInteger() &&
+ assert(Op.getSimpleValueType().is256BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
}
@@ -12446,7 +12755,7 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget->hasInt256())
@@ -12516,8 +12825,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
}
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
- EVT EltTy = VT.getVectorElementType();
+ MVT VT = Op.getSimpleValueType();
+ MVT EltTy = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
@@ -12572,7 +12881,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
@@ -12604,7 +12913,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
// Make a large shift.
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
MVT::v8i16, R, ShiftAmt,
- DAG);
+ DAG);
SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
// Zero out the rightmost bits.
SmallVector<SDValue, 16> V(16,
@@ -12699,7 +13008,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
- unsigned Ratio = Amt.getValueType().getVectorNumElements() /
+ unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
VT.getVectorNumElements();
unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
uint64_t ShiftAmt = 0;
@@ -12744,7 +13053,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget* Subtarget) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
@@ -12814,7 +13123,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
default:
llvm_unreachable("Unknown shift opcode!");
case ISD::SHL:
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return SDValue();
case MVT::v2i64:
case MVT::v4i32:
@@ -12827,7 +13136,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
}
case ISD::SRA:
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return SDValue();
case MVT::v4i32:
case MVT::v8i16:
@@ -12838,7 +13147,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
}
case ISD::SRL:
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return SDValue();
case MVT::v2i64:
case MVT::v4i32:
@@ -12861,7 +13170,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
- unsigned Ratio = Amt.getValueType().getVectorNumElements() /
+ unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
VT.getVectorNumElements();
std::vector<SDValue> Vals(Ratio);
for (unsigned i = 0; i != Ratio; ++i)
@@ -12889,7 +13198,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
@@ -12922,6 +13231,39 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
return Op;
}
+ // If possible, lower this packed shift into a vector multiply instead of
+ // expanding it into a sequence of scalar shifts.
+ // Do this only if the vector shift count is a constant build_vector.
+ if (Op.getOpcode() == ISD::SHL &&
+ (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
+ ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+ SmallVector<SDValue, 8> Elts;
+ EVT SVT = VT.getScalarType();
+ unsigned SVTBits = SVT.getSizeInBits();
+ const APInt &One = APInt(SVTBits, 1);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ for (unsigned i=0; i !=NumElems; ++i) {
+ SDValue Op = Amt->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF) {
+ Elts.push_back(Op);
+ continue;
+ }
+
+ ConstantSDNode *ND = cast<ConstantSDNode>(Op);
+ const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
+ uint64_t ShAmt = C.getZExtValue();
+ if (ShAmt >= SVTBits) {
+ Elts.push_back(DAG.getUNDEF(SVT));
+ continue;
+ }
+ Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
+ }
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElems);
+ return DAG.getNode(ISD::MUL, dl, VT, R, BV);
+ }
+
// Lower SHL with variable shift amount.
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
@@ -12931,6 +13273,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
}
+
if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
@@ -12974,10 +13317,23 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
return R;
}
+ // It's worth extending once and using the v8i32 shifts for 16-bit types, but
+ // the extra overheads to get from v16i8 to v8i32 make the existing SSE
+ // solution better.
+ if (Subtarget->hasInt256() && VT == MVT::v8i16) {
+ MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
+ unsigned ExtOpc =
+ Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ R = DAG.getNode(ExtOpc, dl, NewVT, R);
+ Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
+ }
+
// Decompose 256-bit shifts into smaller 128-bit shifts.
if (VT.is256BitVector()) {
unsigned NumElems = VT.getVectorNumElements();
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ MVT EltVT = VT.getVectorElementType();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
// Extract the two vectors
@@ -13095,7 +13451,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
if (!Subtarget->hasSSE2() || !VT.isVector())
return SDValue();
@@ -13103,7 +13459,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
ExtraVT.getScalarType().getSizeInBits();
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return SDValue();
case MVT::v8i32:
case MVT::v16i16:
@@ -13118,7 +13474,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ MVT EltVT = VT.getVectorElementType();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
EVT ExtraEltVT = ExtraVT.getVectorElementType();
@@ -13205,11 +13561,11 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- EVT T = Op.getValueType();
+ MVT T = Op.getSimpleValueType();
SDLoc DL(Op);
unsigned Reg = 0;
unsigned size = 0;
- switch(T.getSimpleVT().SimpleTy) {
+ switch(T.SimpleTy) {
default: llvm_unreachable("Invalid value type!");
case MVT::i8: Reg = X86::AL; size = 1; break;
case MVT::i16: Reg = X86::AX; size = 2; break;
@@ -13317,7 +13673,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getNode()->getValueType(0);
+ EVT VT = Op.getNode()->getSimpleValueType(0);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
@@ -13497,6 +13853,7 @@ static void ReplaceATOMIC_LOAD(SDNode *Node,
Node->getOperand(1), Zero, Zero,
cast<AtomicSDNode>(Node)->getMemOperand(),
cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
cast<AtomicSDNode>(Node)->getSynchScope());
Results.push_back(Swap.getValue(0));
Results.push_back(Swap.getValue(1));
@@ -13735,8 +14092,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CMPMU: return "X86ISD::CMPMU";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
- case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd";
- case X86ISD::FSETCCss: return "X86ISD::FSETCCss";
+ case X86ISD::FSETCC: return "X86ISD::FSETCC";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
@@ -13788,7 +14144,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
- case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
case X86ISD::VZEXT: return "X86ISD::VZEXT";
case X86ISD::VSEXT: return "X86ISD::VSEXT";
@@ -13821,17 +14176,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
- case X86ISD::BLSI: return "X86ISD::BLSI";
- case X86ISD::BLSMSK: return "X86ISD::BLSMSK";
- case X86ISD::BLSR: return "X86ISD::BLSR";
case X86ISD::BZHI: return "X86ISD::BZHI";
case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
case X86ISD::TESTM: return "X86ISD::TESTM";
+ case X86ISD::TESTNM: return "X86ISD::TESTNM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
- case X86ISD::KTEST: return "X86ISD::KTEST";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
@@ -13855,6 +14207,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
+ case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
@@ -13932,6 +14285,24 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
return true;
}
+bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
+ unsigned Bits = Ty->getScalarSizeInBits();
+
+ // 8-bit shifts are always expensive, but versions with a scalar amount aren't
+ // particularly cheaper than those without.
+ if (Bits == 8)
+ return false;
+
+ // On AVX2 there are new vpsllv[dq] instructions (and other shifts) that make
+ // variable shifts just as cheap as scalar ones.
+ if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
+ return false;
+
+ // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
+ // fully general vector.
+ return true;
+}
+
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
@@ -14115,7 +14486,7 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
@@ -14349,7 +14720,7 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
@@ -14635,7 +15006,7 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
@@ -15031,8 +15402,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
// Transfer the remainder of MBB and its successor edges to endMBB.
endMBB->splice(endMBB->begin(), thisMBB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- thisMBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
// Make offsetMBB and overflowMBB successors of thisMBB
@@ -15202,8 +15572,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
// Transfer the remainder of MBB and its successor edges to EndMBB.
EndMBB->splice(EndMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
// The original block will now fall through to the XMM save block.
@@ -15226,9 +15595,15 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MBB->addSuccessor(EndMBB);
}
+ // Make sure the last operand is EFLAGS, which gets clobbered by the branch
+ // that was just emitted, but clearly shouldn't be "saved".
+ assert((MI->getNumOperands() <= 3 ||
+ !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
+ MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
+ && "Expected last argument to be EFLAGS");
unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
- for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
+ for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
MachineMemOperand *MMO =
F->getMachineMemOperand(
@@ -15259,7 +15634,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
MachineBasicBlock* BB,
const TargetRegisterInfo* TRI) {
// Scan forward through BB for a use/def of EFLAGS.
- MachineBasicBlock::iterator miI(llvm::next(SelectItr));
+ MachineBasicBlock::iterator miI(std::next(SelectItr));
for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
const MachineInstr& mi = *miI;
if (mi.readsRegister(X86::EFLAGS))
@@ -15324,8 +15699,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add the true and fallthrough blocks as its successors.
@@ -15405,8 +15779,8 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
MF->insert(MBBIter, mallocMBB);
MF->insert(MBBIter, continueMBB);
- continueMBB->splice(continueMBB->begin(), BB, llvm::next
- (MachineBasicBlock::iterator(MI)), BB->end());
+ continueMBB->splice(continueMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
continueMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add code to the main basic block to check if the stack limit has been hit,
@@ -15481,7 +15855,7 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
- assert(!Subtarget->isTargetEnvMacho());
+ assert(!Subtarget->isTargetMacho());
// The lowering is pretty easy: we're just emitting the call to _alloca. The
// non-trivial part is impdef of ESP.
@@ -15512,7 +15886,7 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
}
} else {
const char *StackProbeSymbol =
- Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
+ Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca";
BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
.addExternalSymbol(StackProbeSymbol)
@@ -15647,7 +16021,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
@@ -15788,6 +16162,89 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
return MBB;
}
+// Replace 213-type (isel default) FMA3 instructions with 231-type for
+// accumulator loops. Writing back to the accumulator allows the coalescer
+// to remove extra copies in the loop.
+MachineBasicBlock *
+X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ MachineOperand &AddendOp = MI->getOperand(3);
+
+ // Bail out early if the addend isn't a register - we can't switch these.
+ if (!AddendOp.isReg())
+ return MBB;
+
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Check whether the addend is defined by a PHI:
+ assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
+ MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
+ if (!AddendDef.isPHI())
+ return MBB;
+
+ // Look for the following pattern:
+ // loop:
+ // %addend = phi [%entry, 0], [%loop, %result]
+ // ...
+ // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
+
+ // Replace with:
+ // loop:
+ // %addend = phi [%entry, 0], [%loop, %result]
+ // ...
+ // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
+
+ for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
+ assert(AddendDef.getOperand(i).isReg());
+ MachineOperand PHISrcOp = AddendDef.getOperand(i);
+ MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
+ if (&PHISrcInst == MI) {
+ // Found a matching instruction.
+ unsigned NewFMAOpc = 0;
+ switch (MI->getOpcode()) {
+ case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
+ case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
+ case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
+ case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
+ case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
+ case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
+ case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
+ case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
+ case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
+ case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
+ case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
+ case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
+ case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
+ case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
+ case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
+ case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
+ case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
+ case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
+ case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
+ case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
+ case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
+ case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
+ case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
+ case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
+ default: llvm_unreachable("Unrecognized FMA variant.");
+ }
+
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(3))
+ .addOperand(MI->getOperand(2))
+ .addOperand(MI->getOperand(1));
+ MBB->insert(MachineBasicBlock::iterator(MI), MIB);
+ MI->eraseFromParent();
+ }
+ }
+
+ return MBB;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
@@ -16015,6 +16472,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::EH_SjLj_LongJmp32:
case X86::EH_SjLj_LongJmp64:
return emitEHSjLjLongJmp(MI, BB);
+
+ case TargetOpcode::STACKMAP:
+ case TargetOpcode::PATCHPOINT:
+ return emitPatchPoint(MI, BB);
+
+ case X86::VFMADDPDr213r:
+ case X86::VFMADDPSr213r:
+ case X86::VFMADDSDr213r:
+ case X86::VFMADDSSr213r:
+ case X86::VFMSUBPDr213r:
+ case X86::VFMSUBPSr213r:
+ case X86::VFMSUBSDr213r:
+ case X86::VFMSUBSSr213r:
+ case X86::VFNMADDPDr213r:
+ case X86::VFNMADDPSr213r:
+ case X86::VFNMADDSDr213r:
+ case X86::VFNMADDSSr213r:
+ case X86::VFNMSUBPDr213r:
+ case X86::VFNMSUBPSr213r:
+ case X86::VFNMSUBSDr213r:
+ case X86::VFNMSUBSSr213r:
+ case X86::VFMADDPDr213rY:
+ case X86::VFMADDPSr213rY:
+ case X86::VFMSUBPDr213rY:
+ case X86::VFMSUBPSr213rY:
+ case X86::VFNMADDPDr213rY:
+ case X86::VFNMADDPSr213rY:
+ case X86::VFNMSUBPDr213rY:
+ case X86::VFNMSUBPSr213rY:
+ return emitFMA3Instr(MI, BB);
}
}
@@ -16377,44 +16864,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EltNo);
}
-/// Extract one bit from mask vector, like v16i1 or v8i1.
-/// AVX-512 feature.
-static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) {
- SDValue Vec = N->getOperand(0);
- SDLoc dl(Vec);
- MVT VecVT = Vec.getSimpleValueType();
- SDValue Idx = N->getOperand(1);
- MVT EltVT = N->getSimpleValueType(0);
-
- assert((VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8) ||
- "Unexpected operands in ExtractBitFromMaskVector");
-
- // variable index
- if (!isa<ConstantSDNode>(Idx)) {
- MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
- SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
- SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- ExtVT.getVectorElementType(), Ext);
- return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
- }
-
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-
- MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits());
- unsigned MaxShift = VecVT.getSizeInBits() - 1;
- Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec);
- Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec,
- DAG.getConstant(MaxShift - IdxVal, ScalarVT));
- Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec,
- DAG.getConstant(MaxShift, ScalarVT));
-
- if (VecVT == MVT::v16i1) {
- Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec);
- return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec);
- }
- return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec);
-}
-
/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
/// to a simple store and scalar loads to extract the elements.
@@ -16426,10 +16875,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
SDValue InputVector = N->getOperand(0);
- if (InputVector.getValueType().getVectorElementType() == MVT::i1 &&
- !DCI.isBeforeLegalize())
- return ExtractBitFromMaskVector(N, DAG);
-
// Detect whether we are trying to convert from mmx to i32 and the bitcast
// from mmx to v2i32 has a single usage.
if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
@@ -16975,12 +17420,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// Check if SETCC has already been promoted
- TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) {
+ TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT &&
+ // Check that condition value type matches vselect operand type
+ CondVT == VT) {
assert(Cond.getValueType().isVector() &&
"vector select expects a vector selector!");
- EVT IntVT = Cond.getValueType();
bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
@@ -16995,7 +17441,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
ISD::CondCode NewCC =
ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
Cond.getOperand(0).getValueType().isInteger());
- Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
std::swap(LHS, RHS);
TValIsAllOnes = FValIsAllOnes;
FValIsAllZeros = TValIsAllZeros;
@@ -17008,16 +17454,91 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (TValIsAllOnes && FValIsAllZeros)
Ret = Cond;
else if (TValIsAllOnes)
- Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
- DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
+ Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
+ DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
else if (FValIsAllZeros)
- Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
- DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));
+ Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
+ DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
}
}
+ // Try to fold this VSELECT into a MOVSS/MOVSD
+ if (N->getOpcode() == ISD::VSELECT &&
+ Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
+ if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
+ (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
+ bool CanFold = false;
+ unsigned NumElems = Cond.getNumOperands();
+ SDValue A = LHS;
+ SDValue B = RHS;
+
+ if (isZero(Cond.getOperand(0))) {
+ CanFold = true;
+
+ // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
+ // fold (vselect <0,-1>, A, B) -> (movsd A, B)
+ for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
+ CanFold = isAllOnes(Cond.getOperand(i));
+ } else if (isAllOnes(Cond.getOperand(0))) {
+ CanFold = true;
+ std::swap(A, B);
+
+ // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
+ // fold (vselect <-1,0>, A, B) -> (movsd B, A)
+ for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
+ CanFold = isZero(Cond.getOperand(i));
+ }
+
+ if (CanFold) {
+ if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
+ return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
+ }
+
+ if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
+ // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
+ // (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
+ // (v2i64 (bitcast B)))))
+ //
+ // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
+ // (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
+ // (v2f64 (bitcast B)))))
+ //
+ // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
+ // (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
+ // (v2i64 (bitcast A)))))
+ //
+ // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
+ // (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
+ // (v2f64 (bitcast A)))))
+
+ CanFold = (isZero(Cond.getOperand(0)) &&
+ isZero(Cond.getOperand(1)) &&
+ isAllOnes(Cond.getOperand(2)) &&
+ isAllOnes(Cond.getOperand(3)));
+
+ if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
+ isAllOnes(Cond.getOperand(1)) &&
+ isZero(Cond.getOperand(2)) &&
+ isZero(Cond.getOperand(3))) {
+ CanFold = true;
+ std::swap(LHS, RHS);
+ }
+
+ if (CanFold) {
+ EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
+ SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
+ SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
+ SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
+ NewB, DAG);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Select);
+ }
+ }
+ }
+ }
+
// If we know that this node is legal then we know that it is going to be
// matched by one of the SSE/AVX BLEND instructions. These instructions only
// depend on the highest bit in each word. Try to use SimplifyDemandedBits
@@ -17466,7 +17987,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
}
/// \brief Returns a vector of 0s if the node in input is a vector logical
-/// shift by a constant amount which is known to be bigger than or equal
+/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
@@ -17486,7 +18007,7 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
// SSE2/AVX2 logical shifts always return a vector of 0s
- // if the shift amount is bigger than or equal to
+ // if the shift amount is bigger than or equal to
// the element size. The constant shift amount will be
// encoded as a 8-bit immediate.
if (ShiftAmt.trunc(8).uge(MaxAmount))
@@ -17571,18 +18092,42 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
- bool is64BitFP = (CMP00.getValueType() == MVT::f64);
- X86ISD::NodeType NTOperator = is64BitFP ?
- X86ISD::FSETCCsd : X86ISD::FSETCCss;
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
- SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
+ if (Subtarget->hasAVX512()) {
+ SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
+ CMP01, DAG.getConstant(x86cc, MVT::i8));
+ if (N->getValueType(0) != MVT::i1)
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
+ FSetCC);
+ return FSetCC;
+ }
+ SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
+ CMP00.getValueType(), CMP00, CMP01,
DAG.getConstant(x86cc, MVT::i8));
- SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
- OnesOrZeroesF);
- SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
- DAG.getConstant(1, MVT::i32));
+
+ bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+ MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
+
+ if (is64BitFP && !Subtarget->is64Bit()) {
+ // On a 32-bit target, we cannot bitcast the 64-bit float to a
+ // 64-bit integer, since that's not a legal type. Since
+ // OnesOrZeroesF is all ones or all zeroes, we don't need all the
+ // bits, but can do this little dance to extract the lowest 32 bits
+ // and work with those going forward.
+ SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+ OnesOrZeroesF);
+ SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
+ Vector64);
+ OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
+ Vector32, DAG.getIntPtrConstant(0));
+ IntVT = MVT::i32;
+ }
+
+ SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
+ SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
+ DAG.getConstant(1, IntVT));
SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
return OneBitOfTruth;
}
@@ -17715,9 +18260,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (R.getNode())
return R;
- // Create BLSI, BLSR, and BZHI instructions
- // BLSI is X & (-X)
- // BLSR is X & (X-1)
+ // Create BEXTR and BZHI instructions
// BZHI is X & ((1 << Y) - 1)
// BEXTR is ((X >> imm) & (2**size-1))
if (VT == MVT::i32 || VT == MVT::i64) {
@@ -17725,28 +18268,6 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
- if (Subtarget->hasBMI()) {
- // Check LHS for neg
- if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
- isZero(N0.getOperand(0)))
- return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
-
- // Check RHS for neg
- if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
- isZero(N1.getOperand(0)))
- return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
-
- // Check LHS for X-1
- if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
- isAllOnes(N0.getOperand(1)))
- return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
-
- // Check RHS for X-1
- if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
- isAllOnes(N1.getOperand(1)))
- return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
- }
-
if (Subtarget->hasBMI2()) {
// Check for (and (add (shl 1, Y), -1), X)
if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) {
@@ -17822,7 +18343,6 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- EVT VT = N->getValueType(0);
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -17832,6 +18352,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
// look for psign/blend
if (VT == MVT::v2i64 || VT == MVT::v4i64) {
@@ -17917,6 +18438,18 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool OptForSize = MF.getFunction()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+ // SHLD/SHRD instructions have lower register pressure, but on some
+ // platforms they have higher latency than the equivalent
+ // series of shifts and ors that would otherwise be generated.
+ // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
+ // have higher latencies and we are not optimizing for size.
+ if (!OptForSize && Subtarget->isSHLDSlow())
+ return SDValue();
+
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
@@ -18010,7 +18543,6 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- EVT VT = N->getValueType(0);
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -18020,28 +18552,6 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
return RV;
}
- // Try forming BMI if it is available.
- if (!Subtarget->hasBMI())
- return SDValue();
-
- if (VT != MVT::i32 && VT != MVT::i64)
- return SDValue();
-
- assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
-
- // Create BLSMSK instructions by finding X ^ (X-1)
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
-
- if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
- isAllOnes(N0.getOperand(1)))
- return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
-
- if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
- isAllOnes(N1.getOperand(1)))
- return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
-
return SDValue();
}
@@ -18845,6 +19355,17 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ if (N0.getOpcode() == ISD::TRUNCATE &&
+ N0.hasOneUse() &&
+ N0.getOperand(0).hasOneUse()) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+ return DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+ N00.getOperand(0), N00.getOperand(1)),
+ DAG.getConstant(1, VT));
+ }
+ }
if (VT.is256BitVector()) {
SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
if (R.getNode())
@@ -18856,10 +19377,13 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
// Optimize x == -y --> x+y == 0
// x != -y --> x+y != 0
-static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget* Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
@@ -18877,17 +19401,51 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
return DAG.getSetCC(SDLoc(N), N->getValueType(0),
addV, DAG.getConstant(0, addV.getValueType()), CC);
}
+
+ if (VT.getScalarType() == MVT::i1) {
+ bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+ (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+ bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
+ if (!IsSEXT0 && !IsVZero0)
+ return SDValue();
+ bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
+ (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+ bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ if (!IsSEXT1 && !IsVZero1)
+ return SDValue();
+
+ if (IsSEXT0 && IsVZero1) {
+ assert(VT == LHS.getOperand(0).getValueType() && "Unexpected operand type");
+ if (CC == ISD::SETEQ)
+ return DAG.getNOT(DL, LHS.getOperand(0), VT);
+ return LHS.getOperand(0);
+ }
+ if (IsSEXT1 && IsVZero0) {
+ assert(VT == RHS.getOperand(0).getValueType() && "Unexpected operand type");
+ if (CC == ISD::SETEQ)
+ return DAG.getNOT(DL, RHS.getOperand(0), VT);
+ return RHS.getOperand(0);
+ }
+ }
+
return SDValue();
}
// Helper function of PerformSETCCCombine. It materializes "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
- return DAG.getNode(ISD::AND, DL, MVT::i8,
+static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
+ MVT VT) {
+ if (VT == MVT::i8)
+ return DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+ DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
+ DAG.getConstant(1, VT));
+ assert(VT == MVT::i1 && "Unexpected type for SETCC node");
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
- DAG.getConstant(1, MVT::i8));
+ DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
}
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
@@ -18912,7 +19470,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
- return MaterializeSETB(DL, NewEFLAGS, DAG);
+ return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
}
}
@@ -18920,7 +19478,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
// a zext and produces an all-ones bit which is more useful than 0/1 in some
// cases.
if (CC == X86::COND_B)
- return MaterializeSETB(DL, EFLAGS, DAG);
+ return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
SDValue Flags;
@@ -19155,7 +19713,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
- case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
+ case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
@@ -19808,8 +20366,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
if (Res.second == 0) {
// Map st(0) -> st(7) -> ST0
if (Constraint.size() == 7 && Constraint[0] == '{' &&
- std::tolower(Constraint[1]) == 's' &&
- std::tolower(Constraint[2]) == 't' &&
+ tolower(Constraint[1]) == 's' &&
+ tolower(Constraint[2]) == 't' &&
Constraint[3] == '(' &&
(Constraint[4] >= '0' && Constraint[4] <= '7') &&
Constraint[5] == ')' &&
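The new constant-shift path added to LowerShift above turns a SHL by a constant build_vector into a vector multiply: each lane shifted by c is instead multiplied by 1 << c, and lanes whose shift count reaches the element width become undef. Below is a minimal standalone C++ sketch of the per-lane identity this relies on; it is illustrative only and not part of the patch.

  // Sketch: a left shift by a constant c equals a multiply by (1 << c),
  // which is the identity the new v8i16/v4i32 (and AVX2 v16i16) path uses.
  #include <cassert>
  #include <cstdint>

  int main() {
    const uint32_t Lanes[4]  = {7u, 100u, 65535u, 0x80000001u};
    const unsigned Counts[4] = {1, 5, 16, 3}; // per-lane constant shift amounts
    for (int i = 0; i != 4; ++i) {
      uint32_t Shifted    = Lanes[i] << Counts[i];
      uint32_t Multiplied = Lanes[i] * (uint32_t(1) << Counts[i]);
      assert(Shifted == Multiplied); // holds for any count below the lane width
    }
    return 0;
  }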
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index bc3dd60..0f0d17b 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -15,16 +15,15 @@
#ifndef X86ISELLOWERING_H
#define X86ISELLOWERING_H
-#include "X86MachineFunctionInfo.h"
-#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
namespace llvm {
+ class X86TargetMachine;
+
namespace X86ISD {
// X86 Specific DAG Nodes
enum NodeType {
@@ -94,6 +93,9 @@ namespace llvm {
/// operand, usually produced by a CMP instruction.
SETCC,
+ /// X86 Select
+ SELECT,
+
// Same as SETCC except it's materialized with a sbb and the value is all
// one's or all zero's.
SETCC_CARRY, // R = carry_bit ? ~0 : 0
@@ -101,7 +103,7 @@ namespace llvm {
/// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
/// Operands are two FP values to compare; result is a mask of
/// 0s or 1s. Generally DTRT for C/C++ with NaNs.
- FSETCCss, FSETCCsd,
+ FSETCC,
/// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values,
/// result in an integer GPR. Needs masking for scalar result.
@@ -242,12 +244,9 @@ namespace llvm {
/// the list of operands.
TC_RETURN,
- // VZEXT_MOVL - Vector move low and zero extend.
+ // VZEXT_MOVL - Vector move to low scalar and zero higher vector elements.
VZEXT_MOVL,
- // VSEXT_MOVL - Vector move low and sign extend.
- VSEXT_MOVL,
-
// VZEXT - Vector integer zero-extend.
VZEXT,
@@ -292,9 +291,6 @@ namespace llvm {
ADD, SUB, ADC, SBB, SMUL,
INC, DEC, OR, XOR, AND,
- BLSI, // BLSI - Extract lowest set isolated bit
- BLSMSK, // BLSMSK - Get mask up to lowest set bit
- BLSR, // BLSR - Reset lowest set bit
BZHI, // BZHI - Zero high bits
BEXTR, // BEXTR - Bit field extract
@@ -309,12 +305,12 @@ namespace llvm {
// TESTP - Vector packed fp sign bitwise comparisons.
TESTP,
- // TESTM - Vector "test" in AVX-512, the result is in a mask vector.
+ // TESTM, TESTNM - Vector "test" in AVX-512, the result is in a mask vector.
TESTM,
+ TESTNM,
// OR/AND test for masks
KORTEST,
- KTEST,
// Several flavors of instructions with vector shuffle behaviors.
PALIGNR,
@@ -337,12 +333,15 @@ namespace llvm {
VPERMILP,
VPERMV,
VPERMV3,
+ VPERMIV3,
VPERMI,
VPERM2X128,
VBROADCAST,
// masked broadcast
VBROADCASTM,
+ // Insert/Extract vector element
VINSERT,
+ VEXTRACT,
// PMULUDQ - Vector multiply packed unsigned doubleword integers
PMULUDQ,
@@ -522,32 +521,32 @@ namespace llvm {
//===--------------------------------------------------------------------===//
// X86TargetLowering - X86 Implementation of the TargetLowering interface
- class X86TargetLowering : public TargetLowering {
+ class X86TargetLowering final : public TargetLowering {
public:
explicit X86TargetLowering(X86TargetMachine &TM);
- virtual unsigned getJumpTableEncoding() const;
+ unsigned getJumpTableEncoding() const override;
- virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
+ MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; }
- virtual const MCExpr *
+ const MCExpr *
LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB, unsigned uid,
- MCContext &Ctx) const;
+ MCContext &Ctx) const override;
/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
/// jumptable.
- virtual SDValue getPICJumpTableRelocBase(SDValue Table,
- SelectionDAG &DAG) const;
- virtual const MCExpr *
+ SDValue getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const override;
+ const MCExpr *
getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
- unsigned JTI, MCContext &Ctx) const;
+ unsigned JTI, MCContext &Ctx) const override;
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest are at
/// 4-byte boundaries.
- virtual unsigned getByValTypeAlignment(Type *Ty) const;
+ unsigned getByValTypeAlignment(Type *Ty) const override;
/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
@@ -560,10 +559,9 @@ namespace llvm {
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
- virtual EVT
- getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
- MachineFunction &MF) const;
+ EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
/// isSafeMemOpType - Returns true if it's safe to use load / store of the
/// specified type to expand memcpy / memset inline. This is mostly true
@@ -571,88 +569,91 @@ namespace llvm {
/// targets without SSE2 f64 load / store are done with fldl / fstpl which
/// also does type conversion. Note the specified type doesn't have to be
/// legal as the hook is used before type legalization.
- virtual bool isSafeMemOpType(MVT VT) const;
+ bool isSafeMemOpType(MVT VT) const override;
/// allowsUnalignedMemoryAccesses - Returns true if the target allows
/// unaligned memory accesses. of the specified type. Returns whether it
/// is "fast" by reference in the second argument.
- virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+ bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
+ bool *Fast) const override;
/// LowerOperation - Provide custom lowering hooks for some operations.
///
- virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
/// ReplaceNodeResults - Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
- virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
- SelectionDAG &DAG) const;
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const override;
- virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
- virtual bool isTypeDesirableForOp(unsigned Opc, EVT VT) const;
+ bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
/// isTypeDesirable - Return true if the target has native support for the
/// specified value type and it is 'desirable' to use the type. e.g. On x86
/// i16 is legal, but undesirable since i16 instruction encodings are longer
/// and some i16 instructions are slow.
- virtual bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const;
+ bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
- virtual MachineBasicBlock *
+ MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *MBB) const override;
/// getTargetNodeName - This method returns the name of a target specific
/// DAG node.
- virtual const char *getTargetNodeName(unsigned Opcode) const;
+ const char *getTargetNodeName(unsigned Opcode) const override;
/// getSetCCResultType - Return the value type to use for ISD::SETCC.
- virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
+ EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
/// computeMaskedBitsForTargetNode - Determine which of the bits specified
/// in Mask are known to be either zero or one and return them in the
/// KnownZero/KnownOne bitsets.
- virtual void computeMaskedBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth = 0) const;
+ void computeMaskedBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
// ComputeNumSignBitsForTargetNode - Determine the number of bits in the
// operation that are sign bits.
- virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
- unsigned Depth) const;
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ unsigned Depth) const override;
- virtual bool
- isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const;
+ bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
+ int64_t &Offset) const override;
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
- virtual bool ExpandInlineAsm(CallInst *CI) const;
+ bool ExpandInlineAsm(CallInst *CI) const override;
- ConstraintType getConstraintType(const std::string &Constraint) const;
+ ConstraintType
+ getConstraintType(const std::string &Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
- virtual ConstraintWeight getSingleConstraintMatchWeight(
- AsmOperandInfo &info, const char *constraint) const;
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const override;
- virtual const char *LowerXConstraint(EVT ConstraintVT) const;
+ const char *LowerXConstraint(EVT ConstraintVT) const override;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops. If hasMemory is
/// true it means one of the asm constraints of the inline asm instruction
/// being processed is 'm'.
- virtual void LowerAsmOperandForConstraint(SDValue Op,
- std::string &Constraint,
- std::vector<SDValue> &Ops,
- SelectionDAG &DAG) const;
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
/// getRegForInlineAsmConstraint - Given a physical register constraint
/// (e.g. {edx}), return the register number and the register class for the
@@ -660,31 +661,34 @@ namespace llvm {
/// error, this returns a register number of 0.
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
- MVT VT) const;
+ MVT VT) const override;
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
- virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
+ bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can
/// compare a register against the immediate without having to materialize
/// the immediate into a register.
- virtual bool isLegalICmpImmediate(int64_t Imm) const;
+ bool isLegalICmpImmediate(int64_t Imm) const override;
/// isLegalAddImmediate - Return true if the specified immediate is legal
/// add immediate, that is the target has add instructions which can
/// add a register and the immediate without having to materialize
/// the immediate into a register.
- virtual bool isLegalAddImmediate(int64_t Imm) const;
+ bool isLegalAddImmediate(int64_t Imm) const override;
+
+
+ bool isVectorShiftByScalarCheap(Type *Ty) const override;
/// isTruncateFree - Return true if it's free to truncate a value of
/// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
/// register EAX to i16 by referencing its sub-register AX.
- virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
- virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
- virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const;
+ bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
/// isZExtFree - Return true if any actual instruction that defines a
/// value of type Ty1 implicit zero-extends the value to Ty2 in the result
@@ -694,44 +698,44 @@ namespace llvm {
/// does not necessarily apply to truncate instructions. e.g. on x86-64,
/// all instructions that define 32-bit values implicit zero-extend the
/// result out to 64 bits.
- virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
- virtual bool isZExtFree(EVT VT1, EVT VT2) const;
- virtual bool isZExtFree(SDValue Val, EVT VT2) const;
+ bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+ bool isZExtFree(EVT VT1, EVT VT2) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
/// expanded to FMAs when this method returns true, otherwise fmuladd is
/// expanded to fmul + fadd.
- virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
/// isNarrowingProfitable - Return true if it's profitable to narrow
/// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
/// from i32 to i8 but not from i32 to i16.
- virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const;
+ bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
- virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask
/// values are assumed to be legal.
- virtual bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
- EVT VT) const;
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const override;
/// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. Targets can
/// use this to indicate if there is a suitable
/// VECTOR_SHUFFLE that can be used to replace a VAND with a constant
/// pool entry.
- virtual bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
- EVT VT) const;
+ bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const override;
/// ShouldShrinkFPConstant - If true, then instruction selection should
/// seek to shrink the FP constant of the specified type to a smaller type
/// in order to save space and / or reduce runtime.
- virtual bool ShouldShrinkFPConstant(EVT VT) const {
+ bool ShouldShrinkFPConstant(EVT VT) const override {
// Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
// expensive than a straight movsd. On the other hand, it's important to
// shrink long double fp constant since fldt is very slow.
@@ -752,7 +756,7 @@ namespace llvm {
/// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine
/// for fptoui.
bool isTargetFTOL() const {
- return Subtarget->isTargetWindows() && !Subtarget->is64Bit();
+ return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
}
/// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be
@@ -761,28 +765,39 @@ namespace llvm {
return isTargetFTOL() && VT == MVT::i64;
}
+ /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// to just the constant itself.
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
+
+ /// Intel processors have a unified instruction and data cache
+ const char * getClearCacheBuiltinName() const {
+ return 0; // nothing to do, move along.
+ }
+
/// createFastISel - This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
- virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
- const TargetLibraryInfo *libInfo) const;
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const override;
/// getStackCookieLocation - Return true if the target stores stack
/// protector cookies at a fixed offset in some non-standard address
/// space, and populates the address space and offset as
/// appropriate.
- virtual bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const;
+ bool getStackCookieLocation(unsigned &AddressSpace,
+ unsigned &Offset) const override;
SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
SelectionDAG &DAG) const;
- virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const LLVM_OVERRIDE;
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
/// \brief Reset the operation actions based on target options.
- virtual void resetOperationActions();
+ void resetOperationActions() override;
protected:
std::pair<const TargetRegisterClass*, uint8_t>
- findRepresentativeClass(MVT VT) const;
+ findRepresentativeClass(MVT VT) const override;
private:
/// Subtarget - Keep a pointer to the X86Subtarget around so that we can
@@ -857,6 +872,7 @@ namespace llvm {
SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -865,7 +881,6 @@ namespace llvm {
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
@@ -874,9 +889,6 @@ namespace llvm {
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerToBT(SDValue And, ISD::CondCode CC,
SDLoc dl, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
@@ -897,37 +909,34 @@ namespace llvm {
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
- virtual SDValue
+ SDValue
LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
- virtual SDValue
- LowerCall(CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const;
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
- virtual SDValue
- LowerReturn(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc dl, SelectionDAG &DAG) const;
+ SDValue LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc dl, SelectionDAG &DAG) const override;
- virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
+ bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
- virtual bool mayBeEmittedAsTailCall(CallInst *CI) const;
+ bool mayBeEmittedAsTailCall(CallInst *CI) const override;
- virtual MVT
- getTypeForExtArgOrReturn(MVT VT, ISD::NodeType ExtendKind) const;
+ MVT getTypeForExtArgOrReturn(MVT VT,
+ ISD::NodeType ExtendKind) const override;
- virtual bool
- CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- LLVMContext &Context) const;
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
- virtual const uint16_t *getScratchRegisters(CallingConv::ID CC) const;
+ const uint16_t *getScratchRegisters(CallingConv::ID CC) const override;
/// Utility function to emit atomic-load-arith operations (and, or, xor,
/// nand, max, min, umax, umin). It takes the corresponding instruction to
@@ -973,6 +982,9 @@ namespace llvm {
MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const;
+ MachineBasicBlock *emitFMA3Instr(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent, for use with the given x86 condition code.
SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index cb19fbd..2c5edf6 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -90,16 +90,17 @@ def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16f32 immAllZerosV))]>;
}
+let Predicates = [HasAVX512] in {
def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
def : Pat<(v16i32 immAllZerosV), (AVX512_512_SET0)>;
def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
-def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>;
+}
//===----------------------------------------------------------------------===//
// AVX-512 - VECTOR INSERT
//
// -- 32x8 form --
-let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst),
(ins VR512:$src1, VR128X:$src2, i8imm:$src3),
"vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -112,7 +113,7 @@ def VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst),
}
// -- 64x4 fp form --
-let neverHasSideEffects = 1, ExeDomain = SSEPackedDouble in {
+let hasSideEffects = 0, ExeDomain = SSEPackedDouble in {
def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst),
(ins VR512:$src1, VR256X:$src2, i8imm:$src3),
"vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -124,7 +125,7 @@ def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst),
[]>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
}
// -- 32x4 integer form --
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst),
(ins VR512:$src1, VR128X:$src2, i8imm:$src3),
"vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -137,7 +138,7 @@ def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst),
}
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
// -- 64x4 form --
def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst),
(ins VR512:$src1, VR256X:$src2, i8imm:$src3),
@@ -162,12 +163,12 @@ def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v2i64 VR128X:$src2),
def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2),
(iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2,
(INSERT_get_vinsert128_imm VR512:$ins))>;
-
+
def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2),
(iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR512:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1),
- (bc_v4i32 (loadv2i64 addr:$src2)),
+ (bc_v4i32 (loadv2i64 addr:$src2)),
(iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR512:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (loadv2f64 addr:$src2),
@@ -207,12 +208,12 @@ def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1),
// vinsertps - insert f32 to XMM
def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3),
- "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insrtps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
EVEX_4V;
def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3),
- "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insrtps VR128X:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
@@ -220,7 +221,7 @@ def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
//===----------------------------------------------------------------------===//
// AVX-512 VECTOR EXTRACT
//---
-let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
// -- 32x4 form --
def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst),
(ins VR512:$src1, i8imm:$src2),
@@ -243,7 +244,7 @@ def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs),
[]>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
}
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
// -- 32x4 form --
def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst),
(ins VR512:$src1, i8imm:$src2),
@@ -352,15 +353,15 @@ def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)),
// vextractps - extract 32 bits from XMM
def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
(ins VR128X:$src1, u32u8imm:$src2),
- "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
EVEX;
def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
(ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2),
- "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
- addr:$dst)]>, EVEX;
+ addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>;
//===---------------------------------------------------------------------===//
// AVX-512 BROADCAST
@@ -369,19 +370,19 @@ multiclass avx512_fp_broadcast<bits<8> opc, string OpcodeStr,
RegisterClass DestRC,
RegisterClass SrcRC, X86MemOperand x86memop> {
def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[]>, EVEX;
def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),[]>, EVEX;
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),[]>, EVEX;
}
let ExeDomain = SSEPackedSingle in {
- defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss{z}", VR512,
+ defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss", VR512,
VR128X, f32mem>,
EVEX_V512, EVEX_CD8<32, CD8VT1>;
}
let ExeDomain = SSEPackedDouble in {
- defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd{z}", VR512,
+ defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd", VR512,
VR128X, f64mem>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
}
@@ -399,12 +400,12 @@ def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr,
RegisterClass SrcRC, RegisterClass KRC> {
def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[]>, EVEX, EVEX_V512;
def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst),
(ins KRC:$mask, SrcRC:$src),
!strconcat(OpcodeStr,
- "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+ " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
[]>, EVEX, EVEX_V512, EVEX_KZ;
}
@@ -420,6 +421,8 @@ def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
(VPBROADCASTDrZrr GR32:$src)>;
+def : Pat<(v16i32 (X86VBroadcastm VK16WM:$mask, (i32 GR32:$src))),
+ (VPBROADCASTDrZkrr VK16WM:$mask, GR32:$src)>;
def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
(VPBROADCASTQrZrr GR64:$src)>;
def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))),
@@ -430,30 +433,37 @@ def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))),
def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))),
(VPBROADCASTQrZrr GR64:$src)>;
+def : Pat<(v16i32 (int_x86_avx512_mask_pbroadcast_d_gpr_512 (i32 GR32:$src),
+ (v16i32 immAllZerosV), (i16 GR16:$mask))),
+ (VPBROADCASTDrZkrr (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src),
+ (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))),
+ (VPBROADCASTQrZkrr (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>;
+
multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, PatFrag ld_frag,
RegisterClass DstRC, ValueType OpVT, ValueType SrcVT,
RegisterClass KRC> {
def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst,
(OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX;
def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
VR128X:$src),
!strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
[(set DstRC:$dst,
(OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>,
EVEX, EVEX_KZ;
let mayLoad = 1 in {
def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst,
(OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX;
def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
x86memop:$src),
!strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
[(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask,
(ld_frag addr:$src))))]>, EVEX, EVEX_KZ;
}
@@ -503,7 +513,7 @@ multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
RegisterClass DstRC, RegisterClass KRC,
ValueType OpVT, ValueType SrcVT> {
def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[]>, EVEX;
}
@@ -522,14 +532,14 @@ multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
def ri : AVX512AIi8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
EVEX;
def mi : AVX512AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins x86memop:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(OpVT (OpNode (mem_frag addr:$src1),
(i8 imm:$src2))))]>, EVEX;
@@ -548,14 +558,14 @@ multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC,
def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(OpVT (X86VPermv RC:$src1, RC:$src2)))]>, EVEX_4V;
def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(OpVT (X86VPermv RC:$src1, (mem_frag addr:$src2))))]>,
EVEX_4V;
@@ -575,97 +585,104 @@ defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem,
// -- VPERM2I - 3 source operands form --
multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC,
PatFrag mem_frag, X86MemOperand x86memop,
- ValueType OpVT> {
+ SDNode OpNode, ValueType OpVT> {
let Constraints = "$src1 = $dst" in {
def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
- (OpVT (X86VPermv3 RC:$src1, RC:$src2, RC:$src3)))]>,
+ (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
EVEX_4V;
def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
- (OpVT (X86VPermv3 RC:$src1, RC:$src2,
+ (OpVT (OpNode RC:$src1, RC:$src2,
(mem_frag addr:$src3))))]>, EVEX_4V;
}
}
defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem,
- v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ X86VPermiv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem,
- v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ X86VPermiv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512mem,
- v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ X86VPermiv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem,
- v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
+ X86VPermiv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPERMT2D : avx512_perm_3src<0x7E, "vpermt2d", VR512, memopv16i32, i512mem,
+ X86VPermv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMT2Q : avx512_perm_3src<0x7E, "vpermt2q", VR512, memopv8i64, i512mem,
+ X86VPermv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512mem,
+ X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem,
+ X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 - BLEND using mask
//
-multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, Intrinsic Int,
+multiclass avx512_blendmask<bits<8> opc, string OpcodeStr,
RegisterClass KRC, RegisterClass RC,
X86MemOperand x86memop, PatFrag mem_frag,
SDNode OpNode, ValueType vt> {
def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
- (ins KRC:$mask, RC:$src1, RC:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [(set RC:$dst, (OpNode KRC:$mask, (vt RC:$src2),
- (vt RC:$src1)))]>, EVEX_4V, EVEX_K;
- def rr_Int : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
- (ins KRC:$mask, RC:$src1, RC:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [(set RC:$dst, (Int KRC:$mask, (vt RC:$src2),
+ (ins KRC:$mask, RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+ [(set RC:$dst, (OpNode KRC:$mask, (vt RC:$src2),
(vt RC:$src1)))]>, EVEX_4V, EVEX_K;
-
- let mayLoad = 1 in {
- def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
- (ins KRC:$mask, RC:$src1, x86memop:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $mask, $dst|$dst, $mask, $src1, $src2}"),
- []>,
- EVEX_4V, EVEX_K;
-
- def rm_Int : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
- (ins KRC:$mask, RC:$src1, x86memop:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $mask, $dst|$dst, $mask, $src1, $src2}"),
- [(set RC:$dst, (Int KRC:$mask, (vt RC:$src1),
- (mem_frag addr:$src2)))]>,
- EVEX_4V, EVEX_K;
- }
+ let mayLoad = 1 in
+ def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_K;
}
let ExeDomain = SSEPackedSingle in
defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps",
- int_x86_avx512_mskblend_ps_512,
VK16WM, VR512, f512mem,
memopv16f32, vselect, v16f32>,
EVEX_CD8<32, CD8VF>, EVEX_V512;
let ExeDomain = SSEPackedDouble in
defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd",
- int_x86_avx512_mskblend_pd_512,
VK8WM, VR512, f512mem,
memopv8f64, vselect, v8f64>,
VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
+def : Pat<(v16f32 (int_x86_avx512_mask_blend_ps_512 (v16f32 VR512:$src1),
+ (v16f32 VR512:$src2), (i16 GR16:$mask))),
+ (VBLENDMPSZrr (COPY_TO_REGCLASS GR16:$mask, VK16WM),
+ VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v8f64 (int_x86_avx512_mask_blend_pd_512 (v8f64 VR512:$src1),
+ (v8f64 VR512:$src2), (i8 GR8:$mask))),
+ (VBLENDMPDZrr (COPY_TO_REGCLASS GR8:$mask, VK8WM),
+ VR512:$src1, VR512:$src2)>;
+
defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd",
- int_x86_avx512_mskblend_d_512,
VK16WM, VR512, f512mem,
memopv16i32, vselect, v16i32>,
EVEX_CD8<32, CD8VF>, EVEX_V512;
defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq",
- int_x86_avx512_mskblend_q_512,
VK8WM, VR512, f512mem,
memopv8i64, vselect, v8i64>,
VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
+def : Pat<(v16i32 (int_x86_avx512_mask_blend_d_512 (v16i32 VR512:$src1),
+ (v16i32 VR512:$src2), (i16 GR16:$mask))),
+ (VPBLENDMDZrr (COPY_TO_REGCLASS GR16:$mask, VK16),
+ VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v8i64 (int_x86_avx512_mask_blend_q_512 (v8i64 VR512:$src1),
+ (v8i64 VR512:$src2), (i8 GR8:$mask))),
+ (VPBLENDMQZrr (COPY_TO_REGCLASS GR8:$mask, VK8),
+ VR512:$src1, VR512:$src2)>;
+
let Predicates = [HasAVX512] in {
def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
(v8f32 VR256X:$src2))),
@@ -681,31 +698,71 @@ def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
(v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
(v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
}
+//===----------------------------------------------------------------------===//
+// Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
+multiclass avx512_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
+ Operand CC, SDNode OpNode, ValueType VT,
+ PatFrag ld_frag, string asm, string asm_alt> {
+ def rr : AVX512Ii8<0xC2, MRMSrcReg,
+ (outs VK1:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ [(set VK1:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
+ IIC_SSE_ALU_F32S_RR>, EVEX_4V;
+ def rm : AVX512Ii8<0xC2, MRMSrcMem,
+ (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ [(set VK1:$dst, (OpNode (VT RC:$src1),
+ (ld_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ def rri_alt : AVX512Ii8<0xC2, MRMSrcReg,
+ (outs VK1:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+ asm_alt, [], IIC_SSE_ALU_F32S_RR>, EVEX_4V;
+ def rmi_alt : AVX512Ii8<0xC2, MRMSrcMem,
+ (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+ asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ }
+}
+
+let Predicates = [HasAVX512] in {
+defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, AVXCC, X86cmpms, f32, loadf32,
+ "vcmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vcmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+ XS;
+defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, AVXCC, X86cmpms, f64, loadf64,
+ "vcmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vcmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+ XD, VEX_W;
+}
multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, RegisterClass KRC,
RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
SDNode OpNode, ValueType vt> {
def rr : AVX512BI<opc, MRMSrcReg,
(outs KRC:$dst), (ins RC:$src1, RC:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V;
def rm : AVX512BI<opc, MRMSrcMem,
(outs KRC:$dst), (ins RC:$src1, x86memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2)))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V;
}
defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem,
- memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512;
+ memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem,
- memopv8i64, X86pcmpeqm, v8i64>, T8, EVEX_V512, VEX_W;
+ memopv8i64, X86pcmpeqm, v8i64>, T8PD, EVEX_V512,
+ VEX_W, EVEX_CD8<64, CD8VF>;
defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem,
- memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512;
+ memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem,
- memopv8i64, X86pcmpgtm, v8i64>, T8, EVEX_V512, VEX_W;
+ memopv8i64, X86pcmpgtm, v8i64>, T8PD, EVEX_V512,
+ VEX_W, EVEX_CD8<64, CD8VF>;
def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(COPY_TO_REGCLASS (VPCMPGTDZrr
@@ -730,12 +787,12 @@ multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC,
[(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2),
imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
// Accept explicit immediate argument form instead of comparison code.
- let neverHasSideEffects = 1 in {
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rri_alt : AVX512AIi8<opc, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+ (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
asm_alt, [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+ (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
}
}
@@ -764,36 +821,43 @@ defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8, VR512, i512mem, memopv8i64,
// avx512_cmp_packed - sse 1 & 2 compare packed instructions
multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC,
- X86MemOperand x86memop, Operand CC,
- SDNode OpNode, ValueType vt, string asm,
- string asm_alt, Domain d> {
+ X86MemOperand x86memop, ValueType vt,
+ string suffix, Domain d> {
def rri : AVX512PIi8<0xC2, MRMSrcReg,
- (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
- [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], d>;
+ (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc),
+ !strconcat("vcmp${cc}", suffix,
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set KRC:$dst, (X86cmpm (vt RC:$src1), (vt RC:$src2), imm:$cc))], d>;
+ def rrib: AVX512PIi8<0xC2, MRMSrcReg,
+ (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc),
+ !strconcat("vcmp${cc}", suffix,
+ " \t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"),
+ [], d>, EVEX_B;
def rmi : AVX512PIi8<0xC2, MRMSrcMem,
- (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc),
+ !strconcat("vcmp${cc}", suffix,
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set KRC:$dst,
- (OpNode (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>;
+ (X86cmpm (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>;
// Accept explicit immediate argument form instead of comparison code.
- let neverHasSideEffects = 1 in {
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rri_alt : AVX512PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
- asm_alt, [], d>;
+ !strconcat("vcmp", suffix,
+ " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>;
def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
- asm_alt, [], d>;
+ !strconcat("vcmp", suffix,
+ " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>;
}
}
-defm VCMPPSZ : avx512_cmp_packed<VK16, VR512, f512mem, AVXCC, X86cmpm, v16f32,
- "vcmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "vcmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VCMPPDZ : avx512_cmp_packed<VK8, VR512, f512mem, AVXCC, X86cmpm, v8f64,
- "vcmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "vcmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble>, OpSize, EVEX_4V, VEX_W, EVEX_V512,
+defm VCMPPSZ : avx512_cmp_packed<VK16, VR512, f512mem, v16f32,
+ "ps", SSEPackedSingle>, PS, EVEX_4V, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VCMPPDZ : avx512_cmp_packed<VK8, VR512, f512mem, v8f64,
+ "pd", SSEPackedDouble>, PD, EVEX_4V, VEX_W, EVEX_V512,
EVEX_CD8<64, CD8VF>;
def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)),
@@ -811,7 +875,31 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
(v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
(v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
imm:$cc), VK8)>;
-
+
+def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1),
+ (v16f32 VR512:$src2), imm:$cc, (i16 -1),
+ FROUND_NO_EXC)),
+ (COPY_TO_REGCLASS (VCMPPSZrrib VR512:$src1, VR512:$src2,
+ (I8Imm imm:$cc)), GR16)>;
+
+def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1),
+ (v8f64 VR512:$src2), imm:$cc, (i8 -1),
+ FROUND_NO_EXC)),
+ (COPY_TO_REGCLASS (VCMPPDZrrib VR512:$src1, VR512:$src2,
+ (I8Imm imm:$cc)), GR8)>;
+
+def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1),
+ (v16f32 VR512:$src2), imm:$cc, (i16 -1),
+ FROUND_CURRENT)),
+ (COPY_TO_REGCLASS (VCMPPSZrri VR512:$src1, VR512:$src2,
+ (I8Imm imm:$cc)), GR16)>;
+
+def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1),
+ (v8f64 VR512:$src2), imm:$cc, (i8 -1),
+ FROUND_CURRENT)),
+ (COPY_TO_REGCLASS (VCMPPDZrri VR512:$src1, VR512:$src2,
+ (I8Imm imm:$cc)), GR8)>;
+
// Mask register copy, including
// - copy between mask registers
// - load/store mask registers
@@ -820,35 +908,35 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
string OpcodeStr, RegisterClass KRC,
ValueType vt, X86MemOperand x86memop> {
- let neverHasSideEffects = 1 in {
+ let hasSideEffects = 0 in {
def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
let mayLoad = 1 in
def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[(set KRC:$dst, (vt (load addr:$src)))]>;
let mayStore = 1 in
def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
}
}
multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
string OpcodeStr,
RegisterClass KRC, RegisterClass GRC> {
- let neverHasSideEffects = 1 in {
+ let hasSideEffects = 0 in {
def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
}
}
let Predicates = [HasAVX512] in {
defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
- VEX, TB;
+ VEX, PS;
defm KMOVW : avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
- VEX, TB;
+ VEX, PS;
}
let Predicates = [HasAVX512] in {
@@ -862,8 +950,40 @@ let Predicates = [HasAVX512] in {
def : Pat<(store (v16i1 VK16:$src), addr:$dst),
(KMOVWmk addr:$dst, VK16:$src)>;
- def : Pat<(store (v8i1 VK8:$src), addr:$dst),
- (KMOVWmk addr:$dst, (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16)))>;
+ def : Pat<(store VK8:$src, addr:$dst),
+ (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
+
+ def : Pat<(i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>;
+
+ def : Pat<(v8i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
+
+ def : Pat<(i1 (trunc (i32 GR32:$src))),
+ (COPY_TO_REGCLASS (KMOVWkr (AND32ri $src, (i32 1))), VK1)>;
+
+ def : Pat<(i1 (trunc (i8 GR8:$src))),
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit), (i32 1))),
+ VK1)>;
+ def : Pat<(i1 (trunc (i16 GR16:$src))),
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))),
+ VK1)>;
+
+ def : Pat<(i32 (zext VK1:$src)),
+ (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
+ def : Pat<(i8 (zext VK1:$src)),
+ (EXTRACT_SUBREG
+ (AND32ri (KMOVWrk
+ (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
+ def : Pat<(i64 (zext VK1:$src)),
+ (AND64ri8 (SUBREG_TO_REG (i64 0),
+ (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
+ def : Pat<(i16 (zext VK1:$src)),
+ (EXTRACT_SUBREG
+ (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
+ sub_16bit)>;
}
// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
let Predicates = [HasAVX512] in {
@@ -876,6 +996,12 @@ let Predicates = [HasAVX512] in {
(EXTRACT_SUBREG
(KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
sub_8bit)>;
+
+ def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK16:$src, VK1)>;
+ def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK8:$src, VK1)>;
+
}
// Mask unary operation
@@ -884,18 +1010,27 @@ multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
RegisterClass KRC, SDPatternOperator OpNode> {
let Predicates = [HasAVX512] in
def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[(set KRC:$dst, (OpNode KRC:$src))]>;
}
multiclass avx512_mask_unop_w<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode> {
defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
- VEX, TB;
+ VEX, PS;
}
defm KNOT : avx512_mask_unop_w<0x44, "knot", not>;
+multiclass avx512_mask_unop_int<string IntName, string InstName> {
+ let Predicates = [HasAVX512] in
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
+ (i16 GR16:$src)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
+ (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>;
+}
+defm : avx512_mask_unop_int<"knot", "KNOT">;
+
def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>;
def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)),
(COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>;
@@ -906,27 +1041,26 @@ def : Pat<(not VK8:$src),
(KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>;
// Mask binary operation
-// - KADD, KAND, KANDN, KOR, KXNOR, KXOR
+// - KAND, KANDN, KOR, KXNOR, KXOR
multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
RegisterClass KRC, SDPatternOperator OpNode> {
let Predicates = [HasAVX512] in
def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>;
}
multiclass avx512_mask_binop_w<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode> {
defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
- VEX_4V, VEX_L, TB;
+ VEX_4V, VEX_L, PS;
}
def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
let isCommutable = 1 in {
- defm KADD : avx512_mask_binop_w<0x4a, "kadd", add>;
defm KAND : avx512_mask_binop_w<0x41, "kand", and>;
let isCommutable = 0 in
defm KANDN : avx512_mask_binop_w<0x42, "kandn", andn>;
@@ -935,19 +1069,33 @@ let isCommutable = 1 in {
defm KXOR : avx512_mask_binop_w<0x47, "kxor", xor>;
}
+def : Pat<(xor VK1:$src1, VK1:$src2),
+ (COPY_TO_REGCLASS (KXORWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
+ (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
+
+def : Pat<(or VK1:$src1, VK1:$src2),
+ (COPY_TO_REGCLASS (KORWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
+ (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
+
+def : Pat<(and VK1:$src1, VK1:$src2),
+ (COPY_TO_REGCLASS (KANDWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
+ (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
+
multiclass avx512_mask_binop_int<string IntName, string InstName> {
let Predicates = [HasAVX512] in
- def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1")
- VK16:$src1, VK16:$src2),
- (!cast<Instruction>(InstName##"Wrr") VK16:$src1, VK16:$src2)>;
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
+ (i16 GR16:$src1), (i16 GR16:$src2)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
+ (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
+ (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>;
}
-defm : avx512_mask_binop_int<"kadd", "KADD">;
defm : avx512_mask_binop_int<"kand", "KAND">;
defm : avx512_mask_binop_int<"kandn", "KANDN">;
defm : avx512_mask_binop_int<"kor", "KOR">;
defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
defm : avx512_mask_binop_int<"kxor", "KXOR">;
+
// With AVX-512, 8-bit mask is promoted to 16-bit mask.
multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> {
let Predicates = [HasAVX512] in
@@ -965,44 +1113,53 @@ defm : avx512_binop_pat<xor, KXORWrr>;
// Mask unpacking
multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr,
- RegisterClass KRC1, RegisterClass KRC2> {
+ RegisterClass KRC> {
let Predicates = [HasAVX512] in
- def rr : I<opc, MRMSrcReg, (outs KRC1:$dst), (ins KRC2:$src1, KRC2:$src2),
+ def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
}
multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> {
- defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16, VK8>,
- VEX_4V, VEX_L, OpSize, TB;
+ defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16>,
+ VEX_4V, VEX_L, PD;
}
defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">;
+def : Pat<(v16i1 (concat_vectors (v8i1 VK8:$src1), (v8i1 VK8:$src2))),
+ (KUNPCKBWrr (COPY_TO_REGCLASS VK8:$src2, VK16),
+ (COPY_TO_REGCLASS VK8:$src1, VK16))>;
+
multiclass avx512_mask_unpck_int<string IntName, string InstName> {
let Predicates = [HasAVX512] in
- def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1")
- VK8:$src1, VK8:$src2),
- (!cast<Instruction>(InstName##"BWrr") VK8:$src1, VK8:$src2)>;
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_bw")
+ (i16 GR16:$src1), (i16 GR16:$src2)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"BWrr")
+ (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
+ (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>;
}
+defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">;
-defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">;
// Mask bit testing
multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
SDNode OpNode> {
let Predicates = [HasAVX512], Defs = [EFLAGS] in
def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>;
}
multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> {
defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
- VEX, TB;
+ VEX, PS;
}
defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
-defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest>;
+
+def : Pat<(X86cmp VK1:$src1, (i1 0)),
+ (KORTESTWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
+ (COPY_TO_REGCLASS VK1:$src1, VK16))>;
// Mask shift
multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
@@ -1010,18 +1167,18 @@ multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
let Predicates = [HasAVX512] in
def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, i8imm:$imm),
!strconcat(OpcodeStr,
- "\t{$imm, $src, $dst|$dst, $src, $imm}"),
+ " \t{$imm, $src, $dst|$dst, $src, $imm}"),
[(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>;
}
multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
SDNode OpNode> {
defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
- VEX, OpSize, TA, VEX_W;
+ VEX, TAPD, VEX_W;
}
-defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", shl>;
-defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", srl>;
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86vsrli>;
// Mask setting all 0s or 1s
multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
@@ -1032,7 +1189,7 @@ multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
}
multiclass avx512_mask_setop_w<PatFrag Val> {
- defm B : avx512_mask_setop<VK8, v8i1, Val>;
+ defm B : avx512_mask_setop<VK8, v8i1, Val>;
defm W : avx512_mask_setop<VK16, v16i1, Val>;
}
@@ -1043,6 +1200,9 @@ defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
let Predicates = [HasAVX512] in {
def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
+ def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>;
+ def : Pat<(i1 1), (COPY_TO_REGCLASS (KSET1W), VK1)>;
+ def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSET1W), VK1)>;
}
def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))),
(v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>;
@@ -1053,153 +1213,167 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
(v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
+def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))),
+ (v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>;
+
+def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))),
+ (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>;
//===----------------------------------------------------------------------===//
// AVX-512 - Aligned and unaligned load and store
//
-multiclass avx512_mov_packed<bits<8> opc, RegisterClass RC, RegisterClass KRC,
+multiclass avx512_load<bits<8> opc, RegisterClass RC, RegisterClass KRC,
X86MemOperand x86memop, PatFrag ld_frag,
- string asm, Domain d> {
-let neverHasSideEffects = 1 in
+ string asm, Domain d,
+ ValueType vt, bit IsReMaterializable = 1> {
+let hasSideEffects = 0 in {
def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
+ !strconcat(asm, " \t{$src, $dst|$dst, $src}"), [], d>,
EVEX;
-let canFoldAsLoad = 1 in
+ def rrkz : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src),
+ !strconcat(asm,
+ " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ [], d>, EVEX, EVEX_KZ;
+ }
+ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (ld_frag addr:$src))], d>, EVEX;
-let Constraints = "$src1 = $dst" in {
+ !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
+ [(set (vt RC:$dst), (ld_frag addr:$src))], d>, EVEX;
+ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, KRC:$mask, RC:$src2),
!strconcat(asm,
- "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>,
+ " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>,
EVEX, EVEX_K;
+ let mayLoad = 1 in
def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, KRC:$mask, x86memop:$src2),
!strconcat(asm,
- "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
[], d>, EVEX, EVEX_K;
-}
+ }
+ let mayLoad = 1 in
+ def rmkz : AVX512PI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, x86memop:$src2),
+ !strconcat(asm,
+ " \t{$src2, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src2}"),
+ [], d>, EVEX, EVEX_KZ;
}
-defm VMOVAPSZ : avx512_mov_packed<0x28, VR512, VK16WM, f512mem, alignedloadv16f32,
- "vmovaps", SSEPackedSingle>,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VMOVAPDZ : avx512_mov_packed<0x28, VR512, VK8WM, f512mem, alignedloadv8f64,
- "vmovapd", SSEPackedDouble>,
- OpSize, EVEX_V512, VEX_W,
- EVEX_CD8<64, CD8VF>;
-defm VMOVUPSZ : avx512_mov_packed<0x10, VR512, VK16WM, f512mem, loadv16f32,
- "vmovups", SSEPackedSingle>,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VMOVUPDZ : avx512_mov_packed<0x10, VR512, VK8WM, f512mem, loadv8f64,
- "vmovupd", SSEPackedDouble>,
- OpSize, EVEX_V512, VEX_W,
- EVEX_CD8<64, CD8VF>;
-def VMOVAPSZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
- "vmovaps\t{$src, $dst|$dst, $src}",
- [(alignedstore512 (v16f32 VR512:$src), addr:$dst)],
- SSEPackedSingle>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
-def VMOVAPDZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
- "vmovapd\t{$src, $dst|$dst, $src}",
- [(alignedstore512 (v8f64 VR512:$src), addr:$dst)],
- SSEPackedDouble>, EVEX, EVEX_V512,
- OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
-def VMOVUPSZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
- "vmovups\t{$src, $dst|$dst, $src}",
- [(store (v16f32 VR512:$src), addr:$dst)],
- SSEPackedSingle>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
-def VMOVUPDZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
- "vmovupd\t{$src, $dst|$dst, $src}",
- [(store (v8f64 VR512:$src), addr:$dst)],
- SSEPackedDouble>, EVEX, EVEX_V512,
- OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
-
-let neverHasSideEffects = 1 in {
- def VMOVDQA32rr : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst),
- (ins VR512:$src),
- "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
- EVEX, EVEX_V512;
- def VMOVDQA64rr : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst),
- (ins VR512:$src),
- "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
- EVEX, EVEX_V512, VEX_W;
-let mayStore = 1 in {
- def VMOVDQA32mr : AVX512BI<0x7F, MRMDestMem, (outs),
- (ins i512mem:$dst, VR512:$src),
- "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
- EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
- def VMOVDQA64mr : AVX512BI<0x7F, MRMDestMem, (outs),
- (ins i512mem:$dst, VR512:$src),
- "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
- EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-}
-let mayLoad = 1 in {
-def VMOVDQA32rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst),
- (ins i512mem:$src),
- "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
- EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
-def VMOVDQA64rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst),
- (ins i512mem:$src),
- "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
- EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-}
-}
-
-// 512-bit aligned load/store
-def : Pat<(alignedloadv8i64 addr:$src), (VMOVDQA64rm addr:$src)>;
-def : Pat<(alignedloadv16i32 addr:$src), (VMOVDQA32rm addr:$src)>;
-
-def : Pat<(alignedstore512 (v8i64 VR512:$src), addr:$dst),
- (VMOVDQA64mr addr:$dst, VR512:$src)>;
-def : Pat<(alignedstore512 (v16i32 VR512:$src), addr:$dst),
- (VMOVDQA32mr addr:$dst, VR512:$src)>;
-
-multiclass avx512_mov_int<bits<8> load_opc, bits<8> store_opc, string asm,
- RegisterClass RC, RegisterClass KRC,
- PatFrag ld_frag, X86MemOperand x86memop> {
-let neverHasSideEffects = 1 in
- def rr : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"), []>, EVEX;
-let canFoldAsLoad = 1 in
- def rm : AVX512XSI<load_opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (ld_frag addr:$src))]>, EVEX;
-let mayStore = 1 in
- def mr : AVX512XSI<store_opc, MRMDestMem, (outs),
- (ins x86memop:$dst, VR512:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"), []>, EVEX;
-let Constraints = "$src1 = $dst" in {
- def rrk : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, KRC:$mask, RC:$src2),
- !strconcat(asm,
- "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>,
+multiclass avx512_store<bits<8> opc, RegisterClass RC, RegisterClass KRC,
+ X86MemOperand x86memop, PatFrag store_frag,
+ string asm, Domain d, ValueType vt> {
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
+ def rr_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(asm, " \t{$src, $dst|$dst, $src}"), [], d>,
+ EVEX;
+ let Constraints = "$src1 = $dst" in
+ def alt_rrk : AVX512PI<opc, MRMDestReg, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2),
+ !strconcat(asm,
+ " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>,
EVEX, EVEX_K;
- def rmk : AVX512XSI<load_opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, KRC:$mask, x86memop:$src2),
- !strconcat(asm,
- "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
- []>, EVEX, EVEX_K;
-}
+ def alt_rrkz : AVX512PI<opc, MRMDestReg, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src),
+ !strconcat(asm,
+ " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ [], d>, EVEX, EVEX_KZ;
+ }
+ let mayStore = 1 in {
+ def mr : AVX512PI<opc, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+ !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
+ [(store_frag (vt RC:$src), addr:$dst)], d>, EVEX;
+ def mrk : AVX512PI<opc, MRMDestMem, (outs),
+ (ins x86memop:$dst, KRC:$mask, RC:$src),
+ !strconcat(asm,
+ " \t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+ [], d>, EVEX, EVEX_K;
+ def mrkz : AVX512PI<opc, MRMDestMem, (outs),
+ (ins x86memop:$dst, KRC:$mask, RC:$src),
+ !strconcat(asm,
+ " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ [], d>, EVEX, EVEX_KZ;
+ }
}
-defm VMOVDQU32 : avx512_mov_int<0x6F, 0x7F, "vmovdqu32", VR512, VK16WM,
- memopv16i32, i512mem>,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VMOVDQU64 : avx512_mov_int<0x6F, 0x7F, "vmovdqu64", VR512, VK8WM,
- memopv8i64, i512mem>,
- EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-// 512-bit unaligned load/store
-def : Pat<(loadv8i64 addr:$src), (VMOVDQU64rm addr:$src)>;
-def : Pat<(loadv16i32 addr:$src), (VMOVDQU32rm addr:$src)>;
-
-def : Pat<(store (v8i64 VR512:$src), addr:$dst),
- (VMOVDQU64mr addr:$dst, VR512:$src)>;
-def : Pat<(store (v16i32 VR512:$src), addr:$dst),
- (VMOVDQU32mr addr:$dst, VR512:$src)>;
+defm VMOVAPSZ : avx512_load<0x28, VR512, VK16WM, f512mem, alignedloadv16f32,
+ "vmovaps", SSEPackedSingle, v16f32>,
+ avx512_store<0x29, VR512, VK16WM, f512mem, alignedstore512,
+ "vmovaps", SSEPackedSingle, v16f32>,
+ PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVAPDZ : avx512_load<0x28, VR512, VK8WM, f512mem, alignedloadv8f64,
+ "vmovapd", SSEPackedDouble, v8f64>,
+ avx512_store<0x29, VR512, VK8WM, f512mem, alignedstore512,
+ "vmovapd", SSEPackedDouble, v8f64>,
+ PD, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+defm VMOVUPSZ : avx512_load<0x10, VR512, VK16WM, f512mem, loadv16f32,
+ "vmovups", SSEPackedSingle, v16f32>,
+ avx512_store<0x11, VR512, VK16WM, f512mem, store,
+ "vmovups", SSEPackedSingle, v16f32>,
+ PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVUPDZ : avx512_load<0x10, VR512, VK8WM, f512mem, loadv8f64,
+ "vmovupd", SSEPackedDouble, v8f64, 0>,
+ avx512_store<0x11, VR512, VK8WM, f512mem, store,
+ "vmovupd", SSEPackedDouble, v8f64>,
+ PD, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr,
+ (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
+ (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
+
+def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr,
+ (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
+ (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
+
+def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src),
+ GR16:$mask),
+ (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
+ VR512:$src)>;
+def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src),
+ GR8:$mask),
+ (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
+ VR512:$src)>;
+
+defm VMOVDQA32: avx512_load<0x6F, VR512, VK16WM, i512mem, alignedloadv16i32,
+ "vmovdqa32", SSEPackedInt, v16i32>,
+ avx512_store<0x7F, VR512, VK16WM, i512mem, alignedstore512,
+ "vmovdqa32", SSEPackedInt, v16i32>,
+ PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVDQA64: avx512_load<0x6F, VR512, VK8WM, i512mem, alignedloadv8i64,
+ "vmovdqa64", SSEPackedInt, v8i64>,
+ avx512_store<0x7F, VR512, VK8WM, i512mem, alignedstore512,
+ "vmovdqa64", SSEPackedInt, v8i64>,
+ PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VMOVDQU32: avx512_load<0x6F, VR512, VK16WM, i512mem, load,
+ "vmovdqu32", SSEPackedInt, v16i32>,
+ avx512_store<0x7F, VR512, VK16WM, i512mem, store,
+ "vmovdqu32", SSEPackedInt, v16i32>,
+ XS, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVDQU64: avx512_load<0x6F, VR512, VK8WM, i512mem, load,
+ "vmovdqu64", SSEPackedInt, v8i64>,
+ avx512_store<0x7F, VR512, VK8WM, i512mem, store,
+ "vmovdqu64", SSEPackedInt, v8i64>,
+ XS, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
let AddedComplexity = 20 in {
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src),
+ (bc_v8i64 (v16i32 immAllZerosV)))),
+ (VMOVDQU64rrkz VK8WM:$mask, VR512:$src)>;
+
+def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
+ (v8i64 VR512:$src))),
+ (VMOVDQU64rrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
+ VK8), VR512:$src)>;
+
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src),
+ (v16i32 immAllZerosV))),
+ (VMOVDQU32rrkz VK16WM:$mask, VR512:$src)>;
+
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
+ (v16i32 VR512:$src))),
+ (VMOVDQU32rrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
+
def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1),
(v16f32 VR512:$src2))),
(VMOVUPSZrrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
@@ -1215,33 +1389,33 @@ def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src1),
}
// Move Int Doubleword to Packed Double Int
//
-def VMOVDI2PDIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
- "vmovd{z}\t{$src, $dst|$dst, $src}",
+def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
EVEX, VEX_LIG;
-def VMOVDI2PDIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
- "vmovd{z}\t{$src, $dst|$dst, $src}",
+def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v4i32 (scalar_to_vector (loadi32 addr:$src))))],
IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-def VMOV64toPQIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
- "vmovq{z}\t{$src, $dst|$dst, $src}",
+def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector GR64:$src)))],
IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG;
let isCodeGenOnly = 1 in {
-def VMOV64toSDZrr : AVX512SI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
- "vmovq{z}\t{$src, $dst|$dst, $src}",
+def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
-def VMOVSDto64Zrr : AVX512SI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
- "vmovq{z}\t{$src, $dst|$dst, $src}",
+def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
}
-def VMOVSDto64Zmr : AVX512SI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
- "vmovq{z}\t{$src, $dst|$dst, $src}",
+def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
EVEX_CD8<64, CD8VT1>;
@@ -1249,68 +1423,68 @@ def VMOVSDto64Zmr : AVX512SI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$s
// Move Int Doubleword to Single Scalar
//
let isCodeGenOnly = 1 in {
-def VMOVDI2SSZrr : AVX512SI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
- "vmovd{z}\t{$src, $dst|$dst, $src}",
+def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert GR32:$src))],
IIC_SSE_MOVDQ>, EVEX, VEX_LIG;
-def VMOVDI2SSZrm : AVX512SI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
- "vmovd{z}\t{$src, $dst|$dst, $src}",
+def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
}
-// Move Packed Doubleword Int to Packed Double Int
+// Move doubleword from xmm register to r/m32
//
-def VMOVPDI2DIZrr : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
- "vmovd{z}\t{$src, $dst|$dst, $src}",
+def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (vector_extract (v4i32 VR128X:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>,
EVEX, VEX_LIG;
-def VMOVPDI2DIZmr : AVX512SI<0x7E, MRMDestMem, (outs),
+def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128X:$src),
- "vmovd{z}\t{$src, $dst|$dst, $src}",
+ "vmovd\t{$src, $dst|$dst, $src}",
[(store (i32 (vector_extract (v4i32 VR128X:$src),
(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-// Move Packed Doubleword Int first element to Doubleword Int
+// Move quadword from xmm1 register to r/m64
//
def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
- "vmovq{z}\t{$src, $dst|$dst, $src}",
+ "vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
(iPTR 0)))],
- IIC_SSE_MOVD_ToGP>, TB, OpSize, EVEX, VEX_LIG, VEX_W,
+ IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_LIG, VEX_W,
Requires<[HasAVX512, In64BitMode]>;
def VMOVPQIto64Zmr : I<0xD6, MRMDestMem, (outs),
(ins i64mem:$dst, VR128X:$src),
- "vmovq{z}\t{$src, $dst|$dst, $src}",
+ "vmovq\t{$src, $dst|$dst, $src}",
[(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
addr:$dst)], IIC_SSE_MOVDQ>,
- EVEX, OpSize, VEX_LIG, VEX_W, TB, EVEX_CD8<64, CD8VT1>,
+ EVEX, PD, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>,
Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
// Move Scalar Single to Double Int
//
let isCodeGenOnly = 1 in {
-def VMOVSS2DIZrr : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst),
+def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
(ins FR32X:$src),
- "vmovd{z}\t{$src, $dst|$dst, $src}",
+ "vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32X:$src))],
IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG;
-def VMOVSS2DIZmr : AVX512SI<0x7E, MRMDestMem, (outs),
+def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, FR32X:$src),
- "vmovd{z}\t{$src, $dst|$dst, $src}",
+ "vmovd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
}
// Move Quadword Int to Packed Quadword Int
//
-def VMOVQI2PQIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst),
+def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
(ins i64mem:$src),
- "vmovq{z}\t{$src, $dst|$dst, $src}",
+ "vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -1322,40 +1496,55 @@ def VMOVQI2PQIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst),
multiclass avx512_move_scalar <string asm, RegisterClass RC,
SDNode OpNode, ValueType vt,
X86MemOperand x86memop, PatFrag mem_pat> {
+ let hasSideEffects = 0 in {
def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2),
- !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(asm, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128X:$dst, (vt (OpNode VR128X:$src1,
(scalar_to_vector RC:$src2))))],
IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG;
+ let Constraints = "$src1 = $dst" in
+ def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3),
+ !strconcat(asm,
+ " \t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"),
+ [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K;
def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>,
EVEX, VEX_LIG;
def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
[(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
EVEX, VEX_LIG;
+ } //hasSideEffects = 0
}
let ExeDomain = SSEPackedSingle in
-defm VMOVSSZ : avx512_move_scalar<"movss{z}", FR32X, X86Movss, v4f32, f32mem,
+defm VMOVSSZ : avx512_move_scalar<"movss", FR32X, X86Movss, v4f32, f32mem,
loadf32>, XS, EVEX_CD8<32, CD8VT1>;
let ExeDomain = SSEPackedDouble in
-defm VMOVSDZ : avx512_move_scalar<"movsd{z}", FR64X, X86Movsd, v2f64, f64mem,
+defm VMOVSDZ : avx512_move_scalar<"movsd", FR64X, X86Movsd, v2f64, f64mem,
loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
+ (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
+ VK1WM:$mask, (f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>;
+
+def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
+ (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
+ VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>;
// For the disassembler
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, FR32X:$src2),
- "movss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
IIC_SSE_MOV_S_RR>,
XS, EVEX_4V, VEX_LIG;
def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, FR64X:$src2),
- "movsd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
IIC_SSE_MOV_S_RR>,
XD, EVEX_4V, VEX_LIG, VEX_W;
}
@@ -1504,7 +1693,7 @@ let Predicates = [HasAVX512] in {
let AddedComplexity = 15 in
def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src),
- "vmovq{z}\t{$src, $dst|$dst, $src}",
+ "vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst, (v2i64 (X86vzmovl
(v2i64 VR128X:$src))))],
IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
@@ -1512,7 +1701,7 @@ def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
let AddedComplexity = 20 in
def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
(ins i128mem:$src),
- "vmovq{z}\t{$src, $dst|$dst, $src}",
+ "vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst, (v2i64 (X86vzmovl
(loadv2i64 addr:$src))))],
IIC_SSE_MOVDQ>, EVEX, VEX_W,
@@ -1536,6 +1725,8 @@ let Predicates = [HasAVX512] in {
(VMOVZPQILo2PQIZrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
(VMOVZPQILo2PQIZrr VR128X:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)),
+ (VMOVZPQILo2PQIZrm addr:$src)>;
}
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
@@ -1563,104 +1754,251 @@ def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
// AVX-512 - Integer arithmetic
//
multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ ValueType OpVT, RegisterClass KRC,
+ RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop, PatFrag scalar_mfrag,
X86MemOperand x86scalar_mop, string BrdcstStr,
OpndItins itins, bit IsCommutable = 0> {
let isCommutable = IsCommutable in
- def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))],
- itins.rr>, EVEX_4V;
- def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))],
- itins.rm>, EVEX_4V;
- def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86scalar_mop:$src2),
- !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
- ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
- [(set RC:$dst, (OpNode RC:$src1,
- (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))))],
- itins.rm>, EVEX_4V, EVEX_B;
-}
-multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
- ValueType DstVT, ValueType SrcVT, RegisterClass RC,
- PatFrag memop_frag, X86MemOperand x86memop,
- OpndItins itins,
- bit IsCommutable = 0> {
+ def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))],
+ itins.rr>, EVEX_4V;
+ let AddedComplexity = 30 in {
+ let Constraints = "$src0 = $dst" in
+ def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src0, KRC:$mask, RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ [(set RC:$dst, (OpVT (vselect KRC:$mask,
+ (OpNode (OpVT RC:$src1), (OpVT RC:$src2)),
+ RC:$src0)))],
+ itins.rr>, EVEX_4V, EVEX_K;
+ def rrkz : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst {${mask}} {z}" ,
+ "|$dst {${mask}} {z}, $src1, $src2}"),
+ [(set RC:$dst, (OpVT (vselect KRC:$mask,
+ (OpNode (OpVT RC:$src1), (OpVT RC:$src2)),
+ (OpVT immAllZerosV))))],
+ itins.rr>, EVEX_4V, EVEX_KZ;
+ }
+
+ let mayLoad = 1 in {
+ def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))],
+ itins.rm>, EVEX_4V;
+ let AddedComplexity = 30 in {
+ let Constraints = "$src0 = $dst" in
+ def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src0, KRC:$mask, RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ [(set RC:$dst, (OpVT (vselect KRC:$mask,
+ (OpNode (OpVT RC:$src1), (memop_frag addr:$src2)),
+ RC:$src0)))],
+ itins.rm>, EVEX_4V, EVEX_K;
+ def rmkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
+ [(set RC:$dst, (OpVT (vselect KRC:$mask,
+ (OpNode (OpVT RC:$src1), (memop_frag addr:$src2)),
+ (OpVT immAllZerosV))))],
+ itins.rm>, EVEX_4V, EVEX_KZ;
+ }
+ def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86scalar_mop:$src2),
+ !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+ ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+ [(set RC:$dst, (OpNode RC:$src1,
+ (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))))],
+ itins.rm>, EVEX_4V, EVEX_B;
+ let AddedComplexity = 30 in {
+ let Constraints = "$src0 = $dst" in
+ def rmbk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src0, KRC:$mask, RC:$src1, x86scalar_mop:$src2),
+ !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+ ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}",
+ BrdcstStr, "}"),
+ [(set RC:$dst, (OpVT (vselect KRC:$mask,
+ (OpNode (OpVT RC:$src1),
+ (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))),
+ RC:$src0)))],
+ itins.rm>, EVEX_4V, EVEX_B, EVEX_K;
+ def rmbkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2),
+ !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+ ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}",
+ BrdcstStr, "}"),
+ [(set RC:$dst, (OpVT (vselect KRC:$mask,
+ (OpNode (OpVT RC:$src1),
+ (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))),
+ (OpVT immAllZerosV))))],
+ itins.rm>, EVEX_4V, EVEX_B, EVEX_KZ;
+ }
+ }
+}
+
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, ValueType DstVT,
+ ValueType SrcVT, RegisterClass KRC, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ PatFrag scalar_mfrag, X86MemOperand x86scalar_mop,
+ string BrdcstStr, OpndItins itins, bit IsCommutable = 0> {
let isCommutable = IsCommutable in
- def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ {
+ def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, EVEX_4V, VEX_W;
- def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, EVEX_4V, VEX_W;
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V;
+ def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ [], itins.rr>, EVEX_4V, EVEX_K;
+ def rrkz : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst {${mask}} {z}" ,
+ "|$dst {${mask}} {z}, $src1, $src2}"),
+ [], itins.rr>, EVEX_4V, EVEX_KZ;
+ }
+ let mayLoad = 1 in {
+ def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V;
+ def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ [], itins.rm>, EVEX_4V, EVEX_K;
+ def rmkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
+ [], itins.rm>, EVEX_4V, EVEX_KZ;
+ def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86scalar_mop:$src2),
+ !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+ ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+ [], itins.rm>, EVEX_4V, EVEX_B;
+ def rmbk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2),
+ !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+ ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}",
+ BrdcstStr, "}"),
+ [], itins.rm>, EVEX_4V, EVEX_B, EVEX_K;
+ def rmbkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2),
+ !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+ ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}",
+ BrdcstStr, "}"),
+ [], itins.rm>, EVEX_4V, EVEX_B, EVEX_KZ;
+ }
}
-defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VR512, memopv16i32,
- i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VK16WM, VR512,
+ memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+ SSE_INTALU_ITINS_P, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VR512, memopv16i32,
- i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 0>,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VK16WM, VR512,
+ memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+ SSE_INTALU_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VR512, memopv16i32,
- i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
- T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VK16WM, VR512,
+ memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+ SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VR512, memopv8i64,
- i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 1>,
- EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W;
+defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VK8WM, VR512,
+ memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+ SSE_INTALU_ITINS_P, 1>, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W;
-defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VR512, memopv8i64,
- i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
- EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VK8WM, VR512,
+ memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+ SSE_INTALU_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32,
- VR512, memopv8i64, i512mem, SSE_INTALU_ITINS_P, 1>, T8,
- EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, VK8WM, VR512,
+ memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+ SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512,
+ EVEX_CD8<64, CD8VF>, VEX_W;
-defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32,
- VR512, memopv8i64, i512mem, SSE_INTMUL_ITINS_P, 1>, EVEX_V512,
- EVEX_CD8<64, CD8VF>;
+defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32, VK8WM, VR512,
+ memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+ SSE_INTMUL_ITINS_P, 1>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))),
(VPMULUDQZrr VR512:$src1, VR512:$src2)>;
-defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VR512, memopv16i32,
- i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
- T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VR512, memopv8i64,
- i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
- T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VR512, memopv16i32,
- i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VR512, memopv8i64,
- i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
- T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-defm VPMINUDZ : avx512_binop_rm<0x3B, "vpminud", X86umin, v16i32, VR512, memopv16i32,
- i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
- T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPMINUQZ : avx512_binop_rm<0x3B, "vpminuq", X86umin, v8i64, VR512, memopv8i64,
- i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
- T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-defm VPMINSDZ : avx512_binop_rm<0x39, "vpminsd", X86smin, v16i32, VR512, memopv16i32,
- i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
- T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VR512, memopv8i64,
- i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
- T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
+def : Pat<(v8i64 (int_x86_avx512_mask_pmulu_dq_512 (v16i32 VR512:$src1),
+ (v16i32 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+ (VPMULUDQZrr VR512:$src1, VR512:$src2)>;
+def : Pat<(v8i64 (int_x86_avx512_mask_pmul_dq_512 (v16i32 VR512:$src1),
+ (v16i32 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+ (VPMULDQZrr VR512:$src1, VR512:$src2)>;
+
+defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VK16WM, VR512,
+ memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+ SSE_INTALU_ITINS_P, 1>,
+ T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VK8WM, VR512,
+ memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+ SSE_INTALU_ITINS_P, 0>,
+ T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VK16WM, VR512,
+ memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+ SSE_INTALU_ITINS_P, 1>,
+ T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VK8WM, VR512,
+ memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+ SSE_INTALU_ITINS_P, 0>,
+ T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMINUDZ : avx512_binop_rm<0x3B, "vpminud", X86umin, v16i32, VK16WM, VR512,
+ memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+ SSE_INTALU_ITINS_P, 1>,
+ T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMINUQZ : avx512_binop_rm<0x3B, "vpminuq", X86umin, v8i64, VK8WM, VR512,
+ memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+ SSE_INTALU_ITINS_P, 0>,
+ T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMINSDZ : avx512_binop_rm<0x39, "vpminsd", X86smin, v16i32, VK16WM, VR512,
+ memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+ SSE_INTALU_ITINS_P, 1>,
+ T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VK8WM, VR512,
+ memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+ SSE_INTALU_ITINS_P, 0>,
+ T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+def : Pat <(v16i32 (int_x86_avx512_mask_pmaxs_d_512 (v16i32 VR512:$src1),
+ (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
+ (VPMAXSDZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v16i32 (int_x86_avx512_mask_pmaxu_d_512 (v16i32 VR512:$src1),
+ (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
+ (VPMAXUDZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v8i64 (int_x86_avx512_mask_pmaxs_q_512 (v8i64 VR512:$src1),
+ (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+ (VPMAXSQZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v8i64 (int_x86_avx512_mask_pmaxu_q_512 (v8i64 VR512:$src1),
+ (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+ (VPMAXUQZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v16i32 (int_x86_avx512_mask_pmins_d_512 (v16i32 VR512:$src1),
+ (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
+ (VPMINSDZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v16i32 (int_x86_avx512_mask_pminu_d_512 (v16i32 VR512:$src1),
+ (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
+ (VPMINUDZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v8i64 (int_x86_avx512_mask_pmins_q_512 (v8i64 VR512:$src1),
+ (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+ (VPMINSQZrr VR512:$src1, VR512:$src2)>;
+def : Pat <(v8i64 (int_x86_avx512_mask_pminu_q_512 (v8i64 VR512:$src1),
+ (v8i64 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+ (VPMINUQZrr VR512:$src1, VR512:$src2)>;
//===----------------------------------------------------------------------===//
// AVX-512 - Unpack Instructions
//===----------------------------------------------------------------------===//
@@ -1684,28 +2022,28 @@ multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt,
defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, memopv8f64,
VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, memopv8f64,
VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, OpSize, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, memopv8f64,
VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, memopv8f64,
VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, OpSize, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop> {
def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))],
IIC_SSE_UNPCK>, EVEX_4V;
def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1),
(bitconvert (memop_frag addr:$src2)))))],
IIC_SSE_UNPCK>, EVEX_4V;
@@ -1732,29 +2070,29 @@ multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
EVEX;
def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst),
(ins x86memop:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(OpVT (OpNode (mem_frag addr:$src1),
(i8 imm:$src2))))]>, EVEX;
}
defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32,
- i512mem, v16i32>, OpSize, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
let ExeDomain = SSEPackedSingle in
defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp,
- memopv16f32, i512mem, v16f32>, OpSize, TA, EVEX_V512,
+ memopv16f32, i512mem, v16f32>, TAPD, EVEX_V512,
EVEX_CD8<32, CD8VF>;
let ExeDomain = SSEPackedDouble in
defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp,
- memopv8f64, i512mem, v8f64>, OpSize, TA, EVEX_V512,
+ memopv8f64, i512mem, v8f64>, TAPD, EVEX_V512,
VEX_W, EVEX_CD8<32, CD8VF>;
def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
@@ -1766,30 +2104,30 @@ def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
// AVX-512 Logical Instructions
//===----------------------------------------------------------------------===//
-defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VR512, memopv16i32,
+defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VK16WM, VR512, memopv16i32,
i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VR512, memopv8i64,
+defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VK8WM, VR512, memopv8i64,
i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPORDZ : avx512_binop_rm<0xEB, "vpord", or, v16i32, VR512, memopv16i32,
+defm VPORDZ : avx512_binop_rm<0xEB, "vpord", or, v16i32, VK16WM, VR512, memopv16i32,
i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPORQZ : avx512_binop_rm<0xEB, "vporq", or, v8i64, VR512, memopv8i64,
+defm VPORQZ : avx512_binop_rm<0xEB, "vporq", or, v8i64, VK8WM, VR512, memopv8i64,
i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VR512, memopv16i32,
+defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VK16WM, VR512, memopv16i32,
i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VR512, memopv8i64,
+defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VK8WM, VR512, memopv8i64,
i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VR512,
+defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VK16WM, VR512,
memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VR512, memopv8i64,
- i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 0>,
- EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VK8WM, VR512,
+ memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+ SSE_BIT_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 FP arithmetic
@@ -1797,10 +2135,10 @@ defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VR512, memopv8
multiclass avx512_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
SizeItins itins> {
- defm SSZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss{z}"), OpNode, FR32X,
+ defm SSZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode, FR32X,
f32mem, itins.s, 0>, XS, EVEX_4V, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
- defm SDZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd{z}"), OpNode, FR64X,
+ defm SDZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), OpNode, FR64X,
f64mem, itins.d, 0>, XD, VEX_W, EVEX_4V, VEX_LIG,
EVEX_CD8<64, CD8VT1>;
}
@@ -1817,82 +2155,138 @@ defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>;
}
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass KRC,
RegisterClass RC, ValueType vt,
X86MemOperand x86memop, PatFrag mem_frag,
X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
string BrdcstStr,
Domain d, OpndItins itins, bit commutable> {
- let isCommutable = commutable in
+ let isCommutable = commutable in {
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
- EVEX_4V, TB;
+ EVEX_4V;
+
+ def rrk: PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, $dst {${mask}} |$dst {${mask}}, $src1, $src2}"),
+ [], itins.rr, d>, EVEX_4V, EVEX_K;
+
+ def rrkz: PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
+ [], itins.rr, d>, EVEX_4V, EVEX_KZ;
+ }
+
let mayLoad = 1 in {
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
- itins.rm, d>, EVEX_4V, TB;
+ itins.rm, d>, EVEX_4V;
+
def rmb : PI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86scalar_mop:$src2),
- !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
- ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+ !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
+ ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
[(set RC:$dst, (OpNode RC:$src1,
(vt (X86VBroadcast (scalar_mfrag addr:$src2)))))],
- itins.rm, d>, EVEX_4V, EVEX_B, TB;
- }
+ itins.rm, d>, EVEX_4V, EVEX_B;
+
+ def rmk : PI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ [], itins.rm, d>, EVEX_4V, EVEX_K;
+
+ def rmkz : PI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
+ [], itins.rm, d>, EVEX_4V, EVEX_KZ;
+
+ def rmbk : PI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), !strconcat(OpcodeStr,
+ " \t{${src2}", BrdcstStr,
+ ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}", BrdcstStr, "}"),
+ [], itins.rm, d>, EVEX_4V, EVEX_B, EVEX_K;
+
+ def rmbkz : PI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), !strconcat(OpcodeStr,
+ " \t{${src2}", BrdcstStr,
+ ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}",
+ BrdcstStr, "}"),
+ [], itins.rm, d>, EVEX_4V, EVEX_B, EVEX_KZ;
+ }
}
-defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VR512, v16f32, f512mem,
+defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VK16WM, VR512, v16f32, f512mem,
memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
- SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
-defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VR512, v8f64, f512mem,
+defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VK8WM, VR512, v8f64, f512mem,
memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
SSE_ALU_ITINS_P.d, 1>,
- EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+ EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VR512, v16f32, f512mem,
+defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VK16WM, VR512, v16f32, f512mem,
memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
- SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VR512, v8f64, f512mem,
+ SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VK8WM, VR512, v8f64, f512mem,
memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
SSE_ALU_ITINS_P.d, 1>,
- EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+ EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VR512, v16f32, f512mem,
+defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VK16WM, VR512, v16f32, f512mem,
memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
SSE_ALU_ITINS_P.s, 1>,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VR512, v16f32, f512mem,
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VK16WM, VR512, v16f32, f512mem,
memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
SSE_ALU_ITINS_P.s, 1>,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
+ EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
-defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VR512, v8f64, f512mem,
+defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VK8WM, VR512, v8f64, f512mem,
memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
SSE_ALU_ITINS_P.d, 1>,
- EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VR512, v8f64, f512mem,
+ EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VK8WM, VR512, v8f64, f512mem,
memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
SSE_ALU_ITINS_P.d, 1>,
- EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+ EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VR512, v16f32, f512mem,
+defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VK16WM, VR512, v16f32, f512mem,
memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
- SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VR512, v16f32, f512mem,
+ SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VK16WM, VR512, v16f32, f512mem,
memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
- SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
-defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VR512, v8f64, f512mem,
+defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VK8WM, VR512, v8f64, f512mem,
memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
SSE_ALU_ITINS_P.d, 0>,
- EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VR512, v8f64, f512mem,
+ EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VK8WM, VR512, v8f64, f512mem,
memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
SSE_ALU_ITINS_P.d, 0>,
- EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
-
+ EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1),
+ (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)),
+ (i16 -1), FROUND_CURRENT)),
+ (VMAXPSZrr VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v8f64 (int_x86_avx512_mask_max_pd_512 (v8f64 VR512:$src1),
+ (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)),
+ (i8 -1), FROUND_CURRENT)),
+ (VMAXPDZrr VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_min_ps_512 (v16f32 VR512:$src1),
+ (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)),
+ (i16 -1), FROUND_CURRENT)),
+ (VMINPSZrr VR512:$src1, VR512:$src2)>;
+
+def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 VR512:$src1),
+ (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)),
+ (i8 -1), FROUND_CURRENT)),
+ (VMINPDZrr VR512:$src1, VR512:$src2)>;
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
//===----------------------------------------------------------------------===//
@@ -1900,24 +2294,41 @@ defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VR512, v8f64, f512mem,
multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC,
RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
SDNode OpNode, ValueType vt> {
- def rr : AVX5128I<opc, MRMSrcReg,
+ def rr : AVX512PI<opc, MRMSrcReg,
(outs KRC:$dst), (ins RC:$src1, RC:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))]>, EVEX_4V;
- def rm : AVX5128I<opc, MRMSrcMem,
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))],
+ SSEPackedInt>, EVEX_4V;
+ def rm : AVX512PI<opc, MRMSrcMem,
(outs KRC:$dst), (ins RC:$src1, x86memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set KRC:$dst, (OpNode (vt RC:$src1),
- (bitconvert (memop_frag addr:$src2))))]>, EVEX_4V;
+ (bitconvert (memop_frag addr:$src2))))], SSEPackedInt>, EVEX_4V;
}
defm VPTESTMDZ : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem,
- memopv16i32, X86testm, v16i32>, EVEX_V512,
+ memopv16i32, X86testm, v16i32>, T8PD, EVEX_V512,
EVEX_CD8<32, CD8VF>;
defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem,
- memopv8i64, X86testm, v8i64>, EVEX_V512, VEX_W,
+ memopv8i64, X86testm, v8i64>, T8PD, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VF>;
+let Predicates = [HasCDI] in {
+defm VPTESTNMDZ : avx512_vptest<0x27, "vptestnmd", VK16, VR512, f512mem,
+ memopv16i32, X86testnm, v16i32>, T8XS, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPTESTNMQZ : avx512_vptest<0x27, "vptestnmq", VK8, VR512, f512mem,
+ memopv8i64, X86testnm, v8i64>, T8XS, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+}
+
+def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1),
+ (v16i32 VR512:$src2), (i16 -1))),
+ (COPY_TO_REGCLASS (VPTESTMDZrr VR512:$src1, VR512:$src2), GR16)>;
+
+def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1),
+ (v8i64 VR512:$src2), (i8 -1))),
+ (COPY_TO_REGCLASS (VPTESTMQZrr VR512:$src1, VR512:$src2), GR8)>;
//===----------------------------------------------------------------------===//
// AVX-512 Shift instructions
//===----------------------------------------------------------------------===//
@@ -1927,23 +2338,23 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
RegisterClass KRC> {
def ri : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
(ins RC:$src1, i8imm:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (vt (OpNode RC:$src1, (i8 imm:$src2))))],
SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
def rik : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
(ins KRC:$mask, RC:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
[], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K;
def mi: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
(ins x86memop:$src1, i8imm:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode (mem_frag addr:$src1),
(i8 imm:$src2)))], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
def mik: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
(ins KRC:$mask, x86memop:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
[], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K;
}
@@ -1953,24 +2364,24 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
// src2 is always 128-bit
def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, VR128X:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (vt (OpNode RC:$src1, (SrcVT VR128X:$src2))))],
SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
(ins KRC:$mask, RC:$src1, VR128X:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
[], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K;
def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, i128mem:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (vt (OpNode RC:$src1,
(bc_frag (memopv2i64 addr:$src2)))))],
SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
(ins KRC:$mask, RC:$src1, i128mem:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
[], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K;
}
@@ -2024,13 +2435,13 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86MemOperand x86memop, PatFrag mem_frag> {
def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(vt (OpNode RC:$src1, (vt RC:$src2))))]>,
EVEX_4V;
def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(vt (OpNode RC:$src1, (mem_frag addr:$src2))))]>,
EVEX_4V;
@@ -2062,10 +2473,10 @@ defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64,
multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT,
X86MemOperand x86memop, PatFrag memop_frag> {
def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[(set RC:$dst,
(VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
}
@@ -2082,11 +2493,11 @@ multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
ValueType vt, RegisterClass RC, PatFrag mem_frag,
X86MemOperand x86memop> {
def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX;
let mayLoad = 1 in
def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX;
}
@@ -2109,12 +2520,12 @@ def : Pat<(v16i32 (X86Movsldup (memopv16i32 addr:$src))),
//===----------------------------------------------------------------------===//
def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
- "vmovlhps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
IIC_SSE_MOV_LH>, EVEX_4V;
def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
- "vmovhlps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
IIC_SSE_MOV_LH>, EVEX_4V;
@@ -2140,18 +2551,18 @@ multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr,
string BrdcstStr, SDNode OpNode, ValueType OpVT> {
def r: AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
- !strconcat(OpcodeStr,"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr," \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (OpVT(OpNode RC:$src1, RC:$src2, RC:$src3)))]>;
let mayLoad = 1 in
def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr, " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
(mem_frag addr:$src3))))]>;
def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86scalar_mop:$src3),
- !strconcat(OpcodeStr, "\t{${src3}", BrdcstStr,
+ !strconcat(OpcodeStr, " \t{${src3}", BrdcstStr,
", $src2, $dst|$dst, $src2, ${src3}", BrdcstStr, "}"),
[(set RC:$dst, (OpNode RC:$src1, RC:$src2,
(OpVT (X86VBroadcast (scalar_mfrag addr:$src3)))))]>, EVEX_B;
@@ -2219,11 +2630,11 @@ multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr,
let mayLoad = 1 in
def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src3, x86memop:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $src3, $dst|$dst, $src3, $src2}"),
+ !strconcat(OpcodeStr, " \t{$src2, $src3, $dst|$dst, $src3, $src2}"),
[(set RC:$dst, (OpVT (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3)))]>;
def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src3, x86scalar_mop:$src2),
- !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+ !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
", $src3, $dst|$dst, $src3, ${src2}", BrdcstStr, "}"),
[(set RC:$dst, (OpNode RC:$src1,
(OpVT (X86VBroadcast (scalar_mfrag addr:$src2))), RC:$src3))]>, EVEX_B;
@@ -2294,14 +2705,14 @@ multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
def r : AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
let mayLoad = 1 in
def m : AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, f128mem:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpVT (OpNode RC:$src2, RC:$src1,
(mem_frag addr:$src3))))]>;
@@ -2309,21 +2720,21 @@ multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
} // Constraints = "$src1 = $dst"
-defm VFMADDSSZ : avx512_fma3s_rm<0xA9, "vfmadd213ss{z}", X86Fmadd, FR32X,
+defm VFMADDSSZ : avx512_fma3s_rm<0xA9, "vfmadd213ss", X86Fmadd, FR32X,
f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
-defm VFMADDSDZ : avx512_fma3s_rm<0xA9, "vfmadd213sd{z}", X86Fmadd, FR64X,
+defm VFMADDSDZ : avx512_fma3s_rm<0xA9, "vfmadd213sd", X86Fmadd, FR64X,
f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VFMSUBSSZ : avx512_fma3s_rm<0xAB, "vfmsub213ss{z}", X86Fmsub, FR32X,
+defm VFMSUBSSZ : avx512_fma3s_rm<0xAB, "vfmsub213ss", X86Fmsub, FR32X,
f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
-defm VFMSUBSDZ : avx512_fma3s_rm<0xAB, "vfmsub213sd{z}", X86Fmsub, FR64X,
+defm VFMSUBSDZ : avx512_fma3s_rm<0xAB, "vfmsub213sd", X86Fmsub, FR64X,
f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VFNMADDSSZ : avx512_fma3s_rm<0xAD, "vfnmadd213ss{z}", X86Fnmadd, FR32X,
+defm VFNMADDSSZ : avx512_fma3s_rm<0xAD, "vfnmadd213ss", X86Fnmadd, FR32X,
f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
-defm VFNMADDSDZ : avx512_fma3s_rm<0xAD, "vfnmadd213sd{z}", X86Fnmadd, FR64X,
+defm VFNMADDSDZ : avx512_fma3s_rm<0xAD, "vfnmadd213sd", X86Fnmadd, FR64X,
f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VFNMSUBSSZ : avx512_fma3s_rm<0xAF, "vfnmsub213ss{z}", X86Fnmsub, FR32X,
+defm VFNMSUBSSZ : avx512_fma3s_rm<0xAF, "vfnmsub213ss", X86Fnmsub, FR32X,
f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
-defm VFNMSUBSDZ : avx512_fma3s_rm<0xAF, "vfnmsub213sd{z}", X86Fnmsub, FR64X,
+defm VFNMSUBSDZ : avx512_fma3s_rm<0xAF, "vfnmsub213sd", X86Fnmsub, FR64X,
f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
//===----------------------------------------------------------------------===//
@@ -2332,25 +2743,25 @@ defm VFNMSUBSDZ : avx512_fma3s_rm<0xAF, "vfnmsub213sd{z}", X86Fnmsub, FR64X,
multiclass avx512_vcvtsi<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
X86MemOperand x86memop, string asm> {
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ !strconcat(asm," \t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
EVEX_4V;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ !strconcat(asm," \t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
EVEX_4V;
-} // neverHasSideEffects = 1
+} // hasSideEffects = 0
}
let Predicates = [HasAVX512] in {
-defm VCVTSI2SSZ : avx512_vcvtsi<0x2A, GR32, FR32X, i32mem, "cvtsi2ss{l}{z}">,
+defm VCVTSI2SSZ : avx512_vcvtsi<0x2A, GR32, FR32X, i32mem, "cvtsi2ss{l}">,
XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SSZ : avx512_vcvtsi<0x2A, GR64, FR32X, i64mem, "cvtsi2ss{q}{z}">,
+defm VCVTSI642SSZ : avx512_vcvtsi<0x2A, GR64, FR32X, i64mem, "cvtsi2ss{q}">,
XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
-defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, GR32, FR64X, i32mem, "cvtsi2sd{l}{z}">,
+defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, GR32, FR64X, i32mem, "cvtsi2sd{l}">,
XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SDZ : avx512_vcvtsi<0x2A, GR64, FR64X, i64mem, "cvtsi2sd{q}{z}">,
+defm VCVTSI642SDZ : avx512_vcvtsi<0x2A, GR64, FR64X, i64mem, "cvtsi2sd{q}">,
XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
@@ -2371,13 +2782,13 @@ def : Pat<(f64 (sint_to_fp GR32:$src)),
def : Pat<(f64 (sint_to_fp GR64:$src)),
(VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
-defm VCVTUSI2SSZ : avx512_vcvtsi<0x7B, GR32, FR32X, i32mem, "cvtusi2ss{l}{z}">,
+defm VCVTUSI2SSZ : avx512_vcvtsi<0x7B, GR32, FR32X, i32mem, "cvtusi2ss{l}">,
XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SSZ : avx512_vcvtsi<0x7B, GR64, FR32X, i64mem, "cvtusi2ss{q}{z}">,
+defm VCVTUSI642SSZ : avx512_vcvtsi<0x7B, GR64, FR32X, i64mem, "cvtusi2ss{q}">,
XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
-defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, GR32, FR64X, i32mem, "cvtusi2sd{l}{z}">,
+defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, GR32, FR64X, i32mem, "cvtusi2sd{l}">,
XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SDZ : avx512_vcvtsi<0x7B, GR64, FR64X, i64mem, "cvtusi2sd{q}{z}">,
+defm VCVTUSI642SDZ : avx512_vcvtsi<0x7B, GR64, FR64X, i64mem, "cvtusi2sd{q}">,
XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
@@ -2405,161 +2816,167 @@ def : Pat<(f64 (uint_to_fp GR64:$src)),
multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
string asm> {
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG;
+ !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG,
+ Requires<[HasAVX512]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG;
-} // neverHasSideEffects = 1
+ !strconcat(asm," \t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG,
+ Requires<[HasAVX512]>;
+} // hasSideEffects = 0
}
let Predicates = [HasAVX512] in {
// Convert float/double to signed/unsigned int 32/64
defm VCVTSS2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse_cvtss2si,
- ssmem, sse_load_f32, "cvtss2si{z}">,
+ ssmem, sse_load_f32, "cvtss2si">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse_cvtss2si64,
- ssmem, sse_load_f32, "cvtss2si{z}">,
+ ssmem, sse_load_f32, "cvtss2si">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtss2usi,
- ssmem, sse_load_f32, "cvtss2usi{z}">,
+ ssmem, sse_load_f32, "cvtss2usi">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64,
int_x86_avx512_cvtss2usi64, ssmem,
- sse_load_f32, "cvtss2usi{z}">, XS, VEX_W,
+ sse_load_f32, "cvtss2usi">, XS, VEX_W,
EVEX_CD8<32, CD8VT1>;
defm VCVTSD2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si,
- sdmem, sse_load_f64, "cvtsd2si{z}">,
+ sdmem, sse_load_f64, "cvtsd2si">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse2_cvtsd2si64,
- sdmem, sse_load_f64, "cvtsd2si{z}">,
+ sdmem, sse_load_f64, "cvtsd2si">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtsd2usi,
- sdmem, sse_load_f64, "cvtsd2usi{z}">,
+ sdmem, sse_load_f64, "cvtsd2usi">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64,
int_x86_avx512_cvtsd2usi64, sdmem,
- sse_load_f64, "cvtsd2usi{z}">, XD, VEX_W,
+ sse_load_f64, "cvtsd2usi">, XD, VEX_W,
EVEX_CD8<64, CD8VT1>;
-defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
- int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}{z}",
- SSE_CVT_Scalar, 0>, XS, EVEX_4V;
-defm Int_VCVTSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
- int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}{z}",
- SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
-defm Int_VCVTSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
- int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}{z}",
- SSE_CVT_Scalar, 0>, XD, EVEX_4V;
-defm Int_VCVTSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
- int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}{z}",
- SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
-
-defm Int_VCVTUSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
- int_x86_avx512_cvtusi2ss, i32mem, loadi32, "cvtusi2ss{l}{z}",
- SSE_CVT_Scalar, 0>, XS, EVEX_4V;
-defm Int_VCVTUSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
- int_x86_avx512_cvtusi642ss, i64mem, loadi64, "cvtusi2ss{q}{z}",
- SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
-defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
- int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}{z}",
- SSE_CVT_Scalar, 0>, XD, EVEX_4V;
-defm Int_VCVTUSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
- int_x86_avx512_cvtusi642sd, i64mem, loadi64, "cvtusi2sd{q}{z}",
- SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
+let isCodeGenOnly = 1 in {
+ defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+ int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
+ SSE_CVT_Scalar, 0>, XS, EVEX_4V;
+ defm Int_VCVTSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+ int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
+ SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
+ defm Int_VCVTSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
+ SSE_CVT_Scalar, 0>, XD, EVEX_4V;
+ defm Int_VCVTSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
+ SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
+
+ defm Int_VCVTUSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+ int_x86_avx512_cvtusi2ss, i32mem, loadi32, "cvtusi2ss{l}",
+ SSE_CVT_Scalar, 0>, XS, EVEX_4V;
+ defm Int_VCVTUSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+ int_x86_avx512_cvtusi642ss, i64mem, loadi64, "cvtusi2ss{q}",
+ SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
+ defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+ int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}",
+ SSE_CVT_Scalar, 0>, XD, EVEX_4V;
+ defm Int_VCVTUSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+ int_x86_avx512_cvtusi642sd, i64mem, loadi64, "cvtusi2sd{q}",
+ SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
+} // isCodeGenOnly = 1
// Convert float/double to signed/unsigned int 32/64 with truncation
-defm Int_VCVTTSS2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse_cvttss2si,
- ssmem, sse_load_f32, "cvttss2si{z}">,
- XS, EVEX_CD8<32, CD8VT1>;
-defm Int_VCVTTSS2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
- int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
- "cvttss2si{z}">, XS, VEX_W,
- EVEX_CD8<32, CD8VT1>;
-defm Int_VCVTTSD2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse2_cvttsd2si,
- sdmem, sse_load_f64, "cvttsd2si{z}">, XD,
- EVEX_CD8<64, CD8VT1>;
-defm Int_VCVTTSD2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
- int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
- "cvttsd2si{z}">, XD, VEX_W,
- EVEX_CD8<64, CD8VT1>;
-defm Int_VCVTTSS2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
- int_x86_avx512_cvttss2usi, ssmem, sse_load_f32,
- "cvttss2si{z}">, XS, EVEX_CD8<32, CD8VT1>;
-defm Int_VCVTTSS2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
- int_x86_avx512_cvttss2usi64, ssmem,
- sse_load_f32, "cvttss2usi{z}">, XS, VEX_W,
- EVEX_CD8<32, CD8VT1>;
-defm Int_VCVTTSD2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
- int_x86_avx512_cvttsd2usi,
- sdmem, sse_load_f64, "cvttsd2usi{z}">, XD,
- EVEX_CD8<64, CD8VT1>;
-defm Int_VCVTTSD2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
- int_x86_avx512_cvttsd2usi64, sdmem,
- sse_load_f64, "cvttsd2usi{z}">, XD, VEX_W,
- EVEX_CD8<64, CD8VT1>;
-}
+let isCodeGenOnly = 1 in {
+ defm Int_VCVTTSS2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse_cvttss2si,
+ ssmem, sse_load_f32, "cvttss2si">,
+ XS, EVEX_CD8<32, CD8VT1>;
+ defm Int_VCVTTSS2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
+ int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+ "cvttss2si">, XS, VEX_W,
+ EVEX_CD8<32, CD8VT1>;
+ defm Int_VCVTTSD2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse2_cvttsd2si,
+ sdmem, sse_load_f64, "cvttsd2si">, XD,
+ EVEX_CD8<64, CD8VT1>;
+ defm Int_VCVTTSD2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
+ int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+ "cvttsd2si">, XD, VEX_W,
+ EVEX_CD8<64, CD8VT1>;
+ defm Int_VCVTTSS2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
+ int_x86_avx512_cvttss2usi, ssmem, sse_load_f32,
+ "cvttss2usi">, XS, EVEX_CD8<32, CD8VT1>;
+ defm Int_VCVTTSS2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
+ int_x86_avx512_cvttss2usi64, ssmem,
+ sse_load_f32, "cvttss2usi">, XS, VEX_W,
+ EVEX_CD8<32, CD8VT1>;
+ defm Int_VCVTTSD2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
+ int_x86_avx512_cvttsd2usi,
+ sdmem, sse_load_f64, "cvttsd2usi">, XD,
+ EVEX_CD8<64, CD8VT1>;
+ defm Int_VCVTTSD2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
+ int_x86_avx512_cvttsd2usi64, sdmem,
+ sse_load_f64, "cvttsd2usi">, XD, VEX_W,
+ EVEX_CD8<64, CD8VT1>;
+} // isCodeGenOnly = 1
multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
string asm> {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ !strconcat(asm," \t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX;
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ !strconcat(asm," \t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX;
}
defm VCVTTSS2SIZ : avx512_cvt_s<0x2C, FR32X, GR32, fp_to_sint, f32mem,
- loadf32, "cvttss2si{z}">, XS,
+ loadf32, "cvttss2si">, XS,
EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2USIZ : avx512_cvt_s<0x78, FR32X, GR32, fp_to_uint, f32mem,
- loadf32, "cvttss2usi{z}">, XS,
+ loadf32, "cvttss2usi">, XS,
EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2SI64Z : avx512_cvt_s<0x2C, FR32X, GR64, fp_to_sint, f32mem,
- loadf32, "cvttss2si{z}">, XS, VEX_W,
+ loadf32, "cvttss2si">, XS, VEX_W,
EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2USI64Z : avx512_cvt_s<0x78, FR32X, GR64, fp_to_uint, f32mem,
- loadf32, "cvttss2usi{z}">, XS, VEX_W,
+ loadf32, "cvttss2usi">, XS, VEX_W,
EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2SIZ : avx512_cvt_s<0x2C, FR64X, GR32, fp_to_sint, f64mem,
- loadf64, "cvttsd2si{z}">, XD,
+ loadf64, "cvttsd2si">, XD,
EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2USIZ : avx512_cvt_s<0x78, FR64X, GR32, fp_to_uint, f64mem,
- loadf64, "cvttsd2usi{z}">, XD,
+ loadf64, "cvttsd2usi">, XD,
EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2SI64Z : avx512_cvt_s<0x2C, FR64X, GR64, fp_to_sint, f64mem,
- loadf64, "cvttsd2si{z}">, XD, VEX_W,
+ loadf64, "cvttsd2si">, XD, VEX_W,
EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2USI64Z : avx512_cvt_s<0x78, FR64X, GR64, fp_to_uint, f64mem,
- loadf64, "cvttsd2usi{z}">, XD, VEX_W,
+ loadf64, "cvttsd2usi">, XD, VEX_W,
EVEX_CD8<64, CD8VT1>;
+} // HasAVX512
//===----------------------------------------------------------------------===//
// AVX-512 Convert form float to double and back
//===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
def VCVTSS2SDZrr : AVX512XSI<0x5A, MRMSrcReg, (outs FR64X:$dst),
(ins FR32X:$src1, FR32X:$src2),
- "vcvtss2sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSS2SDZrm : AVX512XSI<0x5A, MRMSrcMem, (outs FR64X:$dst),
(ins FR32X:$src1, f32mem:$src2),
- "vcvtss2sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
EVEX_CD8<32, CD8VT1>;
// Convert scalar double to scalar single
def VCVTSD2SSZrr : AVX512XDI<0x5A, MRMSrcReg, (outs FR32X:$dst),
(ins FR64X:$src1, FR64X:$src2),
- "vcvtsd2ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, EVEX_4V, VEX_LIG, VEX_W, Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSD2SSZrm : AVX512XDI<0x5A, MRMSrcMem, (outs FR32X:$dst),
(ins FR64X:$src1, f64mem:$src2),
- "vcvtsd2ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, EVEX_4V, VEX_LIG, VEX_W,
Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_CD8<64, CD8VT1>;
}
@@ -2580,41 +2997,71 @@ def : Pat<(extloadf32 addr:$src),
def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>,
Requires<[HasAVX512]>;
-multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC,
+multiclass avx512_vcvt_fp_with_rc<bits<8> opc, string asm, RegisterClass SrcRC,
RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag,
X86MemOperand x86memop, ValueType OpVT, ValueType InVT,
Domain d> {
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ !strconcat(asm," \t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst,
(OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX;
+ def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc),
+ !strconcat(asm," \t{$rc, $src, $dst|$dst, $src, $rc}"),
+ [], d>, EVEX, EVEX_B, EVEX_RC;
let mayLoad = 1 in
def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ !strconcat(asm," \t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst,
(OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX;
-} // neverHasSideEffects = 1
+} // hasSideEffects = 0
}
-defm VCVTPD2PSZ : avx512_vcvt_fp<0x5A, "vcvtpd2ps", VR512, VR256X, fround,
+multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC,
+ RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag,
+ X86MemOperand x86memop, ValueType OpVT, ValueType InVT,
+ Domain d> {
+let hasSideEffects = 0 in {
+ def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst,
+ (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX;
+ let mayLoad = 1 in
+ def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+ !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst,
+ (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX;
+} // hasSideEffects = 0
+}
+
+defm VCVTPD2PSZ : avx512_vcvt_fp_with_rc<0x5A, "vcvtpd2ps", VR512, VR256X, fround,
memopv8f64, f512mem, v8f32, v8f64,
- SSEPackedSingle>, EVEX_V512, VEX_W, OpSize,
+ SSEPackedSingle>, EVEX_V512, VEX_W, PD,
EVEX_CD8<64, CD8VF>;
defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend,
memopv4f64, f256mem, v8f64, v8f32,
- SSEPackedDouble>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+ SSEPackedDouble>, EVEX_V512, PS,
+ EVEX_CD8<32, CD8VH>;
def : Pat<(v8f64 (extloadv8f32 addr:$src)),
(VCVTPS2PDZrm addr:$src)>;
+
+def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src),
+ (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), (i32 FROUND_CURRENT))),
+ (VCVTPD2PSZrr VR512:$src)>;
+
+def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src),
+ (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), imm:$rc)),
+ (VCVTPD2PSZrrb VR512:$src, imm:$rc)>;
//===----------------------------------------------------------------------===//
// AVX-512 Vector convert from sign integer to float/double
//===----------------------------------------------------------------------===//
-defm VCVTDQ2PSZ : avx512_vcvt_fp<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp,
+defm VCVTDQ2PSZ : avx512_vcvt_fp_with_rc<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp,
memopv8i64, i512mem, v16f32, v16i32,
- SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ SSEPackedSingle>, EVEX_V512, PS,
+ EVEX_CD8<32, CD8VF>;
defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp,
memopv4i64, i256mem, v8f64, v8i32,
@@ -2628,25 +3075,35 @@ defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint,
defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint,
memopv8f64, f512mem, v8i32, v8f64,
- SSEPackedDouble>, EVEX_V512, OpSize, VEX_W,
+ SSEPackedDouble>, EVEX_V512, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint,
memopv16f32, f512mem, v16i32, v16f32,
- SSEPackedSingle>, EVEX_V512,
+ SSEPackedSingle>, EVEX_V512, PS,
EVEX_CD8<32, CD8VF>;
+// cvttps2udq (src, 0, mask-all-ones, sae-current)
+def : Pat<(v16i32 (int_x86_avx512_mask_cvttps2udq_512 (v16f32 VR512:$src),
+ (v16i32 immAllZerosV), (i16 -1), FROUND_CURRENT)),
+ (VCVTTPS2UDQZrr VR512:$src)>;
+
defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint,
memopv8f64, f512mem, v8i32, v8f64,
- SSEPackedDouble>, EVEX_V512, VEX_W,
+ SSEPackedDouble>, EVEX_V512, PS, VEX_W,
EVEX_CD8<64, CD8VF>;
+// cvttpd2udq (src, 0, mask-all-ones, sae-current)
+def : Pat<(v8i32 (int_x86_avx512_mask_cvttpd2udq_512 (v8f64 VR512:$src),
+ (v8i32 immAllZerosV), (i8 -1), FROUND_CURRENT)),
+ (VCVTTPD2UDQZrr VR512:$src)>;
+
defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp,
memopv4i64, f256mem, v8f64, v8i32,
SSEPackedDouble>, EVEX_V512, XS,
EVEX_CD8<32, CD8VH>;
-defm VCVTUDQ2PSZ : avx512_vcvt_fp<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp,
+defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp,
memopv16i32, f512mem, v16f32, v16i32,
SSEPackedSingle>, EVEX_V512, XD,
EVEX_CD8<32, CD8VF>;
@@ -2656,22 +3113,65 @@ def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
(v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
-def : Pat<(int_x86_avx512_cvtdq2_ps_512 VR512:$src),
- (VCVTDQ2PSZrr VR512:$src)>;
-def : Pat<(int_x86_avx512_cvtdq2_ps_512 (bitconvert (memopv8i64 addr:$src))),
- (VCVTDQ2PSZrm addr:$src)>;
+def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src),
+ (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)),
+ (VCVTDQ2PSZrrb VR512:$src, imm:$rc)>;
+def : Pat<(v8f64 (int_x86_avx512_mask_cvtdq2pd_512 (v8i32 VR256X:$src),
+ (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
+ (VCVTDQ2PDZrr VR256X:$src)>;
+def : Pat<(v16f32 (int_x86_avx512_mask_cvtudq2ps_512 (v16i32 VR512:$src),
+ (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)),
+ (VCVTUDQ2PSZrrb VR512:$src, imm:$rc)>;
+def : Pat<(v8f64 (int_x86_avx512_mask_cvtudq2pd_512 (v8i32 VR256X:$src),
+ (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
+ (VCVTUDQ2PDZrr VR256X:$src)>;
+
+multiclass avx512_vcvt_fp2int<bits<8> opc, string asm, RegisterClass SrcRC,
+ RegisterClass DstRC, PatFrag mem_frag,
+ X86MemOperand x86memop, Domain d> {
+let hasSideEffects = 0 in {
+ def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+ [], d>, EVEX;
+ def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc),
+ !strconcat(asm," \t{$rc, $src, $dst|$dst, $src, $rc}"),
+ [], d>, EVEX, EVEX_B, EVEX_RC;
+ let mayLoad = 1 in
+ def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+ !strconcat(asm," \t{$src, $dst|$dst, $src}"),
+ [], d>, EVEX;
+} // hasSideEffects = 0
+}
+
+defm VCVTPS2DQZ : avx512_vcvt_fp2int<0x5B, "vcvtps2dq", VR512, VR512,
+ memopv16f32, f512mem, SSEPackedSingle>, PD,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VCVTPD2DQZ : avx512_vcvt_fp2int<0xE6, "vcvtpd2dq", VR512, VR256X,
+ memopv8f64, f512mem, SSEPackedDouble>, XD, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2dq_512 (v16f32 VR512:$src),
+ (v16i32 immAllZerosV), (i16 -1), imm:$rc)),
+ (VCVTPS2DQZrrb VR512:$src, imm:$rc)>;
+
+def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2dq_512 (v8f64 VR512:$src),
+ (v8i32 immAllZerosV), (i8 -1), imm:$rc)),
+ (VCVTPD2DQZrrb VR512:$src, imm:$rc)>;
-def VCVTPS2DQZrr : AVX512BI<0x5B, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
- "vcvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR512:$dst,
- (int_x86_avx512_cvt_ps2dq_512 VR512:$src))],
- IIC_SSE_CVT_PS_RR>, EVEX, EVEX_V512;
-def VCVTPS2DQZrm : AVX512BI<0x5B, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
- "vcvtps2dq\t{$src, $dst|$dst, $src}",
- [(set VR512:$dst,
- (int_x86_avx512_cvt_ps2dq_512 (memopv16f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VCVTPS2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtps2udq", VR512, VR512,
+ memopv16f32, f512mem, SSEPackedSingle>,
+ PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VCVTPD2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtpd2udq", VR512, VR256X,
+ memopv8f64, f512mem, SSEPackedDouble>, VEX_W,
+ PS, EVEX_V512, EVEX_CD8<64, CD8VF>;
+def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2udq_512 (v16f32 VR512:$src),
+ (v16i32 immAllZerosV), (i16 -1), imm:$rc)),
+ (VCVTPS2UDQZrrb VR512:$src, imm:$rc)>;
+
+def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2udq_512 (v8f64 VR512:$src),
+ (v8i32 immAllZerosV), (i8 -1), imm:$rc)),
+ (VCVTPD2UDQZrrb VR512:$src, imm:$rc)>;
let Predicates = [HasAVX512] in {
def : Pat<(v8f32 (fround (loadv8f64 addr:$src))),
@@ -2683,234 +3183,279 @@ let Predicates = [HasAVX512] in {
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
-multiclass avx512_f16c_ph2ps<RegisterClass destRC, RegisterClass srcRC,
- X86MemOperand x86memop, Intrinsic Int> {
+multiclass avx512_cvtph2ps<RegisterClass destRC, RegisterClass srcRC,
+ X86MemOperand x86memop> {
def rr : AVX5128I<0x13, MRMSrcReg, (outs destRC:$dst), (ins srcRC:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
- [(set destRC:$dst, (Int srcRC:$src))]>, EVEX;
- let neverHasSideEffects = 1, mayLoad = 1 in
+ []>, EVEX;
+ let hasSideEffects = 0, mayLoad = 1 in
def rm : AVX5128I<0x13, MRMSrcMem, (outs destRC:$dst), (ins x86memop:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}", []>, EVEX;
}
-multiclass avx512_f16c_ps2ph<RegisterClass destRC, RegisterClass srcRC,
- X86MemOperand x86memop, Intrinsic Int> {
+multiclass avx512_cvtps2ph<RegisterClass destRC, RegisterClass srcRC,
+ X86MemOperand x86memop> {
def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst),
(ins srcRC:$src1, i32i8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set destRC:$dst, (Int srcRC:$src1, imm:$src2))]>, EVEX;
- let neverHasSideEffects = 1, mayStore = 1 in
+ "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX;
+ let hasSideEffects = 0, mayStore = 1 in
def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
(ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX;
+ "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX;
}
-defm VCVTPH2PSZ : avx512_f16c_ph2ps<VR512, VR256X, f256mem,
- int_x86_avx512_vcvtph2ps_512>, EVEX_V512,
+defm VCVTPH2PSZ : avx512_cvtph2ps<VR512, VR256X, f256mem>, EVEX_V512,
EVEX_CD8<32, CD8VH>;
-defm VCVTPS2PHZ : avx512_f16c_ps2ph<VR256X, VR512, f256mem,
- int_x86_avx512_vcvtps2ph_512>, EVEX_V512,
+defm VCVTPS2PHZ : avx512_cvtps2ph<VR256X, VR512, f256mem>, EVEX_V512,
EVEX_CD8<32, CD8VH>;
+def : Pat<(v16i16 (int_x86_avx512_mask_vcvtps2ph_512 (v16f32 VR512:$src),
+ imm:$rc, (bc_v16i16(v8i32 immAllZerosV)), (i16 -1))),
+ (VCVTPS2PHZrr VR512:$src, imm:$rc)>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_vcvtph2ps_512 (v16i16 VR256X:$src),
+ (bc_v16f32(v16i32 immAllZerosV)), (i16 -1), (i32 FROUND_CURRENT))),
+ (VCVTPH2PSZrr VR256X:$src)>;
+
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
- "ucomiss{z}">, TB, EVEX, VEX_LIG,
+ "ucomiss">, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
- "ucomisd{z}">, TB, OpSize, EVEX,
+ "ucomisd">, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
let Pattern = []<dag> in {
defm VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, undef, v4f32, f128mem, load,
- "comiss{z}">, TB, EVEX, VEX_LIG,
+ "comiss">, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, undef, v2f64, f128mem, load,
- "comisd{z}">, TB, OpSize, EVEX,
+ "comisd">, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
- defm Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem,
- load, "ucomiss">, TB, EVEX, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
- defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem,
- load, "ucomisd">, TB, OpSize, EVEX,
- VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
-
- defm Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem,
- load, "comiss">, TB, EVEX, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
- defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem,
- load, "comisd">, TB, OpSize, EVEX,
- VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ let isCodeGenOnly = 1 in {
+ defm Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem,
+ load, "ucomiss">, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem,
+ load, "ucomisd">, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+ defm Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem,
+ load, "comiss">, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem,
+ load, "comisd">, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ }
}
-/// avx512_unop_p - AVX-512 unops in packed form.
-multiclass avx512_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode> {
- def PSZr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
- !strconcat(OpcodeStr,
- "ps\t{$src, $dst|$dst, $src}"),
- [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))]>,
- EVEX, EVEX_V512;
- def PSZm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f256mem:$src),
- !strconcat(OpcodeStr,
- "ps\t{$src, $dst|$dst, $src}"),
- [(set VR512:$dst, (OpNode (memopv16f32 addr:$src)))]>,
- EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
- def PDZr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
- !strconcat(OpcodeStr,
- "pd\t{$src, $dst|$dst, $src}"),
- [(set VR512:$dst, (v8f64 (OpNode VR512:$src)))]>,
- EVEX, EVEX_V512, VEX_W;
- def PDZm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
- !strconcat(OpcodeStr,
- "pd\t{$src, $dst|$dst, $src}"),
- [(set VR512:$dst, (OpNode (memopv16f32 addr:$src)))]>,
- EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-}
-
-/// avx512_fp_unop_p_int - AVX-512 intrinsics unops in packed forms.
-multiclass avx512_fp_unop_p_int<bits<8> opc, string OpcodeStr,
- Intrinsic V16F32Int, Intrinsic V8F64Int> {
- def PSZr_Int : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
- !strconcat(OpcodeStr,
- "ps\t{$src, $dst|$dst, $src}"),
- [(set VR512:$dst, (V16F32Int VR512:$src))]>,
- EVEX, EVEX_V512;
- def PSZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
- !strconcat(OpcodeStr,
- "ps\t{$src, $dst|$dst, $src}"),
- [(set VR512:$dst,
- (V16F32Int (memopv16f32 addr:$src)))]>, EVEX,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
- def PDZr_Int : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
- !strconcat(OpcodeStr,
- "pd\t{$src, $dst|$dst, $src}"),
- [(set VR512:$dst, (V8F64Int VR512:$src))]>,
- EVEX, EVEX_V512, VEX_W;
- def PDZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
- !strconcat(OpcodeStr,
- "pd\t{$src, $dst|$dst, $src}"),
- [(set VR512:$dst,
- (V8F64Int (memopv8f64 addr:$src)))]>,
- EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-}
-
-/// avx512_fp_unop_s - AVX-512 unops in scalar form.
-multiclass avx512_fp_unop_s<bits<8> opc, string OpcodeStr> {
+/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
+multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop> {
let hasSideEffects = 0 in {
- def SSZr : AVX5128I<opc, MRMSrcReg, (outs FR32X:$dst),
- (ins FR32X:$src1, FR32X:$src2),
+ def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, EVEX_4V;
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
let mayLoad = 1 in {
- def SSZm : AVX5128I<opc, MRMSrcMem, (outs FR32X:$dst),
- (ins FR32X:$src1, f32mem:$src2),
+ def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
- def SSZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR128X:$dst),
- (ins VR128X:$src1, ssmem:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
}
- def SDZr : AVX5128I<opc, MRMSrcReg, (outs FR64X:$dst),
- (ins FR64X:$src1, FR64X:$src2),
+}
+}
+
+defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", FR32X, f32mem>,
+ EVEX_CD8<32, CD8VT1>;
+defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", FR64X, f64mem>,
+ VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", FR32X, f32mem>,
+ EVEX_CD8<32, CD8VT1>;
+defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", FR64X, f64mem>,
+ VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : Pat <(v4f32 (int_x86_avx512_rcp14_ss (v4f32 VR128X:$src1),
+ (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))),
+ (COPY_TO_REGCLASS (VRCP14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X),
+ (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;
+
+def : Pat <(v2f64 (int_x86_avx512_rcp14_sd (v2f64 VR128X:$src1),
+ (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))),
+ (COPY_TO_REGCLASS (VRCP14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X),
+ (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
+
+def : Pat <(v4f32 (int_x86_avx512_rsqrt14_ss (v4f32 VR128X:$src1),
+ (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))),
+ (COPY_TO_REGCLASS (VRSQRT14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X),
+ (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;
+
+def : Pat <(v2f64 (int_x86_avx512_rsqrt14_sd (v2f64 VR128X:$src1),
+ (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))),
+ (COPY_TO_REGCLASS (VRSQRT14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X),
+ (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
+
+/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
+multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, X86MemOperand x86memop,
+ PatFrag mem_frag, ValueType OpVt> {
+ def r : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr,
+ " \t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (OpVt (OpNode RC:$src)))]>,
+ EVEX;
+ def m : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (OpVt (OpNode (mem_frag addr:$src))))]>,
+ EVEX;
+}
+defm VRSQRT14PSZ : avx512_fp14_p<0x4E, "vrsqrt14ps", X86frsqrt, VR512, f512mem,
+ memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VRSQRT14PDZ : avx512_fp14_p<0x4E, "vrsqrt14pd", X86frsqrt, VR512, f512mem,
+ memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VRCP14PSZ : avx512_fp14_p<0x4C, "vrcp14ps", X86frcp, VR512, f512mem,
+ memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VRCP14PDZ : avx512_fp14_p<0x4C, "vrcp14pd", X86frcp, VR512, f512mem,
+ memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src),
+ (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
+ (VRSQRT14PSZr VR512:$src)>;
+def : Pat <(v8f64 (int_x86_avx512_rsqrt14_pd_512 (v8f64 VR512:$src),
+ (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
+ (VRSQRT14PDZr VR512:$src)>;
+
+def : Pat <(v16f32 (int_x86_avx512_rcp14_ps_512 (v16f32 VR512:$src),
+ (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
+ (VRCP14PSZr VR512:$src)>;
+def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src),
+ (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
+ (VRCP14PDZr VR512:$src)>;
+
+/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
+multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop> {
+ let hasSideEffects = 0, Predicates = [HasERI] in {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
- EVEX_4V, VEX_W;
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
+ def rrb : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr,
+ " \t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"),
+ []>, EVEX_4V, EVEX_B;
let mayLoad = 1 in {
- def SDZm : AVX5128I<opc, MRMSrcMem, (outs FR64X:$dst),
- (ins FR64X:$src1, f64mem:$src2),
+ def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
- EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
- def SDZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR128X:$dst),
- (ins VR128X:$src1, sdmem:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
}
}
}
-defm VRCP14 : avx512_fp_unop_s<0x4D, "vrcp14">,
- avx512_fp_unop_p<0x4C, "vrcp14", X86frcp>,
- avx512_fp_unop_p_int<0x4C, "vrcp14",
- int_x86_avx512_rcp14_ps_512, int_x86_avx512_rcp14_pd_512>;
-
-defm VRSQRT14 : avx512_fp_unop_s<0x4F, "vrsqrt14">,
- avx512_fp_unop_p<0x4E, "vrsqrt14", X86frsqrt>,
- avx512_fp_unop_p_int<0x4E, "vrsqrt14",
- int_x86_avx512_rsqrt14_ps_512, int_x86_avx512_rsqrt14_pd_512>;
-
-def : Pat<(int_x86_avx512_rsqrt14_ss VR128X:$src),
- (COPY_TO_REGCLASS (VRSQRT14SSZr (f32 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS VR128X:$src, FR32)),
- VR128X)>;
-def : Pat<(int_x86_avx512_rsqrt14_ss sse_load_f32:$src),
- (VRSQRT14SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
-
-def : Pat<(int_x86_avx512_rcp14_ss VR128X:$src),
- (COPY_TO_REGCLASS (VRCP14SSZr (f32 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS VR128X:$src, FR32)),
- VR128X)>;
-def : Pat<(int_x86_avx512_rcp14_ss sse_load_f32:$src),
- (VRCP14SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
-
-let AddedComplexity = 20, Predicates = [HasERI] in {
-defm VRCP28 : avx512_fp_unop_s<0xCB, "vrcp28">,
- avx512_fp_unop_p<0xCA, "vrcp28", X86frcp>,
- avx512_fp_unop_p_int<0xCA, "vrcp28",
- int_x86_avx512_rcp28_ps_512, int_x86_avx512_rcp28_pd_512>;
-
-defm VRSQRT28 : avx512_fp_unop_s<0xCD, "vrsqrt28">,
- avx512_fp_unop_p<0xCC, "vrsqrt28", X86frsqrt>,
- avx512_fp_unop_p_int<0xCC, "vrsqrt28",
- int_x86_avx512_rsqrt28_ps_512, int_x86_avx512_rsqrt28_pd_512>;
-}
-
-let Predicates = [HasERI] in {
- def : Pat<(int_x86_avx512_rsqrt28_ss VR128X:$src),
- (COPY_TO_REGCLASS (VRSQRT28SSZr (f32 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS VR128X:$src, FR32)),
- VR128X)>;
- def : Pat<(int_x86_avx512_rsqrt28_ss sse_load_f32:$src),
- (VRSQRT28SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
-
- def : Pat<(int_x86_avx512_rcp28_ss VR128X:$src),
- (COPY_TO_REGCLASS (VRCP28SSZr (f32 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS VR128X:$src, FR32)),
- VR128X)>;
- def : Pat<(int_x86_avx512_rcp28_ss sse_load_f32:$src),
- (VRCP28SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+defm VRCP28SS : avx512_fp28_s<0xCB, "vrcp28ss", FR32X, f32mem>,
+ EVEX_CD8<32, CD8VT1>;
+defm VRCP28SD : avx512_fp28_s<0xCB, "vrcp28sd", FR64X, f64mem>,
+ VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VRSQRT28SS : avx512_fp28_s<0xCD, "vrsqrt28ss", FR32X, f32mem>,
+ EVEX_CD8<32, CD8VT1>;
+defm VRSQRT28SD : avx512_fp28_s<0xCD, "vrsqrt28sd", FR64X, f64mem>,
+ VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : Pat <(v4f32 (int_x86_avx512_rcp28_ss (v4f32 VR128X:$src1),
+ (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1),
+ FROUND_NO_EXC)),
+ (COPY_TO_REGCLASS (VRCP28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X),
+ (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;
+
+def : Pat <(v2f64 (int_x86_avx512_rcp28_sd (v2f64 VR128X:$src1),
+ (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1),
+ FROUND_NO_EXC)),
+ (COPY_TO_REGCLASS (VRCP28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X),
+ (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
+
+def : Pat <(v4f32 (int_x86_avx512_rsqrt28_ss (v4f32 VR128X:$src1),
+ (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1),
+ FROUND_NO_EXC)),
+ (COPY_TO_REGCLASS (VRSQRT28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X),
+ (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;
+
+def : Pat <(v2f64 (int_x86_avx512_rsqrt28_sd (v2f64 VR128X:$src1),
+ (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1),
+ FROUND_NO_EXC)),
+ (COPY_TO_REGCLASS (VRSQRT28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X),
+ (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
+
+/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
+multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr,
+ RegisterClass RC, X86MemOperand x86memop> {
+ let hasSideEffects = 0, Predicates = [HasERI] in {
+ def r : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr,
+ " \t{$src, $dst|$dst, $src}"),
+ []>, EVEX;
+ def rb : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr,
+ " \t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+ []>, EVEX, EVEX_B;
+ def m : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+ []>, EVEX;
+ }
}
+defm VRSQRT28PSZ : avx512_fp28_p<0xCC, "vrsqrt28ps", VR512, f512mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VRSQRT28PDZ : avx512_fp28_p<0xCC, "vrsqrt28pd", VR512, f512mem>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VRCP28PSZ : avx512_fp28_p<0xCA, "vrcp28ps", VR512, f512mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VRCP28PDZ : avx512_fp28_p<0xCA, "vrcp28pd", VR512, f512mem>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat <(v16f32 (int_x86_avx512_rsqrt28_ps (v16f32 VR512:$src),
+ (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)),
+ (VRSQRT28PSZrb VR512:$src)>;
+def : Pat <(v8f64 (int_x86_avx512_rsqrt28_pd (v8f64 VR512:$src),
+ (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)),
+ (VRSQRT28PDZrb VR512:$src)>;
+
+def : Pat <(v16f32 (int_x86_avx512_rcp28_ps (v16f32 VR512:$src),
+ (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)),
+ (VRCP28PSZrb VR512:$src)>;
+def : Pat <(v8f64 (int_x86_avx512_rcp28_pd (v8f64 VR512:$src),
+ (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)),
+ (VRCP28PDZrb VR512:$src)>;
+
multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
Intrinsic V16F32Int, Intrinsic V8F64Int,
OpndItins itins_s, OpndItins itins_d> {
def PSZrr :AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR512:$dst, (v16f32 (OpNode VR512:$src)))], itins_s.rr>,
EVEX, EVEX_V512;
let mayLoad = 1 in
def PSZrm : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR512:$dst,
(OpNode (v16f32 (bitconvert (memopv16f32 addr:$src)))))],
itins_s.rm>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
def PDZrr : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
[(set VR512:$dst, (v8f64 (OpNode VR512:$src)))], itins_d.rr>,
EVEX, EVEX_V512;
let mayLoad = 1 in
def PDZrm : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
[(set VR512:$dst, (OpNode
(v8f64 (bitconvert (memopv16f32 addr:$src)))))],
itins_d.rm>, EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+let isCodeGenOnly = 1 in {
def PSZr_Int : AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
!strconcat(OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
@@ -2929,7 +3474,8 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
[(set VR512:$dst, (V8F64Int (memopv8f64 addr:$src)))]>,
- EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+} // isCodeGenOnly = 1
}
multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
@@ -2938,12 +3484,13 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
def SSZr : SI<opc, MRMSrcReg, (outs FR32X:$dst),
(ins FR32X:$src1, FR32X:$src2),
!strconcat(OpcodeStr,
- "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[], itins_s.rr>, XS, EVEX_4V;
+ let isCodeGenOnly = 1 in
def SSZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
!strconcat(OpcodeStr,
- "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128X:$dst,
(F32Int VR128X:$src1, VR128X:$src2))],
itins_s.rr>, XS, EVEX_4V;
@@ -2951,12 +3498,13 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
def SSZm : SI<opc, MRMSrcMem, (outs FR32X:$dst),
(ins FR32X:$src1, f32mem:$src2),
!strconcat(OpcodeStr,
- "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ let isCodeGenOnly = 1 in
def SSZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, ssmem:$src2),
!strconcat(OpcodeStr,
- "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128X:$dst,
(F32Int VR128X:$src1, sse_load_f32:$src2))],
itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
@@ -2964,12 +3512,13 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
def SDZr : SI<opc, MRMSrcReg, (outs FR64X:$dst),
(ins FR64X:$src1, FR64X:$src2),
!strconcat(OpcodeStr,
- "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
XD, EVEX_4V, VEX_W;
+ let isCodeGenOnly = 1 in
def SDZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
!strconcat(OpcodeStr,
- "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128X:$dst,
(F64Int VR128X:$src1, VR128X:$src2))],
itins_s.rr>, XD, EVEX_4V, VEX_W;
@@ -2977,12 +3526,13 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
def SDZm : SI<opc, MRMSrcMem, (outs FR64X:$dst),
(ins FR64X:$src1, f64mem:$src2),
!strconcat(OpcodeStr,
- "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+ let isCodeGenOnly = 1 in
def SDZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, sdmem:$src2),
!strconcat(OpcodeStr,
- "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128X:$dst,
(F64Int VR128X:$src1, sse_load_f64:$src2))]>,
XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -3010,15 +3560,15 @@ let Predicates = [HasAVX512] in {
Requires<[OptForSize]>;
def : Pat<(f32 (X86frsqrt FR32X:$src)),
- (VRSQRT14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+ (VRSQRT14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
def : Pat<(f32 (X86frsqrt (load addr:$src))),
- (VRSQRT14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ (VRSQRT14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>,
Requires<[OptForSize]>;
def : Pat<(f32 (X86frcp FR32X:$src)),
- (VRCP14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+ (VRCP14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
def : Pat<(f32 (X86frcp (load addr:$src))),
- (VRCP14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ (VRCP14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>,
Requires<[OptForSize]>;
def : Pat<(int_x86_sse_sqrt_ss VR128X:$src),
@@ -3094,6 +3644,7 @@ let ExeDomain = GenericDomain in {
[]>;
// Intrinsic operation, reg.
+ let isCodeGenOnly = 1 in
def SSr_Int : AVX512AIi8<opcss, MRMSrcReg,
(outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3),
!strconcat(OpcodeStr,
@@ -3118,6 +3669,7 @@ let ExeDomain = GenericDomain in {
[]>, VEX_W;
// Intrinsic operation, reg.
+ let isCodeGenOnly = 1 in
def SDr_Int : AVX512AIi8<opcsd, MRMSrcReg,
(outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3),
!strconcat(OpcodeStr,
@@ -3136,18 +3688,70 @@ let ExeDomain = GenericDomain in {
} // ExeDomain = GenericDomain
}
-let Predicates = [HasAVX512] in {
- defm VRNDSCALE : avx512_fp_binop_rm<0x0A, 0x0B, "vrndscale",
- int_x86_avx512_rndscale_ss,
- int_x86_avx512_rndscale_sd>, EVEX_4V;
+multiclass avx512_rndscale<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ PatFrag mem_frag, Domain d> {
+let ExeDomain = d in {
+ // Intrinsic operation, reg.
+ // Vector intrinsic operation, reg
+ def r : AVX512AIi8<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX;
- defm VRNDSCALEZ : avx512_fp_unop_rm<0x08, 0x09, "vrndscale", f256mem, VR512,
- memopv16f32, memopv8f64,
- int_x86_avx512_rndscale_ps_512,
- int_x86_avx512_rndscale_pd_512, CD8VF>,
- EVEX, EVEX_V512;
+ // Vector intrinsic operation, mem
+ def m : AVX512AIi8<opc, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX;
+} // ExeDomain
}
+
+defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512,
+ memopv16f32, SSEPackedSingle>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1),
+ imm:$src2, (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1),
+ FROUND_CURRENT)),
+ (VRNDSCALEPSZr VR512:$src1, imm:$src2)>;
+
+
+defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512,
+ memopv8f64, SSEPackedDouble>, EVEX_V512,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1),
+ imm:$src2, (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1),
+ FROUND_CURRENT)),
+ (VRNDSCALEPDZr VR512:$src1, imm:$src2)>;
+
+multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
+ Operand x86memop, RegisterClass RC, Domain d> {
+let ExeDomain = d in {
+ def r : AVX512AIi8<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V;
+
+ def m : AVX512AIi8<opc, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V;
+} // ExeDomain
+}
+
+defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", ssmem, FR32X,
+ SSEPackedSingle>, EVEX_CD8<32, CD8VT1>;
+
+defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", sdmem, FR64X,
+ SSEPackedDouble>, EVEX_CD8<64, CD8VT1>;
+
def : Pat<(ffloor FR32X:$src),
(VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x1))>;
def : Pat<(f64 (ffloor FR64X:$src)),
@@ -3170,26 +3774,26 @@ def : Pat<(f64 (ftrunc FR64X:$src)),
(VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x3))>;
def : Pat<(v16f32 (ffloor VR512:$src)),
- (VRNDSCALEZPSr VR512:$src, (i32 0x1))>;
+ (VRNDSCALEPSZr VR512:$src, (i32 0x1))>;
def : Pat<(v16f32 (fnearbyint VR512:$src)),
- (VRNDSCALEZPSr VR512:$src, (i32 0xC))>;
+ (VRNDSCALEPSZr VR512:$src, (i32 0xC))>;
def : Pat<(v16f32 (fceil VR512:$src)),
- (VRNDSCALEZPSr VR512:$src, (i32 0x2))>;
+ (VRNDSCALEPSZr VR512:$src, (i32 0x2))>;
def : Pat<(v16f32 (frint VR512:$src)),
- (VRNDSCALEZPSr VR512:$src, (i32 0x4))>;
+ (VRNDSCALEPSZr VR512:$src, (i32 0x4))>;
def : Pat<(v16f32 (ftrunc VR512:$src)),
- (VRNDSCALEZPSr VR512:$src, (i32 0x3))>;
+ (VRNDSCALEPSZr VR512:$src, (i32 0x3))>;
def : Pat<(v8f64 (ffloor VR512:$src)),
- (VRNDSCALEZPDr VR512:$src, (i32 0x1))>;
+ (VRNDSCALEPDZr VR512:$src, (i32 0x1))>;
def : Pat<(v8f64 (fnearbyint VR512:$src)),
- (VRNDSCALEZPDr VR512:$src, (i32 0xC))>;
+ (VRNDSCALEPDZr VR512:$src, (i32 0xC))>;
def : Pat<(v8f64 (fceil VR512:$src)),
- (VRNDSCALEZPDr VR512:$src, (i32 0x2))>;
+ (VRNDSCALEPDZr VR512:$src, (i32 0x2))>;
def : Pat<(v8f64 (frint VR512:$src)),
- (VRNDSCALEZPDr VR512:$src, (i32 0x4))>;
+ (VRNDSCALEPDZr VR512:$src, (i32 0x4))>;
def : Pat<(v8f64 (ftrunc VR512:$src)),
- (VRNDSCALEZPDr VR512:$src, (i32 0x3))>;
+ (VRNDSCALEPDZr VR512:$src, (i32 0x3))>;
//-------------------------------------------------
// Integer truncate and extend operations
@@ -3200,17 +3804,17 @@ multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr,
RegisterClass KRC, X86MemOperand x86memop> {
def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
(ins srcRC:$src),
- !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"),
[]>, EVEX;
def krr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
(ins KRC:$mask, srcRC:$src),
!strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
[]>, EVEX, EVEX_KZ;
def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[]>, EVEX;
}
defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM,
@@ -3266,11 +3870,11 @@ multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass DstRC,
def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
(ins SrcRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))]>, EVEX;
def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
(ins x86memop:$src),
- !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst,
(OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>,
EVEX;
@@ -3318,18 +3922,23 @@ let mayLoad = 1,
def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb),
(ins RC:$src1, KRC:$mask, memop:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
[]>, EVEX, EVEX_K;
}
+
+let ExeDomain = SSEPackedDouble in {
defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", VK8WM, VR512, vy64xmem>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>,
- EVEX_V512, EVEX_CD8<32, CD8VT1>;
-
defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", VK8WM, VR512, vz64mem>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>,
EVEX_V512, EVEX_CD8<32, CD8VT1>;
+}
defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512, vy64xmem>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -3347,20 +3956,24 @@ let mayStore = 1, Constraints = "$mask = $mask_wb" in
def mr : AVX5128I<opc, MRMDestMem, (outs KRC:$mask_wb),
(ins memop:$dst, KRC:$mask, RC:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
[]>, EVEX, EVEX_K;
}
+let ExeDomain = SSEPackedDouble in {
defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", VK8WM, VR512, vy64xmem>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", VK16WM, VR512, vz32mem>,
- EVEX_V512, EVEX_CD8<32, CD8VT1>;
-
defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", VK8WM, VR512, vz64mem>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", VK16WM, VR512, vz32mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", VK8WM, VR256X, vz64mem>,
EVEX_V512, EVEX_CD8<32, CD8VT1>;
-
+}
+
defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", VK8WM, VR512, vy64xmem>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", VK16WM, VR512, vz32mem>,
@@ -3380,23 +3993,23 @@ multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop,
def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
(i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
(i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
EVEX_4V, Sched<[WriteShuffle]>;
}
defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32,
- SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64,
- SSEPackedDouble>, OpSize, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+ SSEPackedDouble>, PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
(VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>;
@@ -3415,13 +4028,13 @@ multiclass avx512_alignr<string OpcodeStr, RegisterClass RC,
def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, EVEX_4V;
let mayLoad = 1 in
def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, EVEX_4V;
}
defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>,
@@ -3438,54 +4051,111 @@ def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
(VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
-multiclass avx512_vpabs<bits<8> opc, string OpcodeStr, RegisterClass RC,
- X86MemOperand x86memop> {
- def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
- EVEX;
- def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
- (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
- EVEX;
+// Helper fragments to match sext vXi1 to vXiY.
+def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
+def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
+
+multiclass avx512_vpabs<bits<8> opc, string OpcodeStr, ValueType OpVT,
+ RegisterClass KRC, RegisterClass RC,
+ X86MemOperand x86memop, X86MemOperand x86scalar_mop,
+ string BrdcstStr> {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+ []>, EVEX;
+ def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src),
+ !strconcat(OpcodeStr, " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+ []>, EVEX, EVEX_K;
+ def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src),
+ !strconcat(OpcodeStr,
+ " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+ []>, EVEX, EVEX_KZ;
+ let mayLoad = 1 in {
+ def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+ (ins x86memop:$src),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+ []>, EVEX;
+ def rmk : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+ (ins KRC:$mask, x86memop:$src),
+ !strconcat(OpcodeStr,
+ " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+ []>, EVEX, EVEX_K;
+ def rmkz : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+ (ins KRC:$mask, x86memop:$src),
+ !strconcat(OpcodeStr,
+ " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+ []>, EVEX, EVEX_KZ;
+ def rmb : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+ (ins x86scalar_mop:$src),
+ !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+ ", $dst|$dst, ${src}", BrdcstStr, "}"),
+ []>, EVEX, EVEX_B;
+ def rmbk : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+ (ins KRC:$mask, x86scalar_mop:$src),
+ !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+ ", $dst {${mask}}|$dst {${mask}}, ${src}", BrdcstStr, "}"),
+ []>, EVEX, EVEX_B, EVEX_K;
+ def rmbkz : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+ (ins KRC:$mask, x86scalar_mop:$src),
+ !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
+ ", $dst {${mask}} {z}|$dst {${mask}} {z}, ${src}",
+ BrdcstStr, "}"),
+ []>, EVEX, EVEX_B, EVEX_KZ;
+ }
}
-defm VPABSD : avx512_vpabs<0x1E, "vpabsd", VR512, i512mem>, EVEX_V512,
- EVEX_CD8<32, CD8VF>;
-defm VPABSQ : avx512_vpabs<0x1F, "vpabsq", VR512, i512mem>, EVEX_V512, VEX_W,
- EVEX_CD8<64, CD8VF>;
+defm VPABSDZ : avx512_vpabs<0x1E, "vpabsd", v16i32, VK16WM, VR512,
+ i512mem, i32mem, "{1to16}">, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPABSQZ : avx512_vpabs<0x1F, "vpabsq", v8i64, VK8WM, VR512,
+ i512mem, i64mem, "{1to8}">, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+
+def : Pat<(xor
+ (bc_v16i32 (v16i1sextv16i32)),
+ (bc_v16i32 (add (v16i32 VR512:$src), (v16i1sextv16i32)))),
+ (VPABSDZrr VR512:$src)>;
+def : Pat<(xor
+ (bc_v8i64 (v8i1sextv8i64)),
+ (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))),
+ (VPABSQZrr VR512:$src)>;
+
+def : Pat<(v16i32 (int_x86_avx512_mask_pabs_d_512 (v16i32 VR512:$src),
+ (v16i32 immAllZerosV), (i16 -1))),
+ (VPABSDZrr VR512:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_mask_pabs_q_512 (v8i64 VR512:$src),
+ (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
+ (VPABSQZrr VR512:$src)>;
multiclass avx512_conflict<bits<8> opc, string OpcodeStr,
- RegisterClass RC, RegisterClass KRC, PatFrag memop_frag,
- X86MemOperand x86memop, PatFrag scalar_mfrag,
- X86MemOperand x86scalar_mop, string BrdcstStr,
- Intrinsic Int, Intrinsic maskInt, Intrinsic maskzInt> {
+ RegisterClass RC, RegisterClass KRC,
+ X86MemOperand x86memop,
+ X86MemOperand x86scalar_mop, string BrdcstStr> {
def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src),
- !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"),
- [(set RC:$dst, (Int RC:$src))]>, EVEX;
+ !strconcat(OpcodeStr, " \t{$src, ${dst} |${dst}, $src}"),
+ []>, EVEX;
def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"),
- [(set RC:$dst, (Int (memop_frag addr:$src)))]>, EVEX;
+ !strconcat(OpcodeStr, " \t{$src, ${dst}|${dst}, $src}"),
+ []>, EVEX;
def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins x86scalar_mop:$src),
- !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
+ !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
", ${dst}|${dst}, ${src}", BrdcstStr, "}"),
[]>, EVEX, EVEX_B;
def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
(ins KRC:$mask, RC:$src),
!strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
- [(set RC:$dst, (maskzInt KRC:$mask, RC:$src))]>, EVEX, EVEX_KZ;
+ " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ []>, EVEX, EVEX_KZ;
def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins KRC:$mask, x86memop:$src),
!strconcat(OpcodeStr,
- "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
- [(set RC:$dst, (maskzInt KRC:$mask, (memop_frag addr:$src)))]>,
- EVEX, EVEX_KZ;
+ " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ []>, EVEX, EVEX_KZ;
def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins KRC:$mask, x86scalar_mop:$src),
- !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
+ !strconcat(OpcodeStr, " \t{${src}", BrdcstStr,
", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}",
BrdcstStr, "}"),
[]>, EVEX, EVEX_KZ, EVEX_B;
@@ -3494,16 +4164,16 @@ multiclass avx512_conflict<bits<8> opc, string OpcodeStr,
def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, KRC:$mask, RC:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
- [(set RC:$dst, (maskInt RC:$src1, KRC:$mask, RC:$src2))]>, EVEX, EVEX_K;
+ " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ []>, EVEX, EVEX_K;
def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, KRC:$mask, x86memop:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
- [(set RC:$dst, (maskInt RC:$src1, KRC:$mask, (memop_frag addr:$src2)))]>, EVEX, EVEX_K;
+ " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ []>, EVEX, EVEX_K;
def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, KRC:$mask, x86scalar_mop:$src2),
- !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+ !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"),
[]>, EVEX, EVEX_K, EVEX_B;
}
@@ -3511,16 +4181,22 @@ multiclass avx512_conflict<bits<8> opc, string OpcodeStr,
let Predicates = [HasCDI] in {
defm VPCONFLICTD : avx512_conflict<0xC4, "vpconflictd", VR512, VK16WM,
- memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
- int_x86_avx512_conflict_d_512,
- int_x86_avx512_conflict_d_mask_512,
- int_x86_avx512_conflict_d_maskz_512>,
+ i512mem, i32mem, "{1to16}">,
EVEX_V512, EVEX_CD8<32, CD8VF>;
+
defm VPCONFLICTQ : avx512_conflict<0xC4, "vpconflictq", VR512, VK8WM,
- memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
- int_x86_avx512_conflict_q_512,
- int_x86_avx512_conflict_q_mask_512,
- int_x86_avx512_conflict_q_maskz_512>,
+ i512mem, i64mem, "{1to8}">,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
}
+
+def : Pat<(int_x86_avx512_mask_conflict_d_512 VR512:$src2, VR512:$src1,
+ GR16:$mask),
+ (VPCONFLICTDrrk VR512:$src1,
+ (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>;
+
+def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1,
+ GR8:$mask),
+ (VPCONFLICTQrrk VR512:$src1,
+ (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>;
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 7fc9c44..368e14b 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -18,19 +18,19 @@ let SchedRW = [WriteLEA] in {
let neverHasSideEffects = 1 in
def LEA16r : I<0x8D, MRMSrcMem,
(outs GR16:$dst), (ins i32mem:$src),
- "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize;
+ "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize16;
let isReMaterializable = 1 in
def LEA32r : I<0x8D, MRMSrcMem,
(outs GR32:$dst), (ins i32mem:$src),
"lea{l}\t{$src|$dst}, {$dst|$src}",
[(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
- Requires<[In32BitMode]>;
+ OpSize32, Requires<[Not64BitMode]>;
def LEA64_32r : I<0x8D, MRMSrcMem,
(outs GR32:$dst), (ins lea64_32mem:$src),
"lea{l}\t{$src|$dst}, {$dst|$src}",
[(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>,
- Requires<[In64BitMode]>;
+ OpSize32, Requires<[In64BitMode]>;
let isReMaterializable = 1 in
def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
@@ -68,13 +68,13 @@ def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
"mul{w}\t$src",
- [], IIC_MUL16_REG>, OpSize, Sched<[WriteIMul]>;
+ [], IIC_MUL16_REG>, OpSize16, Sched<[WriteIMul]>;
// EAX,EDX = EAX*GR32
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src),
"mul{l}\t$src",
[/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
- IIC_MUL32_REG>, Sched<[WriteIMul]>;
+ IIC_MUL32_REG>, OpSize32, Sched<[WriteIMul]>;
// RAX,RDX = RAX*GR64
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in
def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
@@ -95,12 +95,12 @@ let mayLoad = 1, neverHasSideEffects = 1 in {
let Defs = [AX,DX,EFLAGS], Uses = [AX] in
def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
"mul{w}\t$src",
- [], IIC_MUL16_MEM>, OpSize, SchedLoadReg<WriteIMulLd>;
+ [], IIC_MUL16_MEM>, OpSize16, SchedLoadReg<WriteIMulLd>;
// EAX,EDX = EAX*[mem32]
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
"mul{l}\t$src",
- [], IIC_MUL32_MEM>, SchedLoadReg<WriteIMulLd>;
+ [], IIC_MUL32_MEM>, OpSize32, SchedLoadReg<WriteIMulLd>;
// RAX,RDX = RAX*[mem64]
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
@@ -115,11 +115,11 @@ def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [],
// AX,DX = AX*GR16
let Defs = [AX,DX,EFLAGS], Uses = [AX] in
def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", [],
- IIC_IMUL16_RR>, OpSize, Sched<[WriteIMul]>;
+ IIC_IMUL16_RR>, OpSize16, Sched<[WriteIMul]>;
// EAX,EDX = EAX*GR32
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", [],
- IIC_IMUL32_RR>, Sched<[WriteIMul]>;
+ IIC_IMUL32_RR>, OpSize32, Sched<[WriteIMul]>;
// RAX,RDX = RAX*GR64
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [],
@@ -133,12 +133,13 @@ def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
// AX,DX = AX*[mem16]
let Defs = [AX,DX,EFLAGS], Uses = [AX] in
def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
- "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize,
+ "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize16,
SchedLoadReg<WriteIMulLd>;
// EAX,EDX = EAX*[mem32]
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
- "imul{l}\t$src", [], IIC_IMUL32_MEM>, SchedLoadReg<WriteIMulLd>;
+ "imul{l}\t$src", [], IIC_IMUL32_MEM>, OpSize32,
+ SchedLoadReg<WriteIMulLd>;
// RAX,RDX = RAX*[mem64]
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
@@ -157,12 +158,12 @@ def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
"imul{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, EFLAGS,
(X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>,
- TB, OpSize;
+ TB, OpSize16;
def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
"imul{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, EFLAGS,
(X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>,
- TB;
+ TB, OpSize32;
def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"imul{q}\t{$src2, $dst|$dst, $src2}",
@@ -179,14 +180,14 @@ def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
[(set GR16:$dst, EFLAGS,
(X86smul_flag GR16:$src1, (load addr:$src2)))],
IIC_IMUL16_RM>,
- TB, OpSize;
+ TB, OpSize16;
def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$src1, i32mem:$src2),
"imul{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, EFLAGS,
(X86smul_flag GR32:$src1, (load addr:$src2)))],
IIC_IMUL32_RM>,
- TB;
+ TB, OpSize32;
def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$src1, i64mem:$src2),
"imul{q}\t{$src2, $dst|$dst, $src2}",
@@ -208,30 +209,29 @@ def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
(X86smul_flag GR16:$src1, imm:$src2))],
- IIC_IMUL16_RRI>, OpSize;
+ IIC_IMUL16_RRI>, OpSize16;
def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
(outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
(X86smul_flag GR16:$src1, i16immSExt8:$src2))],
- IIC_IMUL16_RRI>,
- OpSize;
+ IIC_IMUL16_RRI>, OpSize16;
def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
(outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
(X86smul_flag GR32:$src1, imm:$src2))],
- IIC_IMUL32_RRI>;
+ IIC_IMUL32_RRI>, OpSize32;
def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
(outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
(X86smul_flag GR32:$src1, i32immSExt8:$src2))],
- IIC_IMUL32_RRI>;
-def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32
- (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
- "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR64:$dst, EFLAGS,
+ IIC_IMUL32_RRI>, OpSize32;
+def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
(X86smul_flag GR64:$src1, i64immSExt32:$src2))],
IIC_IMUL64_RRI>;
def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
@@ -250,31 +250,31 @@ def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
[(set GR16:$dst, EFLAGS,
(X86smul_flag (load addr:$src1), imm:$src2))],
IIC_IMUL16_RMI>,
- OpSize;
+ OpSize16;
def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
(outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR16:$dst, EFLAGS,
(X86smul_flag (load addr:$src1),
i16immSExt8:$src2))], IIC_IMUL16_RMI>,
- OpSize;
+ OpSize16;
def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
(outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
(X86smul_flag (load addr:$src1), imm:$src2))],
- IIC_IMUL32_RMI>;
+ IIC_IMUL32_RMI>, OpSize32;
def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
(outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
"imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, EFLAGS,
(X86smul_flag (load addr:$src1),
i32immSExt8:$src2))],
- IIC_IMUL32_RMI>;
-def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32
- (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
- "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR64:$dst, EFLAGS,
+ IIC_IMUL32_RMI>, OpSize32;
+def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32
+ (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
(X86smul_flag (load addr:$src1),
i64immSExt32:$src2))],
IIC_IMUL64_RMI>;
@@ -299,10 +299,10 @@ def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
"div{b}\t$src", [], IIC_DIV8_REG>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
- "div{w}\t$src", [], IIC_DIV16>, OpSize;
+ "div{w}\t$src", [], IIC_DIV16>, OpSize16;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
- "div{l}\t$src", [], IIC_DIV32>;
+ "div{l}\t$src", [], IIC_DIV32>, OpSize32;
// RDX:RAX/r64 = RAX,RDX
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
@@ -316,12 +316,12 @@ def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
SchedLoadReg<WriteIDivLd>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
- "div{w}\t$src", [], IIC_DIV16>, OpSize,
+ "div{w}\t$src", [], IIC_DIV16>, OpSize16,
SchedLoadReg<WriteIDivLd>;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
"div{l}\t$src", [], IIC_DIV32>,
- SchedLoadReg<WriteIDivLd>;
+ SchedLoadReg<WriteIDivLd>, OpSize32;
// RDX:RAX/[mem64] = RAX,RDX
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
@@ -336,10 +336,10 @@ def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
"idiv{b}\t$src", [], IIC_IDIV8>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
- "idiv{w}\t$src", [], IIC_IDIV16>, OpSize;
+ "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
- "idiv{l}\t$src", [], IIC_IDIV32>;
+ "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32;
// RDX:RAX/r64 = RAX,RDX
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
@@ -353,11 +353,11 @@ def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
SchedLoadReg<WriteIDivLd>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
- "idiv{w}\t$src", [], IIC_IDIV16>, OpSize,
+ "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16,
SchedLoadReg<WriteIDivLd>;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
- "idiv{l}\t$src", [], IIC_IDIV32>,
+ "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32,
SchedLoadReg<WriteIDivLd>;
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
@@ -381,11 +381,11 @@ def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
"neg{w}\t$dst",
[(set GR16:$dst, (ineg GR16:$src1)),
- (implicit EFLAGS)], IIC_UNARY_REG>, OpSize;
+ (implicit EFLAGS)], IIC_UNARY_REG>, OpSize16;
def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
"neg{l}\t$dst",
[(set GR32:$dst, (ineg GR32:$src1)),
- (implicit EFLAGS)], IIC_UNARY_REG>;
+ (implicit EFLAGS)], IIC_UNARY_REG>, OpSize32;
def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
[(set GR64:$dst, (ineg GR64:$src1)),
(implicit EFLAGS)], IIC_UNARY_REG>;
@@ -400,11 +400,11 @@ def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
"neg{w}\t$dst",
[(store (ineg (loadi16 addr:$dst)), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize;
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
"neg{l}\t$dst",
[(store (ineg (loadi32 addr:$dst)), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>;
+ (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
[(store (ineg (loadi64 addr:$dst)), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>;
@@ -422,10 +422,10 @@ def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
[(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>;
def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
"not{w}\t$dst",
- [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize;
+ [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize16;
def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
"not{l}\t$dst",
- [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>;
+ [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>, OpSize32;
def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
[(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>;
}
@@ -438,10 +438,11 @@ def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
"not{w}\t$dst",
[(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
- OpSize;
+ OpSize16;
def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
"not{l}\t$dst",
- [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+ [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ OpSize32;
def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
[(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
} // SchedRW
@@ -460,12 +461,12 @@ let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
"inc{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], IIC_UNARY_REG>,
- OpSize, Requires<[In32BitMode]>;
+ OpSize16, Requires<[Not64BitMode]>;
def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
"inc{l}\t$dst",
[(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
IIC_UNARY_REG>,
- Requires<[In32BitMode]>;
+ OpSize32, Requires<[Not64BitMode]>;
def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
[(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))],
IIC_UNARY_REG>;
@@ -479,38 +480,39 @@ def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"inc{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
IIC_UNARY_REG>,
- OpSize, Requires<[In64BitMode]>;
+ OpSize16, Requires<[In64BitMode]>;
def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"inc{l}\t$dst",
[(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
IIC_UNARY_REG>,
- Requires<[In64BitMode]>;
+ OpSize32, Requires<[In64BitMode]>;
def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"dec{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
IIC_UNARY_REG>,
- OpSize, Requires<[In64BitMode]>;
+ OpSize16, Requires<[In64BitMode]>;
def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"dec{l}\t$dst",
[(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
IIC_UNARY_REG>,
- Requires<[In64BitMode]>;
+ OpSize32, Requires<[In64BitMode]>;
} // isConvertibleToThreeAddress = 1, CodeSize = 2
-let isCodeGenOnly = 1, CodeSize = 2 in {
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ CodeSize = 2 in {
def INC32_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"inc{w}\t$dst", [], IIC_UNARY_REG>,
- OpSize, Requires<[In32BitMode]>;
+ OpSize16, Requires<[Not64BitMode]>;
def INC32_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"inc{l}\t$dst", [], IIC_UNARY_REG>,
- Requires<[In32BitMode]>;
+ OpSize32, Requires<[Not64BitMode]>;
def DEC32_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"dec{w}\t$dst", [], IIC_UNARY_REG>,
- OpSize, Requires<[In32BitMode]>;
+ OpSize16, Requires<[Not64BitMode]>;
def DEC32_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"dec{l}\t$dst", [], IIC_UNARY_REG>,
- Requires<[In32BitMode]>;
-} // isCodeGenOnly = 1, CodeSize = 2
+ OpSize32, Requires<[Not64BitMode]>;
+} // isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, CodeSize = 2
} // Constraints = "$src1 = $dst", SchedRW
@@ -521,11 +523,11 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
[(store (add (loadi16 addr:$dst), 1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>,
- OpSize, Requires<[In32BitMode]>;
+ OpSize16, Requires<[Not64BitMode]>;
def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
[(store (add (loadi32 addr:$dst), 1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>,
- Requires<[In32BitMode]>;
+ OpSize32, Requires<[Not64BitMode]>;
def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
[(store (add (loadi64 addr:$dst), 1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>;
@@ -536,19 +538,19 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
[(store (add (loadi16 addr:$dst), 1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>,
- OpSize, Requires<[In64BitMode]>;
+ OpSize16, Requires<[In64BitMode]>;
def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
[(store (add (loadi32 addr:$dst), 1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>,
- Requires<[In64BitMode]>;
+ OpSize32, Requires<[In64BitMode]>;
def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
[(store (add (loadi16 addr:$dst), -1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>,
- OpSize, Requires<[In64BitMode]>;
+ OpSize16, Requires<[In64BitMode]>;
def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
[(store (add (loadi32 addr:$dst), -1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>,
- Requires<[In64BitMode]>;
+ OpSize32, Requires<[In64BitMode]>;
} // CodeSize = 2, SchedRW
let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
@@ -562,12 +564,12 @@ def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
"dec{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
IIC_UNARY_REG>,
- OpSize, Requires<[In32BitMode]>;
+ OpSize16, Requires<[Not64BitMode]>;
def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
"dec{l}\t$dst",
[(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
IIC_UNARY_REG>,
- Requires<[In32BitMode]>;
+ OpSize32, Requires<[Not64BitMode]>;
def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
[(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
IIC_UNARY_REG>;
@@ -582,11 +584,11 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
[(store (add (loadi16 addr:$dst), -1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>,
- OpSize, Requires<[In32BitMode]>;
+ OpSize16, Requires<[Not64BitMode]>;
def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
[(store (add (loadi32 addr:$dst), -1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>,
- Requires<[In32BitMode]>;
+ OpSize32, Requires<[Not64BitMode]>;
def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
[(store (add (loadi64 addr:$dst), -1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>;
@@ -600,7 +602,8 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
PatFrag loadnode, X86MemOperand memoperand, ImmType immkind,
Operand immoperand, SDPatternOperator immoperator,
Operand imm8operand, SDPatternOperator imm8operator,
- bit hasOddOpcode, bit hasOpSizePrefix, bit hasREX_WPrefix> {
+ bit hasOddOpcode, OperandSize opSize,
+ bit hasREX_WPrefix> {
/// VT - This is the value type itself.
ValueType VT = vt;
@@ -650,9 +653,10 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
/// other datatypes are odd.
bit HasOddOpcode = hasOddOpcode;
- /// HasOpSizePrefix - This bit is set to true if the instruction should have
- /// the 0x66 operand size prefix. This is set for i16 types.
- bit HasOpSizePrefix = hasOpSizePrefix;
+ /// OpSize - Selects whether the instruction needs a 0x66 prefix based on
+ /// 16-bit vs 32-bit mode. i8/i64 set this to OpSizeFixed. i16 sets this
+  /// to OpSize16. i32 sets this to OpSize32.
+ OperandSize OpSize = opSize;
/// HasREX_WPrefix - This bit is set to true if the instruction should have
/// the 0x40 REX prefix. This is set for i64 types.
@@ -664,16 +668,16 @@ def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
def Xi8 : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem ,
Imm8 , i8imm , imm, i8imm , invalid_node,
- 0, 0, 0>;
+ 0, OpSizeFixed, 0>;
def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
Imm16, i16imm, imm, i16i8imm, i16immSExt8,
- 1, 1, 0>;
+ 1, OpSize16, 0>;
def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
Imm32, i32imm, imm, i32i8imm, i32immSExt8,
- 1, 0, 0>;
+ 1, OpSize32, 0>;
def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
- Imm32, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8,
- 1, 0, 1>;
+ Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8,
+ 1, OpSizeFixed, 1>;
/// ITy - This instruction base class takes the type info for the instruction.
/// Using this, it:
@@ -693,7 +697,7 @@ class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
itin> {
// Infer instruction prefixes from type info.
- let hasOpSizePrefix = typeinfo.HasOpSizePrefix;
+ let OpSize = typeinfo.OpSize;
let hasREX_WPrefix = typeinfo.HasREX_WPrefix;
}
@@ -752,6 +756,7 @@ class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
Sched<[WriteALU]> {
// The disassembler should know about this, but not the asmparser.
let isCodeGenOnly = 1;
+ let ForceDisassemble = 1;
let hasSideEffects = 0;
}
@@ -767,6 +772,7 @@ class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
Sched<[WriteALU]> {
// The disassembler should know about this, but not the asmparser.
let isCodeGenOnly = 1;
+ let ForceDisassemble = 1;
let hasSideEffects = 0;
}
@@ -1305,8 +1311,8 @@ multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
- defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8, VEX_4V;
- defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8, VEX_4V, VEX_W;
+ defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V;
+ defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W;
}
let Predicates = [HasBMI] in {
@@ -1351,21 +1357,21 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
let SchedRW = [WriteALU] in {
def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"adcx{l}\t{$src, $dst|$dst, $src}",
- [], IIC_BIN_NONMEM>, T8, OpSize;
+ [], IIC_BIN_NONMEM>, T8PD;
- def ADCX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"adcx{q}\t{$src, $dst|$dst, $src}",
- [], IIC_BIN_NONMEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>;
+ [], IIC_BIN_NONMEM>, T8PD, Requires<[In64BitMode]>;
} // SchedRW
let mayLoad = 1, SchedRW = [WriteALULd] in {
def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"adcx{l}\t{$src, $dst|$dst, $src}",
- [], IIC_BIN_MEM>, T8, OpSize;
+ [], IIC_BIN_MEM>, T8PD;
- def ADCX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"adcx{q}\t{$src, $dst|$dst, $src}",
- [], IIC_BIN_MEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>;
+ [], IIC_BIN_MEM>, T8PD, Requires<[In64BitMode]>;
}
}
@@ -1378,9 +1384,9 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
"adox{l}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_NONMEM>, T8XS;
- def ADOX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"adox{q}\t{$src, $dst|$dst, $src}",
- [], IIC_BIN_NONMEM>, T8XS, REX_W, Requires<[In64BitMode]>;
+ [], IIC_BIN_NONMEM>, T8XS, Requires<[In64BitMode]>;
} // SchedRW
let mayLoad = 1, SchedRW = [WriteALULd] in {
@@ -1388,8 +1394,8 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
"adox{l}\t{$src, $dst|$dst, $src}",
[], IIC_BIN_MEM>, T8XS;
- def ADOX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"adox{q}\t{$src, $dst|$dst, $src}",
- [], IIC_BIN_MEM>, T8XS, REX_W, Requires<[In64BitMode]>;
+ [], IIC_BIN_MEM>, T8XS, Requires<[In64BitMode]>;
}
}
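
The hunks above consistently replace the old OpSize modifier with the mode-relative OpSize16/OpSize32 markers. As a rough, self-contained sketch (illustrative C++ only, not LLVM code; the names are made up), the 0x66 operand-size prefix is needed only when an instruction's operand size differs from the default operand size of the current mode:

#include <cstdio>

enum class OperandSize { Fixed, Size16, Size32 };   // OpSizeFixed / OpSize16 / OpSize32
enum class Mode { Bits16, Bits32, Bits64 };

// The 0x66 prefix flips the operand size away from the mode's default:
// 16-bit mode defaults to 16-bit operands, 32- and 64-bit modes default to 32-bit.
static bool needsOpSizePrefix(OperandSize Size, Mode M) {
  switch (Size) {
  case OperandSize::Fixed:  return false;              // i8 forms and REX.W i64 forms
  case OperandSize::Size16: return M != Mode::Bits16;  // e.g. mul{w}, imul{w}
  case OperandSize::Size32: return M == Mode::Bits16;  // e.g. mul{l}, imul{l}
  }
  return false;
}

int main() {
  // In 32-bit mode the {w} forms need the prefix and the {l} forms do not.
  std::printf("%d %d\n",
              needsOpSizePrefix(OperandSize::Size16, Mode::Bits32),
              needsOpSizePrefix(OperandSize::Size32, Mode::Bits32));
}

This is why the i16 defs above now carry OpSize16, the i32 defs carry OpSize32, and the i8/i64 forms keep OpSizeFixed.
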
diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td
index a967a4d..315f213 100644
--- a/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/lib/Target/X86/X86InstrCMovSetCC.td
@@ -22,13 +22,13 @@ multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
!strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
[(set GR16:$dst,
(X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))],
- IIC_CMOV16_RR>,TB,OpSize;
+ IIC_CMOV16_RR>, TB, OpSize16;
def NAME#32rr
: I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
!strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
[(set GR32:$dst,
(X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))],
- IIC_CMOV32_RR>, TB;
+ IIC_CMOV32_RR>, TB, OpSize32;
def NAME#64rr
:RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
!strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
@@ -44,12 +44,13 @@ multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
!strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
[(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
CondNode, EFLAGS))], IIC_CMOV16_RM>,
- TB, OpSize;
+ TB, OpSize16;
def NAME#32rm
: I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
!strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
[(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
- CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
+ CondNode, EFLAGS))], IIC_CMOV32_RM>,
+ TB, OpSize32;
def NAME#64rm
:RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
!strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
@@ -81,11 +82,11 @@ defm CMOVG : CMOV<0x4F, "cmovg" , X86_COND_G>;
// SetCC instructions.
multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
let Uses = [EFLAGS] in {
- def r : I<opc, MRM0r, (outs GR8:$dst), (ins),
+ def r : I<opc, MRMXr, (outs GR8:$dst), (ins),
!strconcat(Mnemonic, "\t$dst"),
[(set GR8:$dst, (X86setcc OpNode, EFLAGS))],
IIC_SET_R>, TB, Sched<[WriteALU]>;
- def m : I<opc, MRM0m, (outs), (ins i8mem:$dst),
+ def m : I<opc, MRMXm, (outs), (ins i8mem:$dst),
!strconcat(Mnemonic, "\t$dst"),
[(store (X86setcc OpNode, EFLAGS), addr:$dst)],
IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>;
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 7d10b67..401849f 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -46,11 +46,11 @@ let Defs = [ESP, EFLAGS], Uses = [ESP] in {
def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
"#ADJCALLSTACKDOWN",
[(X86callseq_start timm:$amt)]>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
}
// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
@@ -72,7 +72,7 @@ def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
// x86-64 va_start lowering magic.
-let usesCustomInserter = 1 in {
+let usesCustomInserter = 1, Defs = [EFLAGS] in {
def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
(outs),
(ins GR8:$al,
@@ -81,7 +81,8 @@ def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
"#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
[(X86vastart_save_xmm_regs GR8:$al,
imm:$regsavefi,
- imm:$offset)]>;
+ imm:$offset),
+ (implicit EFLAGS)]>;
// The VAARG_64 pseudo-instruction takes the address of the va_list,
// and places the address of the next argument into a register.
@@ -117,7 +118,7 @@ def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
"# variable sized alloca for segmented stacks",
[(set GR32:$dst,
(X86SegAlloca GR32:$size))]>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
@@ -139,12 +140,12 @@ let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in {
def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src),
"# win32 fptoui",
[(X86WinFTOL RFP32:$src)]>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src),
"# win32 fptoui",
[(X86WinFTOL RFP64:$src)]>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
}
//===----------------------------------------------------------------------===//
@@ -172,7 +173,7 @@ let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
"#EH_SJLJ_SETJMP32",
[(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
"#EH_SJLJ_SETJMP64",
[(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
@@ -181,7 +182,7 @@ let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
"#EH_SJLJ_LONGJMP32",
[(X86eh_sjlj_longjmp addr:$buf)]>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
"#EH_SJLJ_LONGJMP64",
[(X86eh_sjlj_longjmp addr:$buf)]>,
@@ -219,10 +220,9 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
// Alias instruction mapping movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
-// FIXME: Set encoding to pseudo.
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
- isCodeGenOnly = 1 in
-def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "",
+ isPseudo = 1 in
+def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
// Other widths can also make use of the 32-bit xor, which may have a smaller
@@ -318,13 +318,13 @@ let SchedRW = [WriteMicrocoded] in {
let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
[(X86rep_movs i8)], IIC_REP_MOVS>, REP,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
- [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize,
- Requires<[In32BitMode]>;
+ [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+ Requires<[Not64BitMode]>;
def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
- [(X86rep_movs i32)], IIC_REP_MOVS>, REP,
- Requires<[In32BitMode]>;
+ [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+ Requires<[Not64BitMode]>;
}
let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
@@ -332,10 +332,10 @@ def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
[(X86rep_movs i8)], IIC_REP_MOVS>, REP,
Requires<[In64BitMode]>;
def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
- [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize,
+ [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
Requires<[In64BitMode]>;
def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
- [(X86rep_movs i32)], IIC_REP_MOVS>, REP,
+ [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
Requires<[In64BitMode]>;
def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
[(X86rep_movs i64)], IIC_REP_MOVS>, REP,
@@ -347,15 +347,15 @@ let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
let Uses = [AL,ECX,EDI] in
def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
[(X86rep_stos i8)], IIC_REP_STOS>, REP,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
let Uses = [AX,ECX,EDI] in
def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
- [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize,
- Requires<[In32BitMode]>;
+ [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
+ Requires<[Not64BitMode]>;
let Uses = [EAX,ECX,EDI] in
def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
- [(X86rep_stos i32)], IIC_REP_STOS>, REP,
- Requires<[In32BitMode]>;
+ [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
+ Requires<[Not64BitMode]>;
}
let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
@@ -365,11 +365,11 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
Requires<[In64BitMode]>;
let Uses = [AX,RCX,RDI] in
def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
- [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize,
+ [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
Requires<[In64BitMode]>;
let Uses = [RAX,RCX,RDI] in
def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
- [(X86rep_stos i32)], IIC_REP_STOS>, REP,
+ [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
Requires<[In64BitMode]>;
let Uses = [RAX,RCX,RDI] in
@@ -395,11 +395,11 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLS_addr32",
[(X86tlsaddr tls32addr:$sym)]>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLS_base_addr32",
[(X86tlsbaseaddr tls32baseaddr:$sym)]>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
}
// All calls clobber the non-callee saved registers. RSP is marked as
@@ -431,7 +431,7 @@ let Defs = [EAX, ECX, EFLAGS],
def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLSCall_32",
[(X86TLSCall addr:$sym)]>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
// For x86_64, the address of the thunk is passed in %rdi, on return
// the address of the variable is in %rax. All other registers are preserved.
@@ -590,7 +590,7 @@ defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">;
let isCodeGenOnly = 1, Defs = [EFLAGS] in
def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
"or{l}\t{$zero, $dst|$dst, $zero}",
- [], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK,
+ [], IIC_ALU_MEM>, Requires<[Not64BitMode]>, LOCK,
Sched<[WriteALULd, WriteRMW]>;
let hasSideEffects = 1 in
@@ -618,13 +618,13 @@ def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_NONMEM>, OpSize, LOCK;
+ [], IIC_ALU_NONMEM>, OpSize16, LOCK;
def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_NONMEM>, LOCK;
+ [], IIC_ALU_NONMEM>, OpSize32, LOCK;
def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
@@ -644,14 +644,14 @@ def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, OpSize, LOCK;
+ [], IIC_ALU_MEM>, OpSize16, LOCK;
def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
+ [], IIC_ALU_MEM>, OpSize32, LOCK;
def NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
@@ -665,13 +665,13 @@ def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
!strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, OpSize, LOCK;
+ [], IIC_ALU_MEM>, OpSize16, LOCK;
def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
!strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
+ [], IIC_ALU_MEM>, OpSize32, LOCK;
def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
@@ -700,10 +700,10 @@ def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
[], IIC_UNARY_MEM>, LOCK;
def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
!strconcat(mnemonic, "{w}\t$dst"),
- [], IIC_UNARY_MEM>, OpSize, LOCK;
+ [], IIC_UNARY_MEM>, OpSize16, LOCK;
def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
!strconcat(mnemonic, "{l}\t$dst"),
- [], IIC_UNARY_MEM>, LOCK;
+ [], IIC_UNARY_MEM>, OpSize32, LOCK;
def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
!strconcat(mnemonic, "{q}\t$dst"),
[], IIC_UNARY_MEM>, LOCK;
@@ -735,11 +735,11 @@ let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
let Defs = [AX, EFLAGS], Uses = [AX] in
def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
!strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize, LOCK;
+ [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK;
let Defs = [EAX, EFLAGS], Uses = [EAX] in
def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
!strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, LOCK;
+ [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK;
let Defs = [RAX, EFLAGS], Uses = [RAX] in
def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
!strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
@@ -782,14 +782,14 @@ multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
[(set
GR16:$dst,
(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
- itin>, OpSize;
+ itin>, OpSize16;
def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$val, i32mem:$ptr),
!strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
[(set
GR32:$dst,
(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
- itin>;
+ itin>, OpSize32;
def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$val, i64mem:$ptr),
!strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
@@ -1020,22 +1020,22 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
// FIXME: This is disabled for 32-bit PIC mode because the global base
// register which is part of the address mode may be assigned a
// callee-saved register.
def : Pat<(X86tcret (load addr:$dst), imm:$off),
(TCRETURNmi addr:$dst, imm:$off)>,
- Requires<[In32BitMode, IsNotPIC]>;
+ Requires<[Not64BitMode, IsNotPIC]>;
def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
(TCRETURNdi texternalsym:$dst, imm:$off)>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
(TCRETURNdi texternalsym:$dst, imm:$off)>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
@@ -1304,13 +1304,13 @@ def : Pat<(and GR32:$src1, 0xff),
(MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
GR32_ABCD)),
sub_8bit))>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
// r & (2^8-1) ==> movz
def : Pat<(and GR16:$src1, 0xff),
(EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
(i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
sub_16bit)>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
// r & (2^32-1) ==> movz
def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
@@ -1345,13 +1345,13 @@ def : Pat<(sext_inreg GR32:$src, i8),
(MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
GR32_ABCD)),
sub_8bit))>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : Pat<(sext_inreg GR16:$src, i8),
(EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
(i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
sub_16bit)>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : Pat<(sext_inreg GR64:$src, i32),
(MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
@@ -1383,11 +1383,11 @@ def : Pat<(i16 (trunc GR32:$src)),
def : Pat<(i8 (trunc GR32:$src)),
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
sub_8bit)>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : Pat<(i8 (trunc GR16:$src)),
(EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
sub_8bit)>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : Pat<(i32 (trunc GR64:$src)),
(EXTRACT_SUBREG GR64:$src, sub_32bit)>;
def : Pat<(i16 (trunc GR64:$src)),
@@ -1405,38 +1405,38 @@ def : Pat<(i8 (trunc GR16:$src)),
def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
(EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
sub_8bit_hi)>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
sub_8bit_hi)>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : Pat<(srl GR16:$src, (i8 8)),
(EXTRACT_SUBREG
(MOVZX32rr8
(EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
sub_8bit_hi)),
sub_16bit)>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
(MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
GR16_ABCD)),
sub_8bit_hi))>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
(MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
GR16_ABCD)),
sub_8bit_hi))>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
(MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
GR32_ABCD)),
sub_8bit_hi))>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
(MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
GR32_ABCD)),
sub_8bit_hi))>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
// h-register tricks.
// For now, be conservative on x86-64 and use an h-register extract only if the
@@ -1530,62 +1530,34 @@ def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>;
def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>;
-// (shl x (and y, 31)) ==> (shl x, y)
-def : Pat<(shl GR8:$src1, (and CL, immShift32)),
- (SHL8rCL GR8:$src1)>;
-def : Pat<(shl GR16:$src1, (and CL, immShift32)),
- (SHL16rCL GR16:$src1)>;
-def : Pat<(shl GR32:$src1, (and CL, immShift32)),
- (SHL32rCL GR32:$src1)>;
-def : Pat<(store (shl (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
- (SHL8mCL addr:$dst)>;
-def : Pat<(store (shl (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
- (SHL16mCL addr:$dst)>;
-def : Pat<(store (shl (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
- (SHL32mCL addr:$dst)>;
-
-def : Pat<(srl GR8:$src1, (and CL, immShift32)),
- (SHR8rCL GR8:$src1)>;
-def : Pat<(srl GR16:$src1, (and CL, immShift32)),
- (SHR16rCL GR16:$src1)>;
-def : Pat<(srl GR32:$src1, (and CL, immShift32)),
- (SHR32rCL GR32:$src1)>;
-def : Pat<(store (srl (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
- (SHR8mCL addr:$dst)>;
-def : Pat<(store (srl (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
- (SHR16mCL addr:$dst)>;
-def : Pat<(store (srl (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
- (SHR32mCL addr:$dst)>;
-
-def : Pat<(sra GR8:$src1, (and CL, immShift32)),
- (SAR8rCL GR8:$src1)>;
-def : Pat<(sra GR16:$src1, (and CL, immShift32)),
- (SAR16rCL GR16:$src1)>;
-def : Pat<(sra GR32:$src1, (and CL, immShift32)),
- (SAR32rCL GR32:$src1)>;
-def : Pat<(store (sra (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
- (SAR8mCL addr:$dst)>;
-def : Pat<(store (sra (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
- (SAR16mCL addr:$dst)>;
-def : Pat<(store (sra (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
- (SAR32mCL addr:$dst)>;
-
-// (shl x (and y, 63)) ==> (shl x, y)
-def : Pat<(shl GR64:$src1, (and CL, immShift64)),
- (SHL64rCL GR64:$src1)>;
-def : Pat<(store (shl (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
- (SHL64mCL addr:$dst)>;
-
-def : Pat<(srl GR64:$src1, (and CL, immShift64)),
- (SHR64rCL GR64:$src1)>;
-def : Pat<(store (srl (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
- (SHR64mCL addr:$dst)>;
-
-def : Pat<(sra GR64:$src1, (and CL, immShift64)),
- (SAR64rCL GR64:$src1)>;
-def : Pat<(store (sra (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
- (SAR64mCL addr:$dst)>;
+// Shift amount is implicitly masked.
+multiclass MaskedShiftAmountPats<SDNode frag, string name> {
+ // (shift x (and y, 31)) ==> (shift x, y)
+ def : Pat<(frag GR8:$src1, (and CL, immShift32)),
+ (!cast<Instruction>(name # "8rCL") GR8:$src1)>;
+ def : Pat<(frag GR16:$src1, (and CL, immShift32)),
+ (!cast<Instruction>(name # "16rCL") GR16:$src1)>;
+ def : Pat<(frag GR32:$src1, (and CL, immShift32)),
+ (!cast<Instruction>(name # "32rCL") GR32:$src1)>;
+ def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (!cast<Instruction>(name # "8mCL") addr:$dst)>;
+ def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (!cast<Instruction>(name # "16mCL") addr:$dst)>;
+ def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (!cast<Instruction>(name # "32mCL") addr:$dst)>;
+
+ // (shift x (and y, 63)) ==> (shift x, y)
+ def : Pat<(frag GR64:$src1, (and CL, immShift64)),
+ (!cast<Instruction>(name # "64rCL") GR64:$src1)>;
+ def : Pat<(store (frag (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+ (!cast<Instruction>(name # "64mCL") addr:$dst)>;
+}
+defm : MaskedShiftAmountPats<shl, "SHL">;
+defm : MaskedShiftAmountPats<srl, "SHR">;
+defm : MaskedShiftAmountPats<sra, "SAR">;
+defm : MaskedShiftAmountPats<rotl, "ROL">;
+defm : MaskedShiftAmountPats<rotr, "ROR">;
// (anyext (setcc_carry)) -> (setcc_carry)
def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
@@ -1725,17 +1697,17 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
// Increment reg.
def : Pat<(add GR8 :$src, 1), (INC8r GR8 :$src)>;
-def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[Not64BitMode]>;
def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>;
-def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[Not64BitMode]>;
def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>;
def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>;
// Decrement reg.
def : Pat<(add GR8 :$src, -1), (DEC8r GR8 :$src)>;
-def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[Not64BitMode]>;
def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>;
-def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[Not64BitMode]>;
def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>;
def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
@@ -1839,3 +1811,9 @@ def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
+
+// When HasMOVBE is enabled it is possible to get a non-legalized
+// register-register 16 bit bswap. This maps it to a ROL instruction.
+let Predicates = [HasMOVBE] in {
+ def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
+}
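
The HasMOVBE pattern just above maps a 16-bit bswap onto ROL16ri with an immediate of 8, which works because byte-swapping a 16-bit value is the same as rotating it by 8 bits. A small standalone check of that identity (illustrative C++ only, not part of the patch):

#include <cassert>
#include <cstdint>

static uint16_t bswap16(uint16_t x) { return (uint16_t)((x << 8) | (x >> 8)); }

static uint16_t rol16(uint16_t x, unsigned n) {
  n &= 15;
  if (n == 0) return x;
  return (uint16_t)((x << n) | (x >> (16 - n)));
}

int main() {
  // Exhaustively verify bswap16(x) == rol16(x, 8) over all 16-bit values.
  for (uint32_t v = 0; v <= 0xFFFF; ++v)
    assert(bswap16((uint16_t)v) == rol16((uint16_t)v, 8));
  return 0;
}
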
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index e4ccc06..39ad395 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -21,42 +21,50 @@
// ST1 arguments when returning values on the x87 stack.
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
- def RET : I <0xC3, RawFrm, (outs), (ins variable_ops),
- "ret",
- [(X86retflag 0)], IIC_RET>;
+ def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ "ret{l}", [(X86retflag 0)], IIC_RET>, OpSize32,
+ Requires<[Not64BitMode]>;
+ def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ "ret{q}", [(X86retflag 0)], IIC_RET>, OpSize32,
+ Requires<[In64BitMode]>;
def RETW : I <0xC3, RawFrm, (outs), (ins),
"ret{w}",
- [], IIC_RET>, OpSize;
- def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
- "ret\t$amt",
- [(X86retflag timm:$amt)], IIC_RET_IMM>;
+ [], IIC_RET>, OpSize16;
+ def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ "ret{l}\t$amt",
+ [(X86retflag timm:$amt)], IIC_RET_IMM>, OpSize32,
+ Requires<[Not64BitMode]>;
+ def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ "ret{q}\t$amt",
+ [(X86retflag timm:$amt)], IIC_RET_IMM>, OpSize32,
+ Requires<[In64BitMode]>;
def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
"ret{w}\t$amt",
- [], IIC_RET_IMM>, OpSize;
+ [], IIC_RET_IMM>, OpSize16;
def LRETL : I <0xCB, RawFrm, (outs), (ins),
- "{l}ret{l|f}", [], IIC_RET>;
- def LRETW : I <0xCB, RawFrm, (outs), (ins),
- "{l}ret{w|f}", [], IIC_RET>, OpSize;
+ "{l}ret{l|f}", [], IIC_RET>, OpSize32;
def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
- "{l}ret{q|f}", [], IIC_RET>;
- def LRETI : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "{l}ret{l|f}\t$amt", [], IIC_RET>;
+ "{l}ret{|f}q", [], IIC_RET>, Requires<[In64BitMode]>;
+ def LRETW : I <0xCB, RawFrm, (outs), (ins),
+ "{l}ret{w|f}", [], IIC_RET>, OpSize16;
+ def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{l|f}\t$amt", [], IIC_RET>, OpSize32;
+ def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>;
def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize;
+ "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16;
}
// Unconditional branches.
let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
- "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+ "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>, OpSize32;
+ def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
+ "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>, OpSize16,
+ Requires<[In16BitMode]>;
let hasSideEffects = 0 in
def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
"jmp\t$dst", [], IIC_JMP_REL>;
- // FIXME : Intel syntax for JMP64pcrel32 such that it is not ambiguious
- // with JMP_1.
- let hasSideEffects = 0 in
- def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst),
- "jmpq\t$dst", [], IIC_JMP_REL>;
}
// Conditional Branches.
@@ -65,8 +73,12 @@ let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
let hasSideEffects = 0 in
def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [],
IIC_Jcc>;
+ def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
+ [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, OpSize16,
+ TB, Requires<[In16BitMode]>;
def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
- [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, TB;
+ [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, TB,
+ OpSize32;
}
}
@@ -94,10 +106,10 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in
// jecxz.
let Uses = [CX] in
def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jcxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In32BitMode]>;
+ "jcxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[Not64BitMode]>;
let Uses = [ECX] in
def JECXZ_32 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
- "jecxz\t$dst", [], IIC_JCXZ>, Requires<[In32BitMode]>;
+ "jecxz\t$dst", [], IIC_JCXZ>, Requires<[Not64BitMode]>;
// J*CXZ instruction: 64-bit versions of this instruction for the asmparser.
// In 64-bit mode, the address size prefix is jecxz and the unprefixed version
@@ -112,12 +124,19 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in
// Indirect branches
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst",
+ [(brind GR16:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
+ OpSize16, Sched<[WriteJump]>;
+ def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst",
+ [(brind (loadi16 addr:$dst))], IIC_JMP_MEM>,
+ Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>;
+
def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
- [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[In32BitMode]>,
- Sched<[WriteJump]>;
+ [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
+ OpSize32, Sched<[WriteJump]>;
def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
[(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
- Requires<[In32BitMode]>, Sched<[WriteJumpLd]>;
+ Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>;
def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
[(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>,
@@ -129,20 +148,20 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
"ljmp{w}\t{$seg, $off|$off, $seg}", [],
- IIC_JMP_FAR_PTR>, OpSize, Sched<[WriteJump]>;
+ IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
"ljmp{l}\t{$seg, $off|$off, $seg}", [],
- IIC_JMP_FAR_PTR>, Sched<[WriteJump]>;
+ IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
"ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
Sched<[WriteJump]>;
def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
- "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize,
+ "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16,
Sched<[WriteJumpLd]>;
def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
- "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
+ "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32,
Sched<[WriteJumpLd]>;
}
@@ -165,38 +184,44 @@ let isCall = 1 in
let Uses = [ESP] in {
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i32imm_pcrel:$dst),
- "call{l}\t$dst", [], IIC_CALL_RI>,
- Requires<[In32BitMode]>, Sched<[WriteJump]>;
+ "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32,
+ Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+ def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
+ (outs), (ins i16imm_pcrel:$dst),
+ "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16,
+ Sched<[WriteJump]>;
+ def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
+ "call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>,
+ OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+ def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
+ "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))],
+ IIC_CALL_MEM>, OpSize16,
+ Requires<[Not64BitMode,FavorMemIndirectCall]>,
+ Sched<[WriteJumpLd]>;
def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
"call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
- Requires<[In32BitMode]>, Sched<[WriteJump]>;
+ OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
"call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
- IIC_CALL_MEM>,
- Requires<[In32BitMode,FavorMemIndirectCall]>,
+ IIC_CALL_MEM>, OpSize32,
+ Requires<[Not64BitMode,FavorMemIndirectCall]>,
Sched<[WriteJumpLd]>;
def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
"lcall{w}\t{$seg, $off|$off, $seg}", [],
- IIC_CALL_FAR_PTR>, OpSize, Sched<[WriteJump]>;
+ IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
"lcall{l}\t{$seg, $off|$off, $seg}", [],
- IIC_CALL_FAR_PTR>, Sched<[WriteJump]>;
+ IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
- "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize,
+ "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16,
Sched<[WriteJumpLd]>;
def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
- "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>,
+ "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32,
Sched<[WriteJumpLd]>;
-
- // callw for 16 bit code for the assembler.
- let isAsmParserOnly = 1 in
- def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
- (outs), (ins i16imm_pcrel:$dst),
- "callw\t$dst", []>, OpSize;
}
@@ -240,7 +265,7 @@ let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
// the 32-bit pcrel field that we have.
def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i64i32imm_pcrel:$dst),
- "call{q}\t$dst", [], IIC_CALL_RI>,
+ "call{q}\t$dst", [], IIC_CALL_RI>, OpSize32,
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86call GR64:$dst)],
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 4090550..6be6a1f 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -14,17 +14,17 @@
let neverHasSideEffects = 1 in {
let Defs = [AX], Uses = [AL] in
def CBW : I<0x98, RawFrm, (outs), (ins),
- "{cbtw|cbw}", [], IIC_CBW>, OpSize; // AX = signext(AL)
+ "{cbtw|cbw}", [], IIC_CBW>, OpSize16; // AX = signext(AL)
let Defs = [EAX], Uses = [AX] in
def CWDE : I<0x98, RawFrm, (outs), (ins),
- "{cwtl|cwde}", [], IIC_CBW>; // EAX = signext(AX)
+ "{cwtl|cwde}", [], IIC_CBW>, OpSize32; // EAX = signext(AX)
let Defs = [AX,DX], Uses = [AX] in
def CWD : I<0x99, RawFrm, (outs), (ins),
- "{cwtd|cwd}", [], IIC_CBW>, OpSize; // DX:AX = signext(AX)
+ "{cwtd|cwd}", [], IIC_CBW>, OpSize16; // DX:AX = signext(AX)
let Defs = [EAX,EDX], Uses = [EAX] in
def CDQ : I<0x99, RawFrm, (outs), (ins),
- "{cltd|cdq}", [], IIC_CBW>; // EDX:EAX = signext(EAX)
+ "{cltd|cdq}", [], IIC_CBW>, OpSize32; // EDX:EAX = signext(EAX)
let Defs = [RAX], Uses = [EAX] in
@@ -42,54 +42,54 @@ let neverHasSideEffects = 1 in {
let neverHasSideEffects = 1 in {
def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
"movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
- TB, OpSize, Sched<[WriteALU]>;
+ TB, OpSize16, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
"movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
- TB, OpSize, Sched<[WriteALULd]>;
+ TB, OpSize16, Sched<[WriteALULd]>;
} // neverHasSideEffects = 1
def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
"movs{bl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
- Sched<[WriteALU]>;
+ OpSize32, Sched<[WriteALU]>;
def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
"movs{bl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB,
- Sched<[WriteALULd]>;
+ OpSize32, Sched<[WriteALULd]>;
def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
"movs{wl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
- Sched<[WriteALU]>;
+ OpSize32, Sched<[WriteALU]>;
def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"movs{wl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
- TB, Sched<[WriteALULd]>;
+ OpSize32, TB, Sched<[WriteALULd]>;
let neverHasSideEffects = 1 in {
def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
"movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
- TB, OpSize, Sched<[WriteALU]>;
+ TB, OpSize16, Sched<[WriteALU]>;
let mayLoad = 1 in
def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
"movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
- TB, OpSize, Sched<[WriteALULd]>;
+ TB, OpSize16, Sched<[WriteALULd]>;
} // neverHasSideEffects = 1
def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
- Sched<[WriteALU]>;
+ OpSize32, Sched<[WriteALU]>;
def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB,
- Sched<[WriteALULd]>;
+ OpSize32, Sched<[WriteALULd]>;
def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
"movz{wl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
- Sched<[WriteALU]>;
+ OpSize32, Sched<[WriteALU]>;
def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"movz{wl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>,
- TB, Sched<[WriteALULd]>;
+ TB, OpSize32, Sched<[WriteALULd]>;
// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
// except that they use GR32_NOREX for the output operand register class
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 69cd5a5..df6c9da 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -20,7 +20,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
PatFrag MemFrag128, PatFrag MemFrag256,
ValueType OpVT128, ValueType OpVT256,
SDPatternOperator Op = null_frag> {
- let isCommutable = 1 in
+ let usesCustomInserter = 1 in
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -36,7 +36,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
[(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
(MemFrag128 addr:$src3))))]>;
- let isCommutable = 1 in
+ let usesCustomInserter = 1 in
def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
@@ -59,6 +59,7 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy,
PatFrag MemFrag128, PatFrag MemFrag256,
SDNode Op, ValueType OpTy128, ValueType OpTy256> {
+ let isCommutable = 1 in
defm r213 : fma3p_rm<opc213,
!strconcat(OpcodeStr, "213", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
@@ -66,6 +67,7 @@ let neverHasSideEffects = 1 in {
defm r132 : fma3p_rm<opc132,
!strconcat(OpcodeStr, "132", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256>;
+ let isCommutable = 1 in
defm r231 : fma3p_rm<opc231,
!strconcat(OpcodeStr, "231", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256>;
@@ -118,13 +120,14 @@ let Constraints = "$src1 = $dst" in {
multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
SDPatternOperator OpNode = null_frag> {
- let isCommutable = 1 in
+ let usesCustomInserter = 1 in
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+
let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
@@ -134,51 +137,49 @@ multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
(OpVT (OpNode RC:$src2, RC:$src1,
(mem_frag addr:$src3))))]>;
}
-
-multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop,
- ComplexPattern mem_cpat, Intrinsic IntId,
- RegisterClass RC> {
- let isCommutable = 1 in
- def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst, (IntId VR128:$src2, VR128:$src1,
- VR128:$src3))]>;
- def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, memop:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
- (IntId VR128:$src2, VR128:$src1, mem_cpat:$src3))]>;
-}
} // Constraints = "$src1 = $dst"
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, string PackTy, Intrinsic Int,
+ string OpStr, string PackTy, string PT2, Intrinsic Int,
SDNode OpNode, RegisterClass RC, ValueType OpVT,
X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
ComplexPattern mem_cpat> {
let neverHasSideEffects = 1 in {
defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
x86memop, RC, OpVT, mem_frag>;
+ let isCommutable = 1 in
defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
x86memop, RC, OpVT, mem_frag>;
}
+let isCommutable = 1 in
defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
- x86memop, RC, OpVT, mem_frag, OpNode>,
- fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
- memop, mem_cpat, Int, RC>;
+ x86memop, RC, OpVT, mem_frag, OpNode>;
}
multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, Intrinsic IntF32, Intrinsic IntF64,
SDNode OpNode> {
- defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", IntF32, OpNode,
+ defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,
FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
- defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", IntF64, OpNode,
+ defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,
FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
+
+ def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME#"SSr213r")
+ (COPY_TO_REGCLASS $src2, FR32),
+ (COPY_TO_REGCLASS $src1, FR32),
+ (COPY_TO_REGCLASS $src3, FR32)),
+ VR128)>;
+
+ def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(NAME#"SDr213r")
+ (COPY_TO_REGCLASS $src2, FR64),
+ (COPY_TO_REGCLASS $src1, FR64),
+ (COPY_TO_REGCLASS $src3, FR64)),
+ VR128)>;
}
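// Editorial note, not part of this patch: in the two Pat definitions above,
// NAME is the outer defm name, so for the VFMADD instantiation below
// !cast<Instruction>(NAME#"SSr213r") resolves to VFMADDSSr213r. The pattern
// copies each VR128 intrinsic operand into FR32, emits the scalar r213 form,
// and copies the result back to VR128, which is what lets this change drop
// the separate fma3s_rm_int multiclass removed above.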
defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
@@ -220,7 +221,7 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst,
(OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG;
// For disassembler
-let isCodeGenOnly = 1, hasSideEffects = 0 in
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
@@ -230,6 +231,7 @@ let isCodeGenOnly = 1, hasSideEffects = 0 in
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
ComplexPattern mem_cpat, Intrinsic Int> {
+let isCodeGenOnly = 1 in {
let isCommutable = 1 in
def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
@@ -249,6 +251,7 @@ multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
+} // isCodeGenOnly = 1
}
multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -295,7 +298,7 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR256:$dst, (OpNode VR256:$src1,
(ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L;
// For disassembler
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 7c37888..4ad7b7e 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -206,74 +206,91 @@ def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
}
let Defs = [FPSW] in {
+// FPBinary_rr just defines pseudo-instructions, so there is no need to set
+// scheduling resources.
defm ADD : FPBinary_rr<fadd>;
defm SUB : FPBinary_rr<fsub>;
defm MUL : FPBinary_rr<fmul>;
defm DIV : FPBinary_rr<fdiv>;
+// Sets the scheduling resources for the actual NAME#_F<size>m definitions.
+let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary<fadd, MRM0m, "add">;
defm SUB : FPBinary<fsub, MRM4m, "sub">;
defm SUBR: FPBinary<fsub ,MRM5m, "subr">;
+}
+let SchedRW = [WriteFMulLd] in {
defm MUL : FPBinary<fmul, MRM1m, "mul">;
+}
+let SchedRW = [WriteFDivLd] in {
defm DIV : FPBinary<fdiv, MRM6m, "div">;
defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
}
+}
-class FPST0rInst<bits<8> o, string asm>
- : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8;
-class FPrST0Inst<bits<8> o, string asm>
- : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DC;
-class FPrST0PInst<bits<8> o, string asm>
- : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DE;
+class FPST0rInst<Format fp, string asm>
+ : FPI<0xD8, fp, (outs), (ins RST:$op), asm>;
+class FPrST0Inst<Format fp, string asm>
+ : FPI<0xDC, fp, (outs), (ins RST:$op), asm>;
+class FPrST0PInst<Format fp, string asm>
+ : FPI<0xDE, fp, (outs), (ins RST:$op), asm>;
// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
// we have to put some 'r's in and take them out of weird places.
-def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">;
-def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, st(0)}">;
-def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">;
-def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">;
-def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, st(0)}">;
-def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">;
-def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">;
-def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
-def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">;
-def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">;
-def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, st(0)}">;
-def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">;
-def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">;
-def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, st(0)}">;
-def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">;
-def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">;
-def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">;
-def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">;
-
-def COM_FST0r : FPST0rInst <0xD0, "fcom\t$op">;
-def COMP_FST0r : FPST0rInst <0xD8, "fcomp\t$op">;
+let SchedRW = [WriteFAdd] in {
+def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t$op">;
+def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st(0), $op|$op, st(0)}">;
+def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t$op">;
+def SUBR_FST0r : FPST0rInst <MRM5r, "fsubr\t$op">;
+def SUB_FrST0 : FPrST0Inst <MRM5r, "fsub{r}\t{%st(0), $op|$op, st(0)}">;
+def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t$op">;
+def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t$op">;
+def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">;
+} // SchedRW
+let SchedRW = [WriteFMul] in {
+def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t$op">;
+def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st(0), $op|$op, st(0)}">;
+def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t$op">;
+} // SchedRW
+let SchedRW = [WriteFDiv] in {
+def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t$op">;
+def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st(0), $op|$op, st(0)}">;
+def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t$op">;
+def DIV_FST0r : FPST0rInst <MRM6r, "fdiv\t$op">;
+def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">;
+} // SchedRW
+
+def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">;
+def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">;
// Unary operations.
-multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> {
+multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> {
def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
[(set RFP32:$dst, (OpNode RFP32:$src))]>;
def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
[(set RFP64:$dst, (OpNode RFP64:$src))]>;
def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
[(set RFP80:$dst, (OpNode RFP80:$src))]>;
-def _F : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9;
+def _F : FPI<0xD9, fp, (outs), (ins), asmstring>;
}
let Defs = [FPSW] in {
-defm CHS : FPUnary<fneg, 0xE0, "fchs">;
-defm ABS : FPUnary<fabs, 0xE1, "fabs">;
-defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
-defm SIN : FPUnary<fsin, 0xFE, "fsin">;
-defm COS : FPUnary<fcos, 0xFF, "fcos">;
+defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
+defm ABS : FPUnary<fabs, MRM_E1, "fabs">;
+let SchedRW = [WriteFSqrt] in {
+defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">;
+}
+defm SIN : FPUnary<fsin, MRM_FE, "fsin">;
+defm COS : FPUnary<fcos, MRM_FF, "fcos">;
let neverHasSideEffects = 1 in {
def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
}
-def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9;
+def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
} // Defs = [FPSW]
// Versions of FP instructions that take a single memory operand. Added for the
@@ -336,22 +353,22 @@ defm CMOVNP : FPCMov<X86_COND_NP>;
let Predicates = [HasCMov] in {
// These are not factored because there's no clean way to pass DA/DB.
-def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovb\t{$op, %st(0)|st(0), $op}">, DA;
-def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovbe\t{$op, %st(0)|st(0), $op}">, DA;
-def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
- "fcmove\t{$op, %st(0)|st(0), $op}">, DA;
-def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
- "fcmovu\t{$op, %st(0)|st(0), $op}">, DA;
-def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovnb\t{$op, %st(0)|st(0), $op}">, DB;
-def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
- "fcmovnbe\t{$op, %st(0)|st(0), $op}">, DB;
-def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
- "fcmovne\t{$op, %st(0)|st(0), $op}">, DB;
-def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
- "fcmovnu\t{$op, %st(0)|st(0), $op}">, DB;
+def CMOVB_F : FPI<0xDA, MRM0r, (outs RST:$op), (ins),
+ "fcmovb\t{$op, %st(0)|st(0), $op}">;
+def CMOVBE_F : FPI<0xDA, MRM2r, (outs RST:$op), (ins),
+ "fcmovbe\t{$op, %st(0)|st(0), $op}">;
+def CMOVE_F : FPI<0xDA, MRM1r, (outs RST:$op), (ins),
+ "fcmove\t{$op, %st(0)|st(0), $op}">;
+def CMOVP_F : FPI<0xDA, MRM3r, (outs RST:$op), (ins),
+ "fcmovu\t{$op, %st(0)|st(0), $op}">;
+def CMOVNB_F : FPI<0xDB, MRM0r, (outs RST:$op), (ins),
+ "fcmovnb\t{$op, %st(0)|st(0), $op}">;
+def CMOVNBE_F: FPI<0xDB, MRM2r, (outs RST:$op), (ins),
+ "fcmovnbe\t{$op, %st(0)|st(0), $op}">;
+def CMOVNE_F : FPI<0xDB, MRM1r, (outs RST:$op), (ins),
+ "fcmovne\t{$op, %st(0)|st(0), $op}">;
+def CMOVNP_F : FPI<0xDB, MRM3r, (outs RST:$op), (ins),
+ "fcmovnu\t{$op, %st(0)|st(0), $op}">;
} // Predicates = [HasCMov]
// Floating point loads & stores.
@@ -492,14 +509,10 @@ def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst),
// FP Stack manipulation instructions.
let SchedRW = [WriteMove] in {
-def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op",
- IIC_FLD>, D9;
-def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op",
- IIC_FST>, DD;
-def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op",
- IIC_FST>, DD;
-def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op",
- IIC_FXCH>, D9;
+def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op", IIC_FLD>;
+def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op", IIC_FST>;
+def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op", IIC_FST>;
+def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>;
}
// Floating point constant loads.
@@ -519,8 +532,8 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
}
let SchedRW = [WriteZero] in {
-def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz", IIC_FLDZ>, D9;
-def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1", IIC_FIST>, D9;
+def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz", IIC_FLDZ>;
+def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1", IIC_FIST>;
}
// Floating point compares.
@@ -546,40 +559,35 @@ def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
}
let Defs = [FPSW], Uses = [ST0] in {
-def UCOM_Fr : FPI<0xE0, AddRegFrm, // FPSW = cmp ST(0) with ST(i)
- (outs), (ins RST:$reg),
- "fucom\t$reg", IIC_FUCOM>, DD;
-def UCOM_FPr : FPI<0xE8, AddRegFrm, // FPSW = cmp ST(0) with ST(i), pop
- (outs), (ins RST:$reg),
- "fucomp\t$reg", IIC_FUCOM>, DD;
-def UCOM_FPPr : FPI<0xE9, RawFrm, // cmp ST(0) with ST(1), pop, pop
- (outs), (ins),
- "fucompp", IIC_FUCOM>, DA;
+def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg), "fucom\t$reg", IIC_FUCOM>;
+def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg), "fucomp\t$reg", IIC_FUCOM>;
+def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop
+ (outs), (ins), "fucompp", IIC_FUCOM>;
}
let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
-def UCOM_FIr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i)
- (outs), (ins RST:$reg),
- "fucomi\t$reg", IIC_FUCOMI>, DB;
-def UCOM_FIPr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i), pop
- (outs), (ins RST:$reg),
- "fucompi\t$reg", IIC_FUCOMI>, DF;
+def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg), "fucomi\t$reg", IIC_FUCOMI>;
+def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg), "fucompi\t$reg", IIC_FUCOMI>;
}
let Defs = [EFLAGS, FPSW] in {
-def COM_FIr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
- "fcomi\t$reg", IIC_FCOMI>, DB;
-def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
- "fcompi\t$reg", IIC_FCOMI>, DF;
+def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg),
+ "fcomi\t$reg", IIC_FCOMI>;
+def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg),
+ "fcompi\t$reg", IIC_FCOMI>;
}
} // SchedRW
// Floating point flag ops.
let SchedRW = [WriteALU] in {
let Defs = [AX], Uses = [FPSW] in
-def FNSTSW16r : I<0xE0, RawFrm, // AX = fp flags
+def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags
(outs), (ins), "fnstsw\t{%ax|ax}",
- [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF;
+ [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>;
def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control word
(outs), (ins i16mem:$dst), "fnstcw\t$dst",
@@ -593,50 +601,50 @@ def FLDCW16m : I<0xD9, MRM5m, // X87 control word = [mem16]
// FPU control instructions
let SchedRW = [WriteMicrocoded] in {
let Defs = [FPSW] in
-def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", [], IIC_FNINIT>, DB;
-def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg),
- "ffree\t$reg", IIC_FFREE>, DD;
+def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>;
+def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg),
+ "ffree\t$reg", IIC_FFREE>;
// Clear exceptions
let Defs = [FPSW] in
-def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", [], IIC_FNCLEX>, DB;
+def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>;
} // SchedRW
// Operandless floating-point instructions for the disassembler.
let SchedRW = [WriteMicrocoded] in {
def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
-def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", [], IIC_FNOP>, D9;
-def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", [], IIC_FXAM>, D9;
-def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", [], IIC_FLDL>, D9;
-def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", [], IIC_FLDL>, D9;
-def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", [], IIC_FLDL>, D9;
-def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", [], IIC_FLDL>, D9;
-def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", [], IIC_FLDL>, D9;
-def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", [], IIC_F2XM1>, D9;
-def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", [], IIC_FYL2X>, D9;
-def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", [], IIC_FPTAN>, D9;
-def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", [], IIC_FPATAN>, D9;
-def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", [], IIC_FXTRACT>, D9;
-def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", [], IIC_FPREM1>, D9;
-def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", [], IIC_FPSTP>, D9;
-def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", [], IIC_FPSTP>, D9;
-def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", [], IIC_FPREM>, D9;
-def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>, D9;
-def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", [], IIC_FSINCOS>, D9;
-def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", [], IIC_FRNDINT>, D9;
-def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", [], IIC_FSCALE>, D9;
-def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", [], IIC_FCOMPP>, DE;
+def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>;
+def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>;
+def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", [], IIC_FLDL>;
+def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", [], IIC_FLDL>;
+def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", [], IIC_FLDL>;
+def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", [], IIC_FLDL>;
+def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", [], IIC_FLDL>;
+def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", [], IIC_F2XM1>;
+def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", [], IIC_FYL2X>;
+def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", [], IIC_FPTAN>;
+def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", [], IIC_FPATAN>;
+def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", [], IIC_FXTRACT>;
+def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", [], IIC_FPREM1>;
+def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", [], IIC_FPSTP>;
+def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", [], IIC_FPSTP>;
+def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", [], IIC_FPREM>;
+def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>;
+def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", [], IIC_FSINCOS>;
+def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>;
+def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>;
+def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
"fxsave\t$dst", [], IIC_FXSAVE>, TB;
-def FXSAVE64 : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
- "fxsaveq\t$dst", [], IIC_FXSAVE>, TB, REX_W,
- Requires<[In64BitMode]>;
+def FXSAVE64 : RI<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
+ "fxsave{q|64}\t$dst", [], IIC_FXSAVE>, TB,
+ Requires<[In64BitMode]>;
def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
"fxrstor\t$src", [], IIC_FXRSTOR>, TB;
-def FXRSTOR64 : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstorq\t$src", [], IIC_FXRSTOR>, TB, REX_W,
+def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+ "fxrstor{q|64}\t$src", [], IIC_FXRSTOR>, TB,
Requires<[In64BitMode]>;
} // SchedRW
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 0fd9011..cc30266 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -14,54 +14,48 @@
// Format specifies the encoding used by the instruction. This is part of the
// ad-hoc solution used to emit machine instruction encodings by our machine
// code emitter.
-class Format<bits<6> val> {
- bits<6> Value = val;
+class Format<bits<7> val> {
+ bits<7> Value = val;
}
def Pseudo : Format<0>; def RawFrm : Format<1>;
def AddRegFrm : Format<2>; def MRMDestReg : Format<3>;
def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>;
-def MRMSrcMem : Format<6>;
+def MRMSrcMem : Format<6>; def RawFrmMemOffs : Format<7>;
+def RawFrmSrc : Format<8>; def RawFrmDst : Format<9>;
+def RawFrmDstSrc: Format<10>;
+def RawFrmImm8 : Format<11>;
+def RawFrmImm16 : Format<12>;
+def MRMXr : Format<14>; def MRMXm : Format<15>;
def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>;
def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>;
def MRM6r : Format<22>; def MRM7r : Format<23>;
def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>;
def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>;
def MRM6m : Format<30>; def MRM7m : Format<31>;
-def MRMInitReg : Format<32>;
-def MRM_C1 : Format<33>;
-def MRM_C2 : Format<34>;
-def MRM_C3 : Format<35>;
-def MRM_C4 : Format<36>;
-def MRM_C8 : Format<37>;
-def MRM_C9 : Format<38>;
-def MRM_CA : Format<39>;
-def MRM_CB : Format<40>;
-def MRM_E8 : Format<41>;
-def MRM_F0 : Format<42>;
-def RawFrmImm8 : Format<43>;
-def RawFrmImm16 : Format<44>;
-def MRM_F8 : Format<45>;
-def MRM_F9 : Format<46>;
-def MRM_D0 : Format<47>;
-def MRM_D1 : Format<48>;
-def MRM_D4 : Format<49>;
-def MRM_D5 : Format<50>;
-def MRM_D6 : Format<51>;
-def MRM_D8 : Format<52>;
-def MRM_D9 : Format<53>;
-def MRM_DA : Format<54>;
-def MRM_DB : Format<55>;
-def MRM_DC : Format<56>;
-def MRM_DD : Format<57>;
-def MRM_DE : Format<58>;
-def MRM_DF : Format<59>;
+def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>;
+def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C8 : Format<37>;
+def MRM_C9 : Format<38>; def MRM_CA : Format<39>; def MRM_CB : Format<40>;
+def MRM_D0 : Format<41>; def MRM_D1 : Format<42>; def MRM_D4 : Format<43>;
+def MRM_D5 : Format<44>; def MRM_D6 : Format<45>; def MRM_D8 : Format<46>;
+def MRM_D9 : Format<47>; def MRM_DA : Format<48>; def MRM_DB : Format<49>;
+def MRM_DC : Format<50>; def MRM_DD : Format<51>; def MRM_DE : Format<52>;
+def MRM_DF : Format<53>; def MRM_E0 : Format<54>; def MRM_E1 : Format<55>;
+def MRM_E2 : Format<56>; def MRM_E3 : Format<57>; def MRM_E4 : Format<58>;
+def MRM_E5 : Format<59>; def MRM_E8 : Format<60>; def MRM_E9 : Format<61>;
+def MRM_EA : Format<62>; def MRM_EB : Format<63>; def MRM_EC : Format<64>;
+def MRM_ED : Format<65>; def MRM_EE : Format<66>; def MRM_F0 : Format<67>;
+def MRM_F1 : Format<68>; def MRM_F2 : Format<69>; def MRM_F3 : Format<70>;
+def MRM_F4 : Format<71>; def MRM_F5 : Format<72>; def MRM_F6 : Format<73>;
+def MRM_F7 : Format<74>; def MRM_F8 : Format<75>; def MRM_F9 : Format<76>;
+def MRM_FA : Format<77>; def MRM_FB : Format<78>; def MRM_FC : Format<79>;
+def MRM_FD : Format<80>; def MRM_FE : Format<81>; def MRM_FF : Format<82>;
// ImmType - This specifies the immediate type used by an instruction. This is
// part of the ad-hoc solution used to emit machine instruction encodings by our
// machine code emitter.
-class ImmType<bits<3> val> {
- bits<3> Value = val;
+class ImmType<bits<4> val> {
+ bits<4> Value = val;
}
def NoImm : ImmType<0>;
def Imm8 : ImmType<1>;
@@ -70,7 +64,8 @@ def Imm16 : ImmType<3>;
def Imm16PCRel : ImmType<4>;
def Imm32 : ImmType<5>;
def Imm32PCRel : ImmType<6>;
-def Imm64 : ImmType<7>;
+def Imm32S : ImmType<7>;
+def Imm64 : ImmType<8>;
// FPFormat - This specifies what form this FP instruction has. This is used by
// the Floating-Point stackifier pass.
@@ -110,48 +105,84 @@ def CD8VT2 : CD8VForm<5>; // v := 2
def CD8VT4 : CD8VForm<6>; // v := 4
def CD8VT8 : CD8VForm<7>; // v := 8
+// Class specifying the prefix used as an opcode extension.
+class Prefix<bits<3> val> {
+ bits<3> Value = val;
+}
+def NoPrfx : Prefix<0>;
+def PS : Prefix<1>;
+def PD : Prefix<2>;
+def XS : Prefix<3>;
+def XD : Prefix<4>;
+
+// Class specifying the opcode map.
+class Map<bits<3> val> {
+ bits<3> Value = val;
+}
+def OB : Map<0>;
+def TB : Map<1>;
+def T8 : Map<2>;
+def TA : Map<3>;
+def XOP8 : Map<4>;
+def XOP9 : Map<5>;
+def XOPA : Map<6>;
+
+// Class specifying the encoding
+class Encoding<bits<2> val> {
+ bits<2> Value = val;
+}
+def EncNormal : Encoding<0>;
+def EncVEX : Encoding<1>;
+def EncXOP : Encoding<2>;
+def EncEVEX : Encoding<3>;
+
+// Operand size for encodings that change based on mode.
+class OperandSize<bits<2> val> {
+ bits<2> Value = val;
+}
+def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix.
+def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode.
+def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode.
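// Editorial sketch, not part of this patch: a hypothetical pair of defs
// showing how OpSize16/OpSize32 replace the old single OpSize marker. The
// names FOO16rr/FOO32rr and the opcode 0x01 are made up for illustration;
// real uses appear throughout this change (e.g. MOVSX16rr8/MOVSX32rr8
// earlier in this diff).
def FOO16rr : I<0x01, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                "foo{w}\t{$src, $dst|$dst, $src}", []>, OpSize16;
def FOO32rr : I<0x01, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                "foo{l}\t{$src, $dst|$dst, $src}", []>, OpSize32;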
+
// Prefix byte classes which are used to indicate to the ad-hoc machine code
// emitter that various prefix bytes are required.
-class OpSize { bit hasOpSizePrefix = 1; }
+class OpSize16 { OperandSize OpSize = OpSize16; }
+class OpSize32 { OperandSize OpSize = OpSize32; }
class AdSize { bit hasAdSizePrefix = 1; }
class REX_W { bit hasREX_WPrefix = 1; }
class LOCK { bit hasLockPrefix = 1; }
-class SegFS { bits<2> SegOvrBits = 1; }
-class SegGS { bits<2> SegOvrBits = 2; }
-class TB { bits<5> Prefix = 1; }
-class REP { bits<5> Prefix = 2; }
-class D8 { bits<5> Prefix = 3; }
-class D9 { bits<5> Prefix = 4; }
-class DA { bits<5> Prefix = 5; }
-class DB { bits<5> Prefix = 6; }
-class DC { bits<5> Prefix = 7; }
-class DD { bits<5> Prefix = 8; }
-class DE { bits<5> Prefix = 9; }
-class DF { bits<5> Prefix = 10; }
-class XD { bits<5> Prefix = 11; }
-class XS { bits<5> Prefix = 12; }
-class T8 { bits<5> Prefix = 13; }
-class TA { bits<5> Prefix = 14; }
-class A6 { bits<5> Prefix = 15; }
-class A7 { bits<5> Prefix = 16; }
-class T8XD { bits<5> Prefix = 17; }
-class T8XS { bits<5> Prefix = 18; }
-class TAXD { bits<5> Prefix = 19; }
-class XOP8 { bits<5> Prefix = 20; }
-class XOP9 { bits<5> Prefix = 21; }
-class XOPA { bits<5> Prefix = 22; }
-class VEX { bit hasVEXPrefix = 1; }
+class REP { bit hasREPPrefix = 1; }
+class TB { Map OpMap = TB; }
+class T8 { Map OpMap = T8; }
+class TA { Map OpMap = TA; }
+class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; }
+class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; }
+class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; }
+class OBXS { Prefix OpPrefix = XS; }
+class PS : TB { Prefix OpPrefix = PS; }
+class PD : TB { Prefix OpPrefix = PD; }
+class XD : TB { Prefix OpPrefix = XD; }
+class XS : TB { Prefix OpPrefix = XS; }
+class T8PS : T8 { Prefix OpPrefix = PS; }
+class T8PD : T8 { Prefix OpPrefix = PD; }
+class T8XD : T8 { Prefix OpPrefix = XD; }
+class T8XS : T8 { Prefix OpPrefix = XS; }
+class TAPS : TA { Prefix OpPrefix = PS; }
+class TAPD : TA { Prefix OpPrefix = PD; }
+class TAXD : TA { Prefix OpPrefix = XD; }
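// Editorial sketch, not part of this patch: the classes above fold the
// opcode map and the mandatory prefix into single adjectives, so what older
// code spelled as "TB, OpSize" is now the single PD class and "T8, OpSize"
// becomes T8PD (see the template rewrites later in this file). The def below
// is hypothetical and only illustrates the new spelling.
def HYPO_PDrr : I<0x58, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  "hypo\t{$src, $dst|$dst, $src}", [], NoItinerary,
                  SSEPackedDouble>, PD;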
+class VEX { Encoding OpEnc = EncVEX; }
class VEX_W { bit hasVEX_WPrefix = 1; }
-class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; }
-class VEX_4VOp3 : VEX { bit hasVEX_4VOp3Prefix = 1; }
+class VEX_4V : VEX { bit hasVEX_4V = 1; }
+class VEX_4VOp3 : VEX { bit hasVEX_4VOp3 = 1; }
class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
-class EVEX : VEX { bit hasEVEXPrefix = 1; }
-class EVEX_4V : VEX_4V { bit hasEVEXPrefix = 1; }
+class EVEX : VEX { Encoding OpEnc = EncEVEX; }
+class EVEX_4V : VEX_4V { Encoding OpEnc = EncEVEX; }
class EVEX_K { bit hasEVEX_K = 1; }
class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
class EVEX_B { bit hasEVEX_B = 1; }
+class EVEX_RC { bit hasEVEX_RC = 1; }
class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
class EVEX_CD8<int esize, CD8VForm form> {
bits<2> EVEX_CD8E = !if(!eq(esize, 8), 0b00,
@@ -162,7 +193,10 @@ class EVEX_CD8<int esize, CD8VForm form> {
}
class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
class MemOp4 { bit hasMemOp4Prefix = 1; }
-class XOP { bit hasXOP_Prefix = 1; }
+class XOP { Encoding OpEnc = EncXOP; }
+class XOP_4V : XOP { bit hasVEX_4V = 1; }
+class XOP_4VOp3 : XOP { bit hasVEX_4VOp3 = 1; }
+
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
string AsmStr,
InstrItinClass itin,
@@ -172,7 +206,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bits<8> Opcode = opcod;
Format Form = f;
- bits<6> FormBits = Form.Value;
+ bits<7> FormBits = Form.Value;
ImmType ImmT = i;
dag OutOperandList = outs;
@@ -187,25 +221,34 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
//
// Attributes specific to X86 instructions...
//
- bit hasOpSizePrefix = 0; // Does this inst have a 0x66 prefix?
+ bit ForceDisassemble = 0; // Force instruction to disassemble even though it's
+                             // isCodeGenOnly. Needed to hide an ambiguous
+ // AsmString from the parser, but still disassemble.
+
+ OperandSize OpSize = OpSizeFixed; // Does this instruction's encoding change
+                                    // based on the operand size of the mode?
+ bits<2> OpSizeBits = OpSize.Value;
bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix?
- bits<5> Prefix = 0; // Which prefix byte does this inst have?
+ Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have?
+ bits<3> OpPrefixBits = OpPrefix.Value;
+ Map OpMap = OB; // Which opcode map does this inst have?
+ bits<3> OpMapBits = OpMap.Value;
bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix?
FPFormat FPForm = NotFP; // What flavor of FP instruction is this?
bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix?
- bits<2> SegOvrBits = 0; // Segment override prefix.
Domain ExeDomain = d;
- bit hasVEXPrefix = 0; // Does this inst require a VEX prefix?
+ bit hasREPPrefix = 0; // Does this inst have a REP prefix?
+ Encoding OpEnc = EncNormal; // Encoding used by this instruction
+ bits<2> OpEncBits = OpEnc.Value;
bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field?
- bit hasVEX_4VPrefix = 0; // Does this inst require the VEX.VVVV field?
- bit hasVEX_4VOp3Prefix = 0; // Does this inst require the VEX.VVVV field to
- // encode the third operand?
+ bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field?
+ bit hasVEX_4VOp3 = 0; // Does this inst require the VEX.VVVV field to
+ // encode the third operand?
bit hasVEX_i8ImmReg = 0; // Does this inst require the last source register
// to be encoded in an immediate field?
bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
- bit hasEVEXPrefix = 0; // Does this inst require EVEX form?
bit hasEVEX_K = 0; // Does this inst require masking?
bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field?
bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field?
@@ -214,37 +257,37 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bits<3> EVEX_CD8V = 0; // Compressed disp8 form - vector-width.
bit has3DNow0F0FOpcode = 0; // Wacky 3dNow! encoding?
bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands
- bit hasXOP_Prefix = 0; // Does this inst require an XOP prefix?
+ bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction.
// TSFlags layout should be kept in sync with X86InstrInfo.h.
- let TSFlags{5-0} = FormBits;
- let TSFlags{6} = hasOpSizePrefix;
- let TSFlags{7} = hasAdSizePrefix;
- let TSFlags{12-8} = Prefix;
- let TSFlags{13} = hasREX_WPrefix;
- let TSFlags{16-14} = ImmT.Value;
- let TSFlags{19-17} = FPForm.Value;
- let TSFlags{20} = hasLockPrefix;
- let TSFlags{22-21} = SegOvrBits;
- let TSFlags{24-23} = ExeDomain.Value;
- let TSFlags{32-25} = Opcode;
- let TSFlags{33} = hasVEXPrefix;
- let TSFlags{34} = hasVEX_WPrefix;
- let TSFlags{35} = hasVEX_4VPrefix;
- let TSFlags{36} = hasVEX_4VOp3Prefix;
- let TSFlags{37} = hasVEX_i8ImmReg;
- let TSFlags{38} = hasVEX_L;
- let TSFlags{39} = ignoresVEX_L;
- let TSFlags{40} = hasEVEXPrefix;
- let TSFlags{41} = hasEVEX_K;
- let TSFlags{42} = hasEVEX_Z;
- let TSFlags{43} = hasEVEX_L2;
- let TSFlags{44} = hasEVEX_B;
- let TSFlags{46-45} = EVEX_CD8E;
- let TSFlags{49-47} = EVEX_CD8V;
- let TSFlags{50} = has3DNow0F0FOpcode;
- let TSFlags{51} = hasMemOp4Prefix;
- let TSFlags{52} = hasXOP_Prefix;
+ let TSFlags{6-0} = FormBits;
+ let TSFlags{8-7} = OpSizeBits;
+ let TSFlags{9} = hasAdSizePrefix;
+ let TSFlags{12-10} = OpPrefixBits;
+ let TSFlags{15-13} = OpMapBits;
+ let TSFlags{16} = hasREX_WPrefix;
+ let TSFlags{20-17} = ImmT.Value;
+ let TSFlags{23-21} = FPForm.Value;
+ let TSFlags{24} = hasLockPrefix;
+ let TSFlags{25} = hasREPPrefix;
+ let TSFlags{27-26} = ExeDomain.Value;
+ let TSFlags{29-28} = OpEncBits;
+ let TSFlags{37-30} = Opcode;
+ let TSFlags{38} = hasVEX_WPrefix;
+ let TSFlags{39} = hasVEX_4V;
+ let TSFlags{40} = hasVEX_4VOp3;
+ let TSFlags{41} = hasVEX_i8ImmReg;
+ let TSFlags{42} = hasVEX_L;
+ let TSFlags{43} = ignoresVEX_L;
+ let TSFlags{44} = hasEVEX_K;
+ let TSFlags{45} = hasEVEX_Z;
+ let TSFlags{46} = hasEVEX_L2;
+ let TSFlags{47} = hasEVEX_B;
+ let TSFlags{49-48} = EVEX_CD8E;
+ let TSFlags{52-50} = EVEX_CD8V;
+ let TSFlags{53} = has3DNow0F0FOpcode;
+ let TSFlags{54} = hasMemOp4Prefix;
+ let TSFlags{55} = hasEVEX_RC;
}
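// Editorial worked example, not part of this patch: for MOVZX32rr16 earlier
// in this diff (opcode 0xB7, MRMSrcReg, TB, OpSize32) the new layout packs
//   TSFlags{6-0}   = 5     (MRMSrcReg is Format<5>)
//   TSFlags{8-7}   = 2     (OpSize32 is OperandSize<2>)
//   TSFlags{12-10} = 0     (NoPrfx; no mandatory prefix byte)
//   TSFlags{15-13} = 1     (TB is Map<1>)
//   TSFlags{37-30} = 0xB7  (the opcode byte)
// so the emitter and disassembler read the map, prefix and operand size from
// dedicated fields instead of the old combined 5-bit Prefix value.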
class PseudoI<dag oops, dag iops, list<dag> pattern>
@@ -284,6 +327,12 @@ class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm,
let Pattern = pattern;
let CodeSize = 3;
}
+class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : X86Inst<o, f, Imm32S, outs, ins, asm, itin> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
@@ -333,73 +382,83 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
let CodeSize = 3;
}
-def __xs : XS;
-def __xd : XD;
-
// SI - SSE 1 & 2 scalar instructions
class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin> {
- let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
- !if(hasVEXPrefix /* VEX */, [UseAVX],
- !if(!eq(Prefix, __xs.Prefix), [UseSSE1],
- !if(!eq(Prefix, __xd.Prefix), [UseSSE2],
- !if(hasOpSizePrefix, [UseSSE2], [UseSSE1])))));
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
+ !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+ !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])))));
// AVX instructions have a 'v' prefix in the mnemonic
- let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
}
// SIi8 - SSE 1 & 2 scalar instructions
class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin> {
- let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
- !if(hasVEXPrefix /* VEX */, [UseAVX],
- !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])));
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
+ !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+ [UseSSE2])));
// AVX instructions have a 'v' prefix in the mnemonic
- let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
}
// PI - SSE 1 & 2 packed instructions
class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
InstrItinClass itin, Domain d>
: I<o, F, outs, ins, asm, pattern, itin, d> {
- let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
- !if(hasVEXPrefix /* VEX */, [HasAVX],
- !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])));
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])));
// AVX instructions have a 'v' prefix in the mnemonic
- let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
}
// MMXPI - SSE 1 & 2 packed instructions with MMX operands
class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
InstrItinClass itin, Domain d>
: I<o, F, outs, ins, asm, pattern, itin, d> {
- let Predicates = !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]);
+ let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasSSE2],
+ [HasSSE1]);
}
// PIi8 - SSE 1 & 2 packed instructions with immediate
class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin, Domain d>
: Ii8<o, F, outs, ins, asm, pattern, itin, d> {
- let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
- !if(hasVEXPrefix /* VEX */, [HasAVX],
- !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])));
+ let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+ [UseSSE1])));
// AVX instructions have a 'v' prefix in the mnemonic
- let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
+ let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+ !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+ asm));
}
// SSE1 Instruction Templates:
//
// SSI - SSE1 instructions with XS prefix.
-// PSI - SSE1 instructions with TB prefix.
-// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
+// PSI - SSE1 instructions with PS prefix.
+// PSIi8 - SSE1 instructions with ImmT == Imm8 and PS prefix.
// VSSI - SSE1 instructions with XS prefix in AVX form.
-// VPSI - SSE1 instructions with TB prefix in AVX form, packed single.
+// VPSI - SSE1 instructions with PS prefix in AVX form, packed single.
class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
@@ -409,11 +468,11 @@ class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
: Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
Requires<[UseSSE1]>;
class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
Requires<[UseSSE1]>;
class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
@@ -421,7 +480,7 @@ class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
Requires<[HasAVX]>;
class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, TB,
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, PS,
Requires<[HasAVX]>;
// SSE2 Instruction Templates:
@@ -430,13 +489,13 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
// S2SI - SSE2 instructions with XS prefix.
// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
-// PDI - SSE2 instructions with TB and OpSize prefixes, packed double domain.
-// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
+// PDI - SSE2 instructions with PD prefix, packed double domain.
+// PDIi8 - SSE2 instructions with ImmT == Imm8 and PD prefix.
// VSDI - SSE2 scalar instructions with XD prefix in AVX form.
-// VPDI - SSE2 vector instructions with TB and OpSize prefixes in AVX form,
+// VPDI - SSE2 vector instructions with PD prefix in AVX form,
// packed double domain.
-// VS2I - SSE2 scalar instructions with TB and OpSize prefixes in AVX form.
-// S2I - SSE2 scalar instructions with TB and OpSize prefixes.
+// VS2I - SSE2 scalar instructions with PD prefix in AVX form.
+// S2I - SSE2 scalar instructions with PD prefix.
// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
// MMX operands.
// MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
@@ -456,11 +515,11 @@ class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
: Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
Requires<[UseSSE2]>;
class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
Requires<[UseSSE2]>;
class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
@@ -472,16 +531,15 @@ class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
Requires<[HasAVX]>;
class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB,
- OpSize, Requires<[HasAVX]>;
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>,
+ PD, Requires<[HasAVX]>;
class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, TB,
- OpSize, Requires<[UseAVX]>;
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, PD,
+ Requires<[UseAVX]>;
class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, TB,
- OpSize, Requires<[UseSSE2]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[UseSSE2]>;
class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
@@ -491,7 +549,7 @@ class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// SSE3 Instruction Templates:
//
-// S3I - SSE3 instructions with TB and OpSize prefixes.
+// S3I - SSE3 instructions with PD prefix.
// S3SI - SSE3 instructions with XS prefix.
// S3DI - SSE3 instructions with XD prefix.
@@ -505,7 +563,7 @@ class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
Requires<[UseSSE3]>;
class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
Requires<[UseSSE3]>;
@@ -522,19 +580,19 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
Requires<[UseSSSE3]>;
class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
Requires<[UseSSSE3]>;
class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PS,
Requires<[HasSSSE3]>;
class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPS,
Requires<[HasSSSE3]>;
// SSE4.1 Instruction Templates:
@@ -544,11 +602,11 @@ class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
//
class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
Requires<[UseSSE41]>;
class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
Requires<[UseSSE41]>;
// SSE4.2 Instruction Templates:
@@ -556,7 +614,7 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// SS428I - SSE 4.2 instructions with T8 prefix.
class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
Requires<[UseSSE42]>;
// SS42FI - SSE 4.2 instructions with T8XD prefix.
@@ -568,53 +626,53 @@ class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
// SS42AI = SSE 4.2 instructions with TA prefix
class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
Requires<[UseSSE42]>;
// AVX Instruction Templates:
// Instructions introduced in AVX (no SSE equivalent forms)
//
-// AVX8I - AVX instructions with T8 and OpSize prefix.
-// AVXAIi8 - AVX instructions with TA, OpSize prefix and ImmT = Imm8.
+// AVX8I - AVX instructions with T8PD prefix.
+// AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8.
class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
Requires<[HasAVX]>;
class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
Requires<[HasAVX]>;
// AVX2 Instruction Templates:
// Instructions introduced in AVX2 (no SSE equivalent forms)
//
-// AVX28I - AVX2 instructions with T8 and OpSize prefix.
-// AVX2AIi8 - AVX2 instructions with TA, OpSize prefix and ImmT = Imm8.
+// AVX28I - AVX2 instructions with T8PD prefix.
+// AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8.
class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
Requires<[HasAVX2]>;
class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
Requires<[HasAVX2]>;
// AVX-512 Instruction Templates:
// Instructions introduced in AVX-512 (no SSE equivalent forms)
//
-// AVX5128I - AVX-512 instructions with T8 and OpSize prefix.
-// AVX512AIi8 - AVX-512 instructions with TA, OpSize prefix and ImmT = Imm8.
-// AVX512PDI - AVX-512 instructions with TB, OpSize, double packed.
-// AVX512PSI - AVX-512 instructions with TB, single packed.
+// AVX5128I - AVX-512 instructions with T8PD prefix.
+// AVX512AIi8 - AVX-512 instructions with TAPD prefix and ImmT = Imm8.
+// AVX512PDI - AVX-512 instructions with PD, double packed.
+// AVX512PSI - AVX-512 instructions with PS, single packed.
// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes.
// AVX512XSI - AVX-512 instructions with XS prefix, generic domain.
-// AVX512BI - AVX-512 instructions with TB, OpSize, int packed domain.
-// AVX512SI - AVX-512 scalar instructions with TB and OpSize prefixes.
+// AVX512BI - AVX-512 instructions with PD, int packed domain.
+// AVX512SI - AVX-512 scalar instructions with PD prefix.
class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
Requires<[HasAVX512]>;
class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
@@ -630,42 +688,38 @@ class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm,
Requires<[HasAVX512]>;
class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
Requires<[HasAVX512]>;
class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize,
- Requires<[HasAVX512]>;
-class AVX512SI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize,
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
Requires<[HasAVX512]>;
class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
Requires<[HasAVX512]>;
class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB,
- Requires<[HasAVX512]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>,
+ Requires<[HasAVX512]>;
class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB,
- OpSize, Requires<[HasAVX512]>;
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+ Requires<[HasAVX512]>;
class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
Requires<[HasAVX512]>;
class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, d>, TB, Requires<[HasAVX512]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, d>, TB, Requires<[HasAVX512]>;
+ : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, T8,
- OpSize, EVEX_4V, Requires<[HasAVX512]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ EVEX_4V, Requires<[HasAVX512]>;
// AES Instruction Templates:
//
@@ -673,54 +727,54 @@ class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
// These use the same encoding as the SSE4.2 T8 and TA encodings.
class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = IIC_AES>
- : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
Requires<[HasAES]>;
class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
Requires<[HasAES]>;
// PCLMUL Instruction Templates
class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- OpSize, Requires<[HasPCLMUL]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ Requires<[HasPCLMUL]>;
class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- OpSize, VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
// FMA3 Instruction Templates
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, T8,
- OpSize, VEX_4V, FMASC, Requires<[HasFMA]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ VEX_4V, FMASC, Requires<[HasFMA]>;
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, TA,
- OpSize, VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>;
// XOP 2, 3 and 4 Operand Instruction Template
class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
- XOP, XOP9, Requires<[HasXOP]>;
+ XOP9, Requires<[HasXOP]>;
// XOP 2, 3 and 4 Operand Instruction Templates with imm byte
class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
- XOP, XOP8, Requires<[HasXOP]>;
+ XOP8, Requires<[HasXOP]>;
// XOP 5 operand instruction (VEX encoding!)
class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- OpSize, VEX_4V, VEX_I8IMM, Requires<[HasXOP]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ VEX_4V, VEX_I8IMM, Requires<[HasXOP]>;
// X86-64 Instruction templates...
//
@@ -731,9 +785,15 @@ class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi16 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii16<o, F, outs, ins, asm, pattern, itin>, REX_W;
class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi32S <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii32S<o, F, outs, ins, asm, pattern, itin>, REX_W;
class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
@@ -749,18 +809,6 @@ class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm,
let CodeSize = 3;
}
-class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : SSI<o, F, outs, ins, asm, pattern, itin>, REX_W;
-class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : SDI<o, F, outs, ins, asm, pattern, itin>, REX_W;
-class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : PDI<o, F, outs, ins, asm, pattern, itin>, REX_W;
-class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern, InstrItinClass itin = NoItinerary>
- : VPDI<o, F, outs, ins, asm, pattern, itin>, VEX_W;
class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: S2I<o, F, outs, ins, asm, pattern, itin>, REX_W;
@@ -774,29 +822,29 @@ class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
// MMXI - MMX instructions with TB prefix.
// MMXI32 - MMX instructions with TB prefix valid only in 32 bit mode.
// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
-// MMX2I - MMX / SSE2 instructions with TB and OpSize prefixes.
-// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
-// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMX2I - MMX / SSE2 instructions with PD prefix.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix.
// MMXID - MMX instructions with XD prefix.
// MMXIS - MMX instructions with XS prefix.
class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In32BitMode]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,Not64BitMode]>;
class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In64BitMode]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,In64BitMode]>;
class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, TB, REX_W, Requires<[HasMMX]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, PS, REX_W, Requires<[HasMMX]>;
class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin>, TB, OpSize, Requires<[HasMMX]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[HasMMX]>;
class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 1fed424..486e5a9 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -25,7 +25,8 @@ def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>;
def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
SDTCisFP<0>, SDTCisInt<2> ]>;
def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
- SDTCisFP<1>, SDTCisVT<3, i8>]>;
+ SDTCisFP<1>, SDTCisVT<3, i8>,
+ SDTCisVec<1>]>;
def X86umin : SDNode<"X86ISD::UMIN", SDTIntBinOp>;
def X86umax : SDNode<"X86ISD::UMAX", SDTIntBinOp>;
@@ -59,8 +60,8 @@ def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
-def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>;
-def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>;
+def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>;
+//def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>;
def X86pshufb : SDNode<"X86ISD::PSHUFB",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
@@ -86,38 +87,40 @@ def X86insrtps : SDNode<"X86ISD::INSERTPS",
def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
-def X86vzmovly : SDNode<"X86ISD::VZEXT_MOVL",
- SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisOpSmallerThanOp<1, 0> ]>>;
-
-def X86vsmovl : SDNode<"X86ISD::VSEXT_MOVL",
- SDTypeProfile<1, 1,
- [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
-
def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86vzext : SDNode<"X86ISD::VZEXT",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisInt<0>, SDTCisInt<1>]>>;
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<1, 0>]>>;
def X86vsext : SDNode<"X86ISD::VSEXT",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisInt<0>, SDTCisInt<1>]>>;
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<1, 0>]>>;
def X86vtrunc : SDNode<"X86ISD::VTRUNC",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisInt<0>, SDTCisInt<1>]>>;
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<0, 1>]>>;
+def X86trunc : SDNode<"X86ISD::TRUNC",
+ SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisOpSmallerThanOp<0, 1>]>>;
+
def X86vtruncm : SDNode<"X86ISD::VTRUNCM",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisInt<1>,
- SDTCisVec<2>, SDTCisInt<2>]>>;
+ SDTCisVec<2>, SDTCisInt<2>,
+ SDTCisOpSmallerThanOp<0, 2>]>>;
def X86vfpext : SDNode<"X86ISD::VFPEXT",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisFP<0>, SDTCisFP<1>]>>;
+ SDTCisFP<0>, SDTCisFP<1>,
+ SDTCisOpSmallerThanOp<1, 0>]>>;
def X86vfpround: SDNode<"X86ISD::VFPROUND",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisFP<0>, SDTCisFP<1>]>>;
+ SDTCisFP<0>, SDTCisFP<1>,
+ SDTCisOpSmallerThanOp<0, 1>]>>;
def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>;
@@ -130,9 +133,15 @@ def X86IntCmpMask : SDTypeProfile<1, 2,
def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>;
def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>;
-def X86CmpMaskCC : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def X86CmpMaskCC :
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<0>, SDTCisVec<1>,
+ SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def X86CmpMaskCCScalar :
+ SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
+def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>;
def X86vshl : SDNode<"X86ISD::VSHL",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
@@ -155,10 +164,13 @@ def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
-def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
-def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>,
+def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>,
+ SDTCisVec<1>,
+ SDTCisSameAs<2, 1>]>>;
+def X86testnm : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisVec<1>,
SDTCisSameAs<2, 1>]>>;
+def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>;
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
@@ -216,6 +228,7 @@ def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>;
def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>;
def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>;
+def X86VPermiv3 : SDNode<"X86ISD::VPERMIV3", SDTShuff3Op>;
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
@@ -223,6 +236,8 @@ def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3,
[SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>;
+def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,
+ [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
@@ -458,10 +473,13 @@ def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>;
def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
+def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>;
// 512-bit bitconvert pattern fragments
def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
+def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
+def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>;
def vzmovl_v2i64 : PatFrag<(ops node:$src),
(bitconvert (v2i64 (X86vzmovl
@@ -478,6 +496,14 @@ def fp32imm0 : PatLeaf<(f32 fpimm), [{
return N->isExactlyValue(+0.0);
}]>;
+def I8Imm : SDNodeXForm<imm, [{
+ // Transformation function: get the low 8 bits.
+ return getI8Imm((uint8_t)N->getZExtValue());
+}]>;
+
+def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>;
+def FROUND_CURRENT : ImmLeaf<i32, [{ return Imm == 4; }]>;
+
// BYTE_imm - Transform bit immediates into byte immediates.
def BYTE_imm : SDNodeXForm<imm, [{
// Transformation function: imm >> 3
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 2461773..6450f2a 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -605,6 +605,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VMOVDQA64rr, X86::VMOVDQA64rm, TB_ALIGN_64 },
{ X86::VMOVDQU32rr, X86::VMOVDQU32rm, 0 },
{ X86::VMOVDQU64rr, X86::VMOVDQU64rm, 0 },
+ { X86::VPABSDZrr, X86::VPABSDZrm, 0 },
+ { X86::VPABSQZrr, X86::VPABSQZrm, 0 },
// AES foldable instructions
{ X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
@@ -1210,8 +1212,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::PEXT64rr, X86::PEXT64rm, 0 },
// AVX-512 foldable instructions
- { X86::VPADDDZrr, X86::VPADDDZrm, 0 },
- { X86::VPADDQZrr, X86::VPADDQZrm, 0 },
{ X86::VADDPSZrr, X86::VADDPSZrm, 0 },
{ X86::VADDPDZrr, X86::VADDPDZrm, 0 },
{ X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
@@ -1224,17 +1224,31 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VMINPDZrr, X86::VMINPDZrm, 0 },
{ X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
{ X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
+ { X86::VPADDDZrr, X86::VPADDDZrm, 0 },
+ { X86::VPADDQZrr, X86::VPADDQZrm, 0 },
{ X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
{ X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
+ { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
+ { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
+ { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
+ { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
+ { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
+ { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
+ { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
+ { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
+ { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
{ X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
{ X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
{ X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
{ X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
{ X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
+ { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
+ { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
{ X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
{ X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
{ X86::VALIGNQrri, X86::VALIGNQrmi, 0 },
{ X86::VALIGNDrri, X86::VALIGNDrmi, 0 },
+ { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
// AES foldable instructions
{ X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
@@ -1268,119 +1282,111 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
static const X86OpTblEntry OpTbl3[] = {
// FMA foldable instructions
- { X86::VFMADDSSr231r, X86::VFMADDSSr231m, 0 },
- { X86::VFMADDSDr231r, X86::VFMADDSDr231m, 0 },
- { X86::VFMADDSSr132r, X86::VFMADDSSr132m, 0 },
- { X86::VFMADDSDr132r, X86::VFMADDSDr132m, 0 },
- { X86::VFMADDSSr213r, X86::VFMADDSSr213m, 0 },
- { X86::VFMADDSDr213r, X86::VFMADDSDr213m, 0 },
- { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, 0 },
- { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, 0 },
-
- { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_16 },
- { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_16 },
- { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_16 },
- { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_16 },
- { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_16 },
- { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_16 },
- { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_32 },
- { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_32 },
- { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_32 },
- { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_32 },
- { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_32 },
- { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_32 },
-
- { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, 0 },
- { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, 0 },
- { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, 0 },
- { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, 0 },
- { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, 0 },
- { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, 0 },
- { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, 0 },
- { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, 0 },
-
- { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_16 },
- { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_16 },
- { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_16 },
- { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_16 },
- { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_16 },
- { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_16 },
- { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_32 },
- { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_32 },
- { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_32 },
- { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_32 },
- { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_32 },
- { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_32 },
-
- { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, 0 },
- { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, 0 },
- { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, 0 },
- { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, 0 },
- { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, 0 },
- { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, 0 },
- { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, 0 },
- { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, 0 },
-
- { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_16 },
- { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_16 },
- { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_16 },
- { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_16 },
- { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_16 },
- { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_16 },
- { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_32 },
- { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_32 },
- { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_32 },
- { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_32 },
- { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_32 },
- { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_32 },
-
- { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, 0 },
- { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 0 },
- { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, 0 },
- { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, 0 },
- { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, 0 },
- { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, 0 },
- { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, 0 },
- { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, 0 },
-
- { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_16 },
- { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_16 },
- { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_16 },
- { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_16 },
- { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_16 },
- { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_16 },
- { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_32 },
- { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_32 },
- { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_32 },
- { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_32 },
- { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_32 },
- { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_32 },
-
- { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_16 },
- { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_16 },
- { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_16 },
- { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_16 },
- { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_16 },
- { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_16 },
- { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_32 },
- { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_32 },
- { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_32 },
- { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_32 },
- { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_32 },
- { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_32 },
-
- { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_16 },
- { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_16 },
- { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_16 },
- { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_16 },
- { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_16 },
- { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_16 },
- { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_32 },
- { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_32 },
- { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_32 },
- { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 },
- { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 },
- { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 },
+ { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE },
+
+ { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE },
+ { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE },
+ { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_NONE },
+ { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_NONE },
+ { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_NONE },
+ { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_NONE },
+ { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_NONE },
+ { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_NONE },
+ { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_NONE },
+ { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_NONE },
+ { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_NONE },
+ { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE },
+
+ { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE },
+
+ { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_NONE },
+ { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_NONE },
+ { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_NONE },
+ { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_NONE },
+ { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_NONE },
+ { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE },
+
+ { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE },
+
+ { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_NONE },
+ { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_NONE },
+ { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_NONE },
+ { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_NONE },
+ { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_NONE },
+ { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE },
+
+ { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE },
+
+ { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_NONE },
+
+ { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_NONE },
+
+ { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE },
// FMA4 foldable patterns
{ X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 },
@@ -1420,6 +1426,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
{ X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
{ X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
+ { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
+ { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
+ { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
+ { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 }
};
for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
@@ -1536,8 +1546,8 @@ static bool isFrameLoadOpcode(int Opcode) {
case X86::VMOVDQAYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
- case X86::VMOVDQA32rm:
- case X86::VMOVDQA64rm:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVUPSZrm:
return true;
}
}
@@ -1563,6 +1573,8 @@ static bool isFrameStoreOpcode(int Opcode) {
case X86::VMOVAPSYmr:
case X86::VMOVAPDYmr:
case X86::VMOVDQAYmr:
+ case X86::VMOVUPSZmr:
+ case X86::VMOVAPSZmr:
case X86::MMX_MOVD64mr:
case X86::MMX_MOVQ64mr:
case X86::MMX_MOVNTQmr:
@@ -1621,9 +1633,9 @@ static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
return false;
bool isPICBase = false;
- for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
- E = MRI.def_end(); I != E; ++I) {
- MachineInstr *DefMI = I.getOperand().getParent();
+ for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
+ E = MRI.def_instr_end(); I != E; ++I) {
+ MachineInstr *DefMI = &*I;
if (DefMI->getOpcode() != X86::MOVPC32r)
return false;
assert(!isPICBase && "More than one PIC base?");
@@ -1809,7 +1821,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
MBB.insert(I, MI);
}
- MachineInstr *NewMI = prior(I);
+ MachineInstr *NewMI = std::prev(I);
NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
}
@@ -2452,6 +2464,41 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
}
}
+bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ switch (MI->getOpcode()) {
+ case X86::VFMADDPDr231r:
+ case X86::VFMADDPSr231r:
+ case X86::VFMADDSDr231r:
+ case X86::VFMADDSSr231r:
+ case X86::VFMSUBPDr231r:
+ case X86::VFMSUBPSr231r:
+ case X86::VFMSUBSDr231r:
+ case X86::VFMSUBSSr231r:
+ case X86::VFNMADDPDr231r:
+ case X86::VFNMADDPSr231r:
+ case X86::VFNMADDSDr231r:
+ case X86::VFNMADDSSr231r:
+ case X86::VFNMSUBPDr231r:
+ case X86::VFNMSUBPSr231r:
+ case X86::VFNMSUBSDr231r:
+ case X86::VFNMSUBSSr231r:
+ case X86::VFMADDPDr231rY:
+ case X86::VFMADDPSr231rY:
+ case X86::VFMSUBPDr231rY:
+ case X86::VFMSUBPSr231rY:
+ case X86::VFNMADDPDr231rY:
+ case X86::VFNMADDPSr231rY:
+ case X86::VFNMSUBPDr231rY:
+ case X86::VFNMSUBPSr231rY:
+ SrcOpIdx1 = 2;
+ SrcOpIdx2 = 3;
+ return true;
+ default:
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ }
+}
+
static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
switch (BrOpc) {
default: return X86::COND_INVALID;
@@ -2738,8 +2785,8 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
}
// If the block has any instructions after a JMP, delete them.
- while (llvm::next(I) != MBB.end())
- llvm::next(I)->eraseFromParent();
+ while (std::next(I) != MBB.end())
+ std::next(I)->eraseFromParent();
Cond.clear();
FBB = 0;
@@ -3015,6 +3062,11 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
return 0;
}
+inline static bool MaskRegClassContains(unsigned Reg) {
+ return X86::VK8RegClass.contains(Reg) ||
+ X86::VK16RegClass.contains(Reg) ||
+ X86::VK1RegClass.contains(Reg);
+}
static
unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
if (X86::VR128XRegClass.contains(DestReg, SrcReg) ||
@@ -3024,11 +3076,23 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
SrcReg = get512BitSuperRegister(SrcReg);
return X86::VMOVAPSZrr;
}
- if ((X86::VK8RegClass.contains(DestReg) ||
- X86::VK16RegClass.contains(DestReg)) &&
- (X86::VK8RegClass.contains(SrcReg) ||
- X86::VK16RegClass.contains(SrcReg)))
+ if (MaskRegClassContains(DestReg) &&
+ MaskRegClassContains(SrcReg))
return X86::KMOVWkk;
+ if (MaskRegClassContains(DestReg) &&
+ (X86::GR32RegClass.contains(SrcReg) ||
+ X86::GR16RegClass.contains(SrcReg) ||
+ X86::GR8RegClass.contains(SrcReg))) {
+ SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32);
+ return X86::KMOVWkr;
+ }
+ if ((X86::GR32RegClass.contains(DestReg) ||
+ X86::GR16RegClass.contains(DestReg) ||
+ X86::GR8RegClass.contains(DestReg)) &&
+ MaskRegClassContains(SrcReg)) {
+ DestReg = getX86SubSuperRegister(DestReg, MVT::i32);
+ return X86::KMOVWrk;
+ }
return 0;
}
@@ -3837,6 +3901,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
switch (MI->getOpcode()) {
+ case X86::MOV32r0:
+ return Expand2AddrUndef(MIB, get(X86::XOR32rr));
case X86::SETB_C8r:
return Expand2AddrUndef(MIB, get(X86::SBB8rr));
case X86::SETB_C16r:
@@ -3861,6 +3927,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
case X86::TEST8ri_NOREX:
MI->setDesc(get(X86::TEST8ri));
return true;
+ case X86::KSET0B:
case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr));
case X86::KSET1B:
case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr));
@@ -4198,75 +4265,10 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
MI->addRegisterKilled(Reg, TRI, true);
}
-static MachineInstr* foldPatchpoint(MachineFunction &MF,
- MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops,
- int FrameIndex,
- const TargetInstrInfo &TII) {
- unsigned StartIdx = 0;
- switch (MI->getOpcode()) {
- case TargetOpcode::STACKMAP:
- StartIdx = 2; // Skip ID, nShadowBytes.
- break;
- case TargetOpcode::PATCHPOINT: {
- // For PatchPoint, the call args are not foldable.
- PatchPointOpers opers(MI);
- StartIdx = opers.getVarIdx();
- break;
- }
- default:
- llvm_unreachable("unexpected stackmap opcode");
- }
-
- // Return false if any operands requested for folding are not foldable (not
- // part of the stackmap's live values).
- for (SmallVectorImpl<unsigned>::const_iterator I = Ops.begin(), E = Ops.end();
- I != E; ++I) {
- if (*I < StartIdx)
- return 0;
- }
-
- MachineInstr *NewMI =
- MF.CreateMachineInstr(TII.get(MI->getOpcode()), MI->getDebugLoc(), true);
- MachineInstrBuilder MIB(MF, NewMI);
-
- // No need to fold return, the meta data, and function arguments
- for (unsigned i = 0; i < StartIdx; ++i)
- MIB.addOperand(MI->getOperand(i));
-
- for (unsigned i = StartIdx; i < MI->getNumOperands(); ++i) {
- MachineOperand &MO = MI->getOperand(i);
- if (std::find(Ops.begin(), Ops.end(), i) != Ops.end()) {
- assert(MO.getReg() && "patchpoint can only fold a vreg operand");
- // Compute the spill slot size and offset.
- const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(MO.getReg());
- unsigned SpillSize;
- unsigned SpillOffset;
- bool Valid = TII.getStackSlotRange(RC, MO.getSubReg(), SpillSize,
- SpillOffset, &MF.getTarget());
- if (!Valid)
- report_fatal_error("cannot spill patchpoint subregister operand");
-
- MIB.addOperand(MachineOperand::CreateImm(StackMaps::IndirectMemRefOp));
- MIB.addOperand(MachineOperand::CreateImm(SpillSize));
- MIB.addOperand(MachineOperand::CreateFI(FrameIndex));
- addOffset(MIB, SpillOffset);
- }
- else
- MIB.addOperand(MO);
- }
- return NewMI;
-}
-
MachineInstr*
X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
int FrameIndex) const {
- // Special case stack map and patch point intrinsics.
- if (MI->getOpcode() == TargetOpcode::STACKMAP
- || MI->getOpcode() == TargetOpcode::PATCHPOINT) {
- return foldPatchpoint(MF, MI, Ops, FrameIndex, *this);
- }
// Check switch flag
if (NoFusing) return NULL;
@@ -5172,7 +5174,13 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm },
{ X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr },
{ X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm },
- { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr }
+ { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr },
+ { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
+ { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
+ { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
+ { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
+ { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}
};
// FIXME: Some shuffle and unpack instructions have equivalents in different
@@ -5315,7 +5323,7 @@ namespace {
static char ID;
CGBR() : MachineFunctionPass(ID) {}
- virtual bool runOnMachineFunction(MachineFunction &MF) {
+ bool runOnMachineFunction(MachineFunction &MF) override {
const X86TargetMachine *TM =
static_cast<const X86TargetMachine *>(&MF.getTarget());
@@ -5362,11 +5370,11 @@ namespace {
return true;
}
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "X86 PIC Global Base Reg Initialization";
}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -5382,7 +5390,7 @@ namespace {
static char ID;
LDTLSCleanup() : MachineFunctionPass(ID) {}
- virtual bool runOnMachineFunction(MachineFunction &MF) {
+ bool runOnMachineFunction(MachineFunction &MF) override {
X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
// No point folding accesses if there isn't at least two.
@@ -5475,11 +5483,11 @@ namespace {
return Copy;
}
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "Local Dynamic TLS Access Clean-up";
}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 600e392..156291e 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -14,7 +14,7 @@
#ifndef X86INSTRUCTIONINFO_H
#define X86INSTRUCTIONINFO_H
-#include "X86.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86RegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -111,23 +111,24 @@ inline static bool isScale(const MachineOperand &MO) {
inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) {
if (MI->getOperand(Op).isFI()) return true;
- return Op+4 <= MI->getNumOperands() &&
- MI->getOperand(Op ).isReg() && isScale(MI->getOperand(Op+1)) &&
- MI->getOperand(Op+2).isReg() &&
- (MI->getOperand(Op+3).isImm() ||
- MI->getOperand(Op+3).isGlobal() ||
- MI->getOperand(Op+3).isCPI() ||
- MI->getOperand(Op+3).isJTI());
+ return Op+X86::AddrSegmentReg <= MI->getNumOperands() &&
+ MI->getOperand(Op+X86::AddrBaseReg).isReg() &&
+ isScale(MI->getOperand(Op+X86::AddrScaleAmt)) &&
+ MI->getOperand(Op+X86::AddrIndexReg).isReg() &&
+ (MI->getOperand(Op+X86::AddrDisp).isImm() ||
+ MI->getOperand(Op+X86::AddrDisp).isGlobal() ||
+ MI->getOperand(Op+X86::AddrDisp).isCPI() ||
+ MI->getOperand(Op+X86::AddrDisp).isJTI());
}
inline static bool isMem(const MachineInstr *MI, unsigned Op) {
if (MI->getOperand(Op).isFI()) return true;
- return Op+5 <= MI->getNumOperands() &&
- MI->getOperand(Op+4).isReg() &&
+ return Op+X86::AddrNumOperands <= MI->getNumOperands() &&
+ MI->getOperand(Op+X86::AddrSegmentReg).isReg() &&
isLeaMem(MI, Op);
}
-class X86InstrInfo : public X86GenInstrInfo {
+class X86InstrInfo final : public X86GenInstrInfo {
X86TargetMachine &TM;
const X86RegisterInfo RI;
@@ -161,7 +162,7 @@ public:
/// such, whenever a client has an instance of instruction info, it should
/// always be able to get register info as well (through this method).
///
- virtual const X86RegisterInfo &getRegisterInfo() const { return RI; }
+ const X86RegisterInfo &getRegisterInfo() const { return RI; }
/// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
/// extension instruction. That is, it's like a copy where it's legal for the
@@ -169,30 +170,32 @@ public:
/// true, then it's expected the pre-extension value is available as a subreg
/// of the result register. This also returns the sub-register index in
/// SubIdx.
- virtual bool isCoalescableExtInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SubIdx) const;
+ bool isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const override;
- unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+ unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
/// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
/// reliable for correctness.
unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const;
+ int &FrameIndex) const override;
- unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+ unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const override;
/// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
/// reliable for correctness.
unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
- int &FrameIndex) const;
+ int &FrameIndex) const override;
bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
- AliasAnalysis *AA) const;
+ AliasAnalysis *AA) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
unsigned DestReg, unsigned SubIdx,
const MachineInstr *Orig,
- const TargetRegisterInfo &TRI) const;
+ const TargetRegisterInfo &TRI) const override;
/// Given an operand within a MachineInstr, insert preceding code to put it
/// into the right format for a particular kind of LEA instruction. This may
@@ -217,65 +220,68 @@ public:
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the new instruction.
///
- virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI,
- LiveVariables *LV) const;
+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const override;
/// commuteInstruction - We have a few instructions that must be hacked on to
/// commute them.
///
- virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;
+ MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;
+
+ bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const override;
// Branch analysis.
- virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;
- virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const;
- virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
- virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- const SmallVectorImpl<MachineOperand> &Cond,
- DebugLoc DL) const;
- virtual bool canInsertSelect(const MachineBasicBlock&,
- const SmallVectorImpl<MachineOperand> &Cond,
- unsigned, unsigned, int&, int&, int&) const;
- virtual void insertSelect(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DstReg,
- const SmallVectorImpl<MachineOperand> &Cond,
- unsigned TrueReg, unsigned FalseReg) const;
- virtual void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const;
- virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
-
- virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- MachineInstr::mmo_iterator MMOBegin,
- MachineInstr::mmo_iterator MMOEnd,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
- virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const;
-
- virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- MachineInstr::mmo_iterator MMOBegin,
- MachineInstr::mmo_iterator MMOEnd,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
- virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+ bool isUnpredicatedTerminator(const MachineInstr* MI) const override;
+ bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const override;
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ DebugLoc DL) const override;
+ bool canInsertSelect(const MachineBasicBlock&,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned, unsigned, int&, int&, int&) const override;
+ void insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DstReg,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned TrueReg, unsigned FalseReg) const override;
+ void copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const override;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
/// foldMemoryOperand - If this target supports it, fold a load or store of
/// the specified stack slot into the specified machine instruction for the
@@ -283,33 +289,33 @@ public:
/// folding and return true, otherwise it should return false. If it folds
/// the instruction, it is likely that the MachineInstruction the iterator
/// references has been changed.
- virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- int FrameIndex) const;
+ MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const override;
/// foldMemoryOperand - Same as the previous version except it allows folding
/// of any load and store from / to any address, not just from a specific
/// stack slot.
- virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr* MI,
- const SmallVectorImpl<unsigned> &Ops,
- MachineInstr* LoadMI) const;
+ MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const override;
/// canFoldMemoryOperand - Returns true if the specified load / store is
/// folding is possible.
- virtual bool canFoldMemoryOperand(const MachineInstr*,
- const SmallVectorImpl<unsigned> &) const;
+ bool canFoldMemoryOperand(const MachineInstr*,
+ const SmallVectorImpl<unsigned> &) const override;
/// unfoldMemoryOperand - Separate a single instruction which folded a load or
/// a store or a load and a store into two or more instruction. If this is
/// possible, returns true as well as the new instructions by reference.
- virtual bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
- unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
+ bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const override;
- virtual bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
- SmallVectorImpl<SDNode*> &NewNodes) const;
+ bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const override;
/// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
/// instruction after load / store are unfolded from an instruction of the
@@ -317,17 +323,17 @@ public:
/// possible. If LoadRegIndex is non-null, it is filled in with the operand
/// index of the operand which will hold the register holding the loaded
/// value.
- virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
- bool UnfoldLoad, bool UnfoldStore,
- unsigned *LoadRegIndex = 0) const;
+ unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex = 0) const override;
/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
/// to determine if two loads are loading from the same base address. It
/// should only return true if the base pointers are the same and the
/// only differences between the two addresses are the offset. It also returns
/// the offsets by reference.
- virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
- int64_t &Offset1, int64_t &Offset2) const;
+ bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+ int64_t &Offset2) const override;
/// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
@@ -337,21 +343,21 @@ public:
/// from the common base address. It returns true if it decides it's desirable
/// to schedule the two loads together. "NumLoads" is the number of loads that
/// have already been scheduled after Load1.
- virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
- int64_t Offset1, int64_t Offset2,
- unsigned NumLoads) const;
+ bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const override;
- virtual bool shouldScheduleAdjacent(MachineInstr* First,
- MachineInstr *Second) const LLVM_OVERRIDE;
+ bool shouldScheduleAdjacent(MachineInstr* First,
+ MachineInstr *Second) const override;
- virtual void getNoopForMachoTarget(MCInst &NopInst) const;
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
- virtual
- bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+ bool
+ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
/// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
/// instruction that defines the specified register class.
- bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+ bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
static bool isX86_64ExtendedReg(const MachineOperand &MO) {
if (!MO.isReg()) return false;
@@ -365,16 +371,17 @@ public:
unsigned getGlobalBaseReg(MachineFunction *MF) const;
std::pair<uint16_t, uint16_t>
- getExecutionDomain(const MachineInstr *MI) const;
+ getExecutionDomain(const MachineInstr *MI) const override;
- void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+ void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override;
- unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
- const TargetRegisterInfo *TRI) const;
+ unsigned
+ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const override;
unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
- const TargetRegisterInfo *TRI) const;
+ const TargetRegisterInfo *TRI) const override;
MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr* MI,
@@ -382,27 +389,28 @@ public:
const SmallVectorImpl<MachineOperand> &MOs,
unsigned Size, unsigned Alignment) const;
- bool isHighLatencyDef(int opc) const;
+ bool isHighLatencyDef(int opc) const override;
bool hasHighOperandLatency(const InstrItineraryData *ItinData,
const MachineRegisterInfo *MRI,
const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *UseMI, unsigned UseIdx) const;
+ const MachineInstr *UseMI,
+ unsigned UseIdx) const override;
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
- virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
- unsigned &SrcReg2,
- int &CmpMask, int &CmpValue) const;
+ bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &CmpMask,
+ int &CmpValue) const override;
/// optimizeCompareInstr - Check if there exists an earlier instruction that
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
- virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int CmpMask, int CmpValue,
- const MachineRegisterInfo *MRI) const;
+ bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int CmpMask, int CmpValue,
+ const MachineRegisterInfo *MRI) const override;
/// optimizeLoadInstr - Try to remove the load by folding it to a register
/// operand at the use. We fold the load instructions if and only if the
@@ -411,10 +419,10 @@ public:
/// defined by the load we are trying to fold. DefMI returns the machine
/// instruction that defines FoldAsLoadDefReg, and the function returns
/// the machine instruction generated due to folding.
- virtual MachineInstr* optimizeLoadInstr(MachineInstr *MI,
- const MachineRegisterInfo *MRI,
- unsigned &FoldAsLoadDefReg,
- MachineInstr *&DefMI) const;
+ MachineInstr* optimizeLoadInstr(MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const override;
private:
MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 6e5d543..8edf873 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -23,8 +23,8 @@ def SDTIntShiftDOp: SDTypeProfile<1, 3,
def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
-def SDTX86Cmpsd : SDTypeProfile<1, 3, [SDTCisVT<0, f64>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
-def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
def SDTX86Cmov : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
@@ -249,9 +249,6 @@ def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags,
def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
-def X86blsi : SDNode<"X86ISD::BLSI", SDTIntUnaryOp>;
-def X86blsmsk : SDNode<"X86ISD::BLSMSK", SDTIntUnaryOp>;
-def X86blsr : SDNode<"X86ISD::BLSR", SDTIntUnaryOp>;
def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntShiftOp>;
def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
@@ -445,6 +442,46 @@ def brtarget8 : Operand<OtherVT>;
}
+def X86SrcIdx8Operand : AsmOperandClass {
+ let Name = "SrcIdx8";
+ let RenderMethod = "addSrcIdxOperands";
+ let SuperClasses = [X86Mem8AsmOperand];
+}
+def X86SrcIdx16Operand : AsmOperandClass {
+ let Name = "SrcIdx16";
+ let RenderMethod = "addSrcIdxOperands";
+ let SuperClasses = [X86Mem16AsmOperand];
+}
+def X86SrcIdx32Operand : AsmOperandClass {
+ let Name = "SrcIdx32";
+ let RenderMethod = "addSrcIdxOperands";
+ let SuperClasses = [X86Mem32AsmOperand];
+}
+def X86SrcIdx64Operand : AsmOperandClass {
+ let Name = "SrcIdx64";
+ let RenderMethod = "addSrcIdxOperands";
+ let SuperClasses = [X86Mem64AsmOperand];
+}
+def X86DstIdx8Operand : AsmOperandClass {
+ let Name = "DstIdx8";
+ let RenderMethod = "addDstIdxOperands";
+ let SuperClasses = [X86Mem8AsmOperand];
+}
+def X86DstIdx16Operand : AsmOperandClass {
+ let Name = "DstIdx16";
+ let RenderMethod = "addDstIdxOperands";
+ let SuperClasses = [X86Mem16AsmOperand];
+}
+def X86DstIdx32Operand : AsmOperandClass {
+ let Name = "DstIdx32";
+ let RenderMethod = "addDstIdxOperands";
+ let SuperClasses = [X86Mem32AsmOperand];
+}
+def X86DstIdx64Operand : AsmOperandClass {
+ let Name = "DstIdx64";
+ let RenderMethod = "addDstIdxOperands";
+ let SuperClasses = [X86Mem64AsmOperand];
+}
def X86MemOffs8AsmOperand : AsmOperandClass {
let Name = "MemOffs8";
let RenderMethod = "addMemOffsOperands";
@@ -465,19 +502,54 @@ def X86MemOffs64AsmOperand : AsmOperandClass {
let RenderMethod = "addMemOffsOperands";
let SuperClasses = [X86Mem64AsmOperand];
}
-
let OperandType = "OPERAND_MEMORY" in {
-def offset8 : Operand<i64> {
+def srcidx8 : Operand<iPTR> {
+ let ParserMatchClass = X86SrcIdx8Operand;
+ let MIOperandInfo = (ops ptr_rc, i8imm);
+ let PrintMethod = "printSrcIdx8"; }
+def srcidx16 : Operand<iPTR> {
+ let ParserMatchClass = X86SrcIdx16Operand;
+ let MIOperandInfo = (ops ptr_rc, i8imm);
+ let PrintMethod = "printSrcIdx16"; }
+def srcidx32 : Operand<iPTR> {
+ let ParserMatchClass = X86SrcIdx32Operand;
+ let MIOperandInfo = (ops ptr_rc, i8imm);
+ let PrintMethod = "printSrcIdx32"; }
+def srcidx64 : Operand<iPTR> {
+ let ParserMatchClass = X86SrcIdx64Operand;
+ let MIOperandInfo = (ops ptr_rc, i8imm);
+ let PrintMethod = "printSrcIdx64"; }
+def dstidx8 : Operand<iPTR> {
+ let ParserMatchClass = X86DstIdx8Operand;
+ let MIOperandInfo = (ops ptr_rc);
+ let PrintMethod = "printDstIdx8"; }
+def dstidx16 : Operand<iPTR> {
+ let ParserMatchClass = X86DstIdx16Operand;
+ let MIOperandInfo = (ops ptr_rc);
+ let PrintMethod = "printDstIdx16"; }
+def dstidx32 : Operand<iPTR> {
+ let ParserMatchClass = X86DstIdx32Operand;
+ let MIOperandInfo = (ops ptr_rc);
+ let PrintMethod = "printDstIdx32"; }
+def dstidx64 : Operand<iPTR> {
+ let ParserMatchClass = X86DstIdx64Operand;
+ let MIOperandInfo = (ops ptr_rc);
+ let PrintMethod = "printDstIdx64"; }
+def offset8 : Operand<iPTR> {
let ParserMatchClass = X86MemOffs8AsmOperand;
+ let MIOperandInfo = (ops i64imm, i8imm);
let PrintMethod = "printMemOffs8"; }
-def offset16 : Operand<i64> {
+def offset16 : Operand<iPTR> {
let ParserMatchClass = X86MemOffs16AsmOperand;
+ let MIOperandInfo = (ops i64imm, i8imm);
let PrintMethod = "printMemOffs16"; }
-def offset32 : Operand<i64> {
+def offset32 : Operand<iPTR> {
let ParserMatchClass = X86MemOffs32AsmOperand;
+ let MIOperandInfo = (ops i64imm, i8imm);
let PrintMethod = "printMemOffs32"; }
-def offset64 : Operand<i64> {
+def offset64 : Operand<iPTR> {
let ParserMatchClass = X86MemOffs64AsmOperand;
+ let MIOperandInfo = (ops i64imm, i8imm);
let PrintMethod = "printMemOffs64"; }
}
@@ -510,6 +582,10 @@ def GR32orGR64 : RegisterOperand<GR32> {
let ParserMatchClass = X86GR32orGR64AsmOperand;
}
+def AVX512RC : Operand<i32> {
+ let PrintMethod = "printRoundingControl";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
// Sign-extended immediate classes. We don't need to define the full lattice
// here because there is no instruction with an ambiguity between ImmSExti64i32
// and ImmSExti32i8.
@@ -657,7 +733,8 @@ def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
-def HasAVX512 : Predicate<"Subtarget->hasAVX512()">;
+def HasAVX512 : Predicate<"Subtarget->hasAVX512()">,
+ AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">;
def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
@@ -691,10 +768,16 @@ def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
-def In32BitMode : Predicate<"!Subtarget->is64Bit()">,
- AssemblerPredicate<"!Mode64Bit", "32-bit mode">;
+def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
+ AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
AssemblerPredicate<"Mode64Bit", "64-bit mode">;
+def In16BitMode : Predicate<"Subtarget->is16Bit()">,
+ AssemblerPredicate<"Mode16Bit", "16-bit mode">;
+def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
+ AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">;
+def In32BitMode : Predicate<"Subtarget->is32Bit()">,
+ AssemblerPredicate<"Mode32Bit", "32-bit mode">;
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
@@ -845,10 +928,10 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
// Nop
let neverHasSideEffects = 1, SchedRW = [WriteZero] in {
def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
- def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero),
- "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize;
- def NOOPL : I<0x1f, MRM0m, (outs), (ins i32mem:$zero),
- "nop{l}\t$zero", [], IIC_NOP>, TB;
+ def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero),
+ "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
+ def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero),
+ "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32;
}
@@ -860,7 +943,7 @@ let SchedRW = [WriteALU] in {
let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
def LEAVE : I<0xC9, RawFrm,
(outs), (ins), "leave", [], IIC_LEAVE>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
def LEAVE64 : I<0xC9, RawFrm,
@@ -875,98 +958,110 @@ def LEAVE64 : I<0xC9, RawFrm,
let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
let mayLoad = 1, SchedRW = [WriteLoad] in {
def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
- IIC_POP_REG16>, OpSize;
+ IIC_POP_REG16>, OpSize16;
def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
- IIC_POP_REG>;
+ IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
- IIC_POP_REG>, OpSize;
+ IIC_POP_REG>, OpSize16;
def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
- IIC_POP_MEM>, OpSize;
+ IIC_POP_MEM>, OpSize16;
def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
- IIC_POP_REG>;
+ IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
- IIC_POP_MEM>;
+ IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>;
-def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize;
+def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>,
+ OpSize16;
def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
- Requires<[In32BitMode]>;
+ OpSize32, Requires<[Not64BitMode]>;
} // mayLoad, SchedRW
let mayStore = 1, SchedRW = [WriteStore] in {
def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
- IIC_PUSH_REG>, OpSize;
+ IIC_PUSH_REG>, OpSize16;
def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
- IIC_PUSH_REG>;
+ IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
- IIC_PUSH_REG>, OpSize;
+ IIC_PUSH_REG>, OpSize16;
def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
- IIC_PUSH_MEM>,
- OpSize;
+ IIC_PUSH_MEM>, OpSize16;
def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
- IIC_PUSH_REG>;
+ IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
- IIC_PUSH_MEM>;
-
-def PUSHi8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
- "push{l}\t$imm", [], IIC_PUSH_IMM>;
+ IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>;
+
+def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm),
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16,
+ Requires<[Not64BitMode]>;
+def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
+ "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ Requires<[Not64BitMode]>;
def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
- "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize;
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16,
+ Requires<[Not64BitMode]>;
def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
- "push{l}\t$imm", [], IIC_PUSH_IMM>;
+ "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ Requires<[Not64BitMode]>;
def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
- OpSize;
+ OpSize16;
def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
- Requires<[In32BitMode]>;
+ OpSize32, Requires<[Not64BitMode]>;
} // mayStore, SchedRW
}
let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
let mayLoad = 1, SchedRW = [WriteLoad] in {
-def POP64r : I<0x58, AddRegFrm,
- (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>;
+def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
+ IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
- IIC_POP_REG>;
+ IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [],
- IIC_POP_MEM>;
+ IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>;
} // mayLoad, SchedRW
let mayStore = 1, SchedRW = [WriteStore] in {
-def PUSH64r : I<0x50, AddRegFrm,
- (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>;
+def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
+ IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
- IIC_PUSH_REG>;
+ IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
- IIC_PUSH_MEM>;
+ IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>;
} // mayStore, SchedRW
}
let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1,
SchedRW = [WriteStore] in {
def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
- "push{q}\t$imm", [], IIC_PUSH_IMM>;
+ "push{q}\t$imm", [], IIC_PUSH_IMM>, Requires<[In64BitMode]>;
def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
- "push{q}\t$imm", [], IIC_PUSH_IMM>;
-def PUSH64i32 : Ii32<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
- "push{q}\t$imm", [], IIC_PUSH_IMM>;
+ "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16,
+ Requires<[In64BitMode]>;
+def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
+ "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ Requires<[In64BitMode]>;
}
let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in
def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
- Requires<[In64BitMode]>, Sched<[WriteLoad]>;
+ OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>;
let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in
def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
- Requires<[In64BitMode]>, Sched<[WriteStore]>;
+ OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>;
let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
mayLoad = 1, neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
-def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", [], IIC_POP_A>,
- Requires<[In32BitMode]>;
+def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>,
+ OpSize32, Requires<[Not64BitMode]>;
+def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>,
+ OpSize16, Requires<[Not64BitMode]>;
}
let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
-def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", [], IIC_PUSH_A>,
- Requires<[In32BitMode]>;
+def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>,
+ OpSize32, Requires<[Not64BitMode]>;
+def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>,
+ OpSize16, Requires<[Not64BitMode]>;
}
let Constraints = "$src = $dst", SchedRW = [WriteALU] in {
@@ -974,7 +1069,7 @@ let Constraints = "$src = $dst", SchedRW = [WriteALU] in {
def BSWAP32r : I<0xC8, AddRegFrm,
(outs GR32:$dst), (ins GR32:$src),
"bswap{l}\t$dst",
- [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, TB;
+ [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, OpSize32, TB;
def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
"bswap{q}\t$dst",
@@ -986,86 +1081,106 @@ let Defs = [EFLAGS] in {
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
- IIC_BIT_SCAN_REG>, TB, OpSize, Sched<[WriteShift]>;
+ IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
- IIC_BIT_SCAN_MEM>, TB, OpSize, Sched<[WriteShiftLd]>;
+ IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))],
- IIC_BIT_SCAN_REG>, TB,
- Sched<[WriteShift]>;
+ IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
- IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
+ IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
- IIC_BIT_SCAN_REG>, TB, Sched<[WriteShift]>;
+ IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
- IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
+ IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))],
- IIC_BIT_SCAN_REG>,
- TB, OpSize, Sched<[WriteShift]>;
+ IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
- IIC_BIT_SCAN_MEM>, TB,
- OpSize, Sched<[WriteShiftLd]>;
+ IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))],
- IIC_BIT_SCAN_REG>, TB,
- Sched<[WriteShift]>;
+ IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
- IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
+ IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BIT_SCAN_REG>, TB,
- Sched<[WriteShift]>;
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))],
+ IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
- IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
+ IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
} // Defs = [EFLAGS]
let SchedRW = [WriteMicrocoded] in {
// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
-def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", [], IIC_MOVS>;
-def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", [], IIC_MOVS>, OpSize;
-def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", [], IIC_MOVS>;
-def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", [], IIC_MOVS>;
+def MOVSB : I<0xA4, RawFrmDstSrc, (outs dstidx8:$dst), (ins srcidx8:$src),
+ "movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+def MOVSW : I<0xA5, RawFrmDstSrc, (outs dstidx16:$dst), (ins srcidx16:$src),
+ "movsw\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize16;
+def MOVSL : I<0xA5, RawFrmDstSrc, (outs dstidx32:$dst), (ins srcidx32:$src),
+ "movs{l|d}\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize32;
+def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs dstidx64:$dst), (ins srcidx64:$src),
+ "movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
}
// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
-def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", [], IIC_STOS>;
+def STOSB : I<0xAA, RawFrmDst, (outs dstidx8:$dst), (ins),
+ "stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>;
let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
-def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", [], IIC_STOS>, OpSize;
+def STOSW : I<0xAB, RawFrmDst, (outs dstidx16:$dst), (ins),
+ "stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16;
let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
-def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", [], IIC_STOS>;
+def STOSL : I<0xAB, RawFrmDst, (outs dstidx32:$dst), (ins),
+ "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32;
let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in
-def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", [], IIC_STOS>;
+def STOSQ : RI<0xAB, RawFrmDst, (outs dstidx64:$dst), (ins),
+ "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>;
-def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", [], IIC_SCAS>;
-def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", [], IIC_SCAS>, OpSize;
-def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", [], IIC_SCAS>;
-def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", [], IIC_SCAS>;
+// These use the DF flag in the EFLAGS register to inc or dec EDI
+let Defs = [EDI,EFLAGS], Uses = [AL,EDI,EFLAGS] in
+def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst),
+ "scasb\t{$dst, %al|al, $dst}", [], IIC_SCAS>;
+let Defs = [EDI,EFLAGS], Uses = [AX,EDI,EFLAGS] in
+def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst),
+ "scasw\t{$dst, %ax|ax, $dst}", [], IIC_SCAS>, OpSize16;
+let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,EFLAGS] in
+def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst),
+ "scas{l|d}\t{$dst, %eax|eax, $dst}", [], IIC_SCAS>, OpSize32;
+let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,EFLAGS] in
+def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst),
+ "scasq\t{$dst, %rax|rax, $dst}", [], IIC_SCAS>;
-def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", [], IIC_CMPS>;
-def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", [], IIC_CMPS>, OpSize;
-def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", [], IIC_CMPS>;
-def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", [], IIC_CMPS>;
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,EFLAGS] in {
+def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
+ "cmpsb\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
+ "cmpsw\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize16;
+def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
+ "cmps{l|d}\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize32;
+def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
+ "cmpsq\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+}
} // SchedRW
//===----------------------------------------------------------------------===//
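A note on the string-instruction remodeling above: giving MOVS/STOS/SCAS/CMPS explicit srcidx/dstidx operands only changes how the assembler and disassembler model them; execution still addresses memory through the implicit (R)SI/(R)DI registers, stepped up or down according to EFLAGS.DF. A minimal C++ sketch of that implicit-register behaviour, using the familiar rep movsb idiom (illustrative only, not part of the patch; assumes x86-64 and GCC/Clang extended inline asm):

    #include <cstddef>

    // Copies n bytes from src to dst with "rep movsb": RSI/RDI are the implicit
    // source/destination pointers, RCX is the count, and EFLAGS.DF selects the
    // stepping direction (clear on entry under the SysV ABI, so both increment).
    static void copy_rep_movsb(void *dst, const void *src, std::size_t n) {
      asm volatile("rep movsb"
                   : "+D"(dst), "+S"(src), "+c"(n)
                   :
                   : "memory");
    }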
@@ -1076,9 +1191,9 @@ let neverHasSideEffects = 1 in {
def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
"mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
}
@@ -1089,16 +1204,28 @@ def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
[(set GR8:$dst, imm:$src)], IIC_MOV>;
def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize;
+ [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16;
def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, imm:$src)], IIC_MOV>;
+ [(set GR32:$dst, imm:$src)], IIC_MOV>, OpSize32;
+def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
+}
+let isReMaterializable = 1 in {
def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
"movabs{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, imm:$src)], IIC_MOV>;
-def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
- "mov{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
+}
+
+// Longer forms that use a ModR/M byte. Needed for disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
}
} // SchedRW
@@ -1108,84 +1235,125 @@ def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
[(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>;
def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize;
+ [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>;
-def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
- "mov{q}\t{$src, $dst|$dst, $src}",
- [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
+ [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
+def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
} // SchedRW
let hasSideEffects = 0 in {
/// moffs8, moffs16 and moffs32 versions of moves. The immediate is a
-/// 32-bit offset from the PC. These are only valid in x86-32 mode.
+/// 32-bit offset from the segment base. These are only valid in x86-32 mode.
let SchedRW = [WriteALU] in {
let mayLoad = 1 in {
-def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
+let Defs = [AL] in
+def MOV8o8a : Ii32 <0xA0, RawFrmMemOffs, (outs), (ins offset8:$src),
"mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
-def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
- "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, OpSize,
- Requires<[In32BitMode]>;
-def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
+let Defs = [AX] in
+def MOV16o16a : Ii32 <0xA1, RawFrmMemOffs, (outs), (ins offset16:$src),
+ "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+ OpSize16, Requires<[In32BitMode]>;
+let Defs = [EAX] in
+def MOV32o32a : Ii32 <0xA1, RawFrmMemOffs, (outs), (ins offset32:$src),
"mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
- Requires<[In32BitMode]>;
+ OpSize32, Requires<[In32BitMode]>;
+
+let Defs = [AL] in
+def MOV8o8a_16 : Ii16 <0xA0, RawFrmMemOffs, (outs), (ins offset8:$src),
+ "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
+ AdSize, Requires<[In16BitMode]>;
+let Defs = [AX] in
+def MOV16o16a_16 : Ii16 <0xA1, RawFrmMemOffs, (outs), (ins offset16:$src),
+ "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+ OpSize16, AdSize, Requires<[In16BitMode]>;
+let Defs = [EAX] in
+def MOV32o32a_16 : Ii16 <0xA1, RawFrmMemOffs, (outs), (ins offset32:$src),
+ "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+ AdSize, OpSize32, Requires<[In16BitMode]>;
}
let mayStore = 1 in {
-def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
+let Uses = [AL] in
+def MOV8ao8 : Ii32 <0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins),
"mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
-def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins),
- "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, OpSize,
- Requires<[In32BitMode]>;
-def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
+let Uses = [AX] in
+def MOV16ao16 : Ii32 <0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins),
+ "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+ OpSize16, Requires<[In32BitMode]>;
+let Uses = [EAX] in
+def MOV32ao32 : Ii32 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins),
"mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
- Requires<[In32BitMode]>;
+ OpSize32, Requires<[In32BitMode]>;
+
+let Uses = [AL] in
+def MOV8ao8_16 : Ii16 <0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins),
+ "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
+ AdSize, Requires<[In16BitMode]>;
+let Uses = [AX] in
+def MOV16ao16_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins),
+ "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+ OpSize16, AdSize, Requires<[In16BitMode]>;
+let Uses = [EAX] in
+def MOV32ao32_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins),
+ "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+ OpSize32, AdSize, Requires<[In16BitMode]>;
}
}
// These forms all have full 64-bit absolute addresses in their instructions
// and use the movabs mnemonic to indicate this specific form.
let mayLoad = 1 in {
-def MOV64o8a : RIi64_NOREX<0xA0, RawFrm, (outs), (ins offset8:$src),
+let Defs = [AL] in
+def MOV64o8a : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset8:$src),
"movabs{b}\t{$src, %al|al, $src}", []>,
Requires<[In64BitMode]>;
-def MOV64o16a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset16:$src),
- "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize,
+let Defs = [AX] in
+def MOV64o16a : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset16:$src),
+ "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16,
Requires<[In64BitMode]>;
-def MOV64o32a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset32:$src),
- "movabs{l}\t{$src, %eax|eax, $src}", []>,
+let Defs = [EAX] in
+def MOV64o32a : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset32:$src),
+ "movabs{l}\t{$src, %eax|eax, $src}", []>, OpSize32,
Requires<[In64BitMode]>;
-def MOV64o64a : RIi64<0xA1, RawFrm, (outs), (ins offset64:$src),
+let Defs = [RAX] in
+def MOV64o64a : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64:$src),
"movabs{q}\t{$src, %rax|rax, $src}", []>,
Requires<[In64BitMode]>;
}
let mayStore = 1 in {
-def MOV64ao8 : RIi64_NOREX<0xA2, RawFrm, (outs offset8:$dst), (ins),
+let Uses = [AL] in
+def MOV64ao8 : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins),
"movabs{b}\t{%al, $dst|$dst, al}", []>,
Requires<[In64BitMode]>;
-def MOV64ao16 : RIi64_NOREX<0xA3, RawFrm, (outs offset16:$dst), (ins),
- "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize,
+let Uses = [AX] in
+def MOV64ao16 : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins),
+ "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16,
Requires<[In64BitMode]>;
-def MOV64ao32 : RIi64_NOREX<0xA3, RawFrm, (outs offset32:$dst), (ins),
- "movabs{l}\t{%eax, $dst|$dst, eax}", []>,
+let Uses = [EAX] in
+def MOV64ao32 : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins),
+ "movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32,
Requires<[In64BitMode]>;
-def MOV64ao64 : RIi64<0xA3, RawFrm, (outs offset64:$dst), (ins),
+let Uses = [RAX] in
+def MOV64ao64 : RIi64<0xA3, RawFrmMemOffs, (outs offset64:$dst), (ins),
"movabs{q}\t{%rax, $dst|$dst, rax}", []>,
Requires<[In64BitMode]>;
}
} // hasSideEffects = 0
-let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteMove] in {
def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
"mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
}
@@ -1196,10 +1364,10 @@ def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
[(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize;
+ [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize16;
def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>;
+ [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>, OpSize32;
def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
@@ -1211,10 +1379,10 @@ def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
[(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize;
+ [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize16;
def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>;
+ [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>, OpSize32;
def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
@@ -1261,10 +1429,11 @@ let SchedRW = [WriteALU] in {
def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
- OpSize, TB;
+ OpSize16, TB;
def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>, TB;
+ [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>,
+ OpSize32, TB;
def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB;
@@ -1281,13 +1450,13 @@ let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in {
// [(X86bt (loadi16 addr:$src1), GR16:$src2),
// (implicit EFLAGS)]
[], IIC_BT_MR
- >, OpSize, TB, Requires<[FastBTMem]>;
+ >, OpSize16, TB, Requires<[FastBTMem]>;
def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
// [(X86bt (loadi32 addr:$src1), GR32:$src2),
// (implicit EFLAGS)]
[], IIC_BT_MR
- >, TB, Requires<[FastBTMem]>;
+ >, OpSize32, TB, Requires<[FastBTMem]>;
def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
// [(X86bt (loadi64 addr:$src1), GR64:$src2),
@@ -1300,11 +1469,11 @@ let SchedRW = [WriteALU] in {
def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
- IIC_BT_RI>, OpSize, TB;
+ IIC_BT_RI>, OpSize16, TB;
def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))],
- IIC_BT_RI>, TB;
+ IIC_BT_RI>, OpSize32, TB;
def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
@@ -1318,11 +1487,11 @@ let SchedRW = [WriteALU] in {
def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
- ], IIC_BT_MI>, OpSize, TB;
+ ], IIC_BT_MI>, OpSize16, TB;
def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
- ], IIC_BT_MI>, TB;
+ ], IIC_BT_MI>, OpSize32, TB;
def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi64 addr:$src1),
@@ -1333,9 +1502,10 @@ let hasSideEffects = 0 in {
let SchedRW = [WriteALU] in {
def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
- OpSize, TB;
+ OpSize16, TB;
def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize32, TB;
def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
} // SchedRW
@@ -1343,9 +1513,10 @@ def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
- OpSize, TB;
+ OpSize16, TB;
def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize32, TB;
def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
}
@@ -1353,9 +1524,10 @@ def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
let SchedRW = [WriteALU] in {
def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
- OpSize, TB;
+ OpSize16, TB;
def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize32, TB;
def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
} // SchedRW
@@ -1363,9 +1535,10 @@ def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
- OpSize, TB;
+ OpSize16, TB;
def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+ "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize32, TB;
def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
}
@@ -1373,9 +1546,10 @@ def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
let SchedRW = [WriteALU] in {
def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
- OpSize, TB;
+ OpSize16, TB;
def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize32, TB;
def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // SchedRW
@@ -1383,9 +1557,10 @@ def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
- OpSize, TB;
+ OpSize16, TB;
def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize32, TB;
def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
}
@@ -1393,9 +1568,10 @@ def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
let SchedRW = [WriteALU] in {
def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
- OpSize, TB;
+ OpSize16, TB;
def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize32, TB;
def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
} // SchedRW
@@ -1403,9 +1579,10 @@ def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
- OpSize, TB;
+ OpSize16, TB;
def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+ "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize32, TB;
def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
}
@@ -1413,19 +1590,21 @@ def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
let SchedRW = [WriteALU] in {
def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
- OpSize, TB;
+ OpSize16, TB;
def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+ OpSize32, TB;
def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
- OpSize, TB;
+ "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize16, TB;
def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+ OpSize32, TB;
def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
}
@@ -1433,9 +1612,10 @@ def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
let SchedRW = [WriteALU] in {
def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
- OpSize, TB;
+ OpSize16, TB;
def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+ OpSize32, TB;
def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
} // SchedRW
@@ -1443,9 +1623,10 @@ def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
- OpSize, TB;
+ OpSize16, TB;
def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
- "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+ "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+ OpSize32, TB;
def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
}
@@ -1475,14 +1656,14 @@ multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
[(set
GR16:$dst,
(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
- itin>, OpSize;
+ itin>, OpSize16;
def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$val, i32mem:$ptr),
!strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
[(set
GR32:$dst,
(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
- itin>;
+ itin>, OpSize32;
def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$val, i64mem:$ptr),
!strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
@@ -1501,24 +1682,30 @@ let Constraints = "$val = $dst" in {
def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
"xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
- "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, OpSize;
+ "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
+ OpSize16;
def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
- "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+ "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
+ OpSize32;
def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
"xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
}
// Swap between EAX and other registers.
+let Uses = [AX], Defs = [AX] in
def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
- "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize;
+ "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize16;
+let Uses = [EAX], Defs = [EAX] in
def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
"xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
- Requires<[In32BitMode]>;
+ OpSize32, Requires<[Not64BitMode]>;
+let Uses = [EAX], Defs = [EAX] in
// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
"xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
- Requires<[In64BitMode]>;
+ OpSize32, Requires<[In64BitMode]>;
+let Uses = [RAX], Defs = [RAX] in
def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
"xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>;
} // SchedRW
@@ -1528,9 +1715,10 @@ def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
"xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
"xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
- OpSize;
+ OpSize16;
def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+ "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+ OpSize32;
def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
"xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
} // SchedRW
@@ -1540,9 +1728,10 @@ def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
"xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
- OpSize;
+ OpSize16;
def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+ "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+ OpSize32;
def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
@@ -1554,10 +1743,10 @@ def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
IIC_CMPXCHG_REG8>, TB;
def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
"cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_REG>, TB, OpSize;
+ IIC_CMPXCHG_REG>, TB, OpSize16;
def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
"cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_REG>, TB;
+ IIC_CMPXCHG_REG>, TB, OpSize32;
def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
"cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
IIC_CMPXCHG_REG>, TB;
@@ -1570,10 +1759,10 @@ def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
IIC_CMPXCHG_MEM8>, TB;
def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_MEM>, TB, OpSize;
+ IIC_CMPXCHG_MEM>, TB, OpSize16;
def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
- IIC_CMPXCHG_MEM>, TB;
+ IIC_CMPXCHG_MEM>, TB, OpSize32;
def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
IIC_CMPXCHG_MEM>, TB;
@@ -1594,7 +1783,8 @@ def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>;
// Rex64 instruction prefix
-def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>;
+def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>,
+ Requires<[In64BitMode]>;
// Data16 instruction prefix
def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>;
@@ -1611,16 +1801,41 @@ def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
// String manipulation instructions
let SchedRW = [WriteMicrocoded] in {
-def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", [], IIC_LODS>;
-def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", [], IIC_LODS>, OpSize;
-def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", [], IIC_LODS>;
-def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", [], IIC_LODS>;
+// These use the DF flag in the EFLAGS register to inc or dec ESI
+let Defs = [AL,ESI], Uses = [ESI,EFLAGS] in
+def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src),
+ "lodsb\t{$src, %al|al, $src}", [], IIC_LODS>;
+let Defs = [AX,ESI], Uses = [ESI,EFLAGS] in
+def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src),
+ "lodsw\t{$src, %ax|ax, $src}", [], IIC_LODS>, OpSize16;
+let Defs = [EAX,ESI], Uses = [ESI,EFLAGS] in
+def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src),
+ "lods{l|d}\t{$src, %eax|eax, $src}", [], IIC_LODS>, OpSize32;
+let Defs = [RAX,ESI], Uses = [ESI,EFLAGS] in
+def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src),
+ "lodsq\t{$src, %rax|rax, $src}", [], IIC_LODS>;
}
let SchedRW = [WriteSystem] in {
-def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", [], IIC_OUTS>;
-def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", [], IIC_OUTS>, OpSize;
-def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", [], IIC_OUTS>;
+// These use the DF flag in the EFLAGS register to inc or dec ESI
+let Defs = [ESI], Uses = [DX,ESI,EFLAGS] in {
+def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src),
+ "outsb\t{$src, %dx|dx, $src}", [], IIC_OUTS>;
+def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src),
+ "outsw\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize16;
+def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src),
+ "outs{l|d}\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize32;
+}
+
+// These use the DF flag in the EFLAGS register to inc or dec EDI
+let Defs = [EDI], Uses = [DX,EDI,EFLAGS] in {
+def INSB : I<0x6C, RawFrmDst, (outs dstidx8:$dst), (ins),
+ "insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>;
+def INSW : I<0x6D, RawFrmDst, (outs dstidx16:$dst), (ins),
+ "insw\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize16;
+def INSL : I<0x6D, RawFrmDst, (outs dstidx32:$dst), (ins),
+ "ins{l|d}\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize32;
+}
}
// Flag instructions
@@ -1644,50 +1859,50 @@ let SchedRW = [WriteMicrocoded] in {
// ASCII Adjust After Addition
// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
// ASCII Adjust AX Before Division
// sets AL, AH and EFLAGS and uses AL and AH
def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
- "aad\t$src", [], IIC_AAD>, Requires<[In32BitMode]>;
+ "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>;
// ASCII Adjust AX After Multiply
// sets AL, AH and EFLAGS and uses AL
def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
- "aam\t$src", [], IIC_AAM>, Requires<[In32BitMode]>;
+ "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>;
// ASCII Adjust AL After Subtraction - sets
// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
// Decimal Adjust AL after Addition
// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
// Decimal Adjust AL after Subtraction
// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
} // SchedRW
let SchedRW = [WriteSystem] in {
// Check Array Index Against Bounds
def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize,
- Requires<[In32BitMode]>;
+ "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize16,
+ Requires<[Not64BitMode]>;
def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>,
- Requires<[In32BitMode]>;
+ "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize32,
+ Requires<[Not64BitMode]>;
// Adjust RPL Field of Segment Selector
def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
"arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -1698,29 +1913,29 @@ let Predicates = [HasMOVBE] in {
def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>,
- OpSize, T8;
+ OpSize16, T8PS;
def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>,
- T8;
+ OpSize32, T8PS;
def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>,
- T8;
+ T8PS;
}
let SchedRW = [WriteStore] in {
def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
[(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>,
- OpSize, T8;
+ OpSize16, T8PS;
def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
[(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>,
- T8;
+ OpSize32, T8PS;
def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
[(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>,
- T8;
+ T8PS;
}
}
@@ -1730,10 +1945,10 @@ let Predicates = [HasMOVBE] in {
let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
"rdrand{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize, TB;
+ [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize16, TB;
def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
"rdrand{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86rdrand))]>, TB;
+ [(set GR32:$dst, EFLAGS, (X86rdrand))]>, OpSize32, TB;
def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
"rdrand{q}\t$dst",
[(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB;
@@ -1745,10 +1960,10 @@ let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
let Predicates = [HasRDSEED], Defs = [EFLAGS] in {
def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins),
"rdseed{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize, TB;
+ [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB;
def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
"rdseed{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86rdseed))]>, TB;
+ [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB;
def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins),
"rdseed{q}\t$dst",
[(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB;
@@ -1761,19 +1976,20 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"lzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS,
- OpSize;
+ OpSize16;
def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"lzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctlz (loadi16 addr:$src))),
- (implicit EFLAGS)]>, XS, OpSize;
+ (implicit EFLAGS)]>, XS, OpSize16;
def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"lzcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS;
+ [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS,
+ OpSize32;
def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"lzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctlz (loadi32 addr:$src))),
- (implicit EFLAGS)]>, XS;
+ (implicit EFLAGS)]>, XS, OpSize32;
def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"lzcnt{q}\t{$src, $dst|$dst, $src}",
@@ -1792,19 +2008,20 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in {
def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"tzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS,
- OpSize;
+ OpSize16;
def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"tzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (cttz (loadi16 addr:$src))),
- (implicit EFLAGS)]>, XS, OpSize;
+ (implicit EFLAGS)]>, XS, OpSize16;
def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"tzcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS;
+ [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS,
+ OpSize32;
def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"tzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (cttz (loadi32 addr:$src))),
- (implicit EFLAGS)]>, XS;
+ (implicit EFLAGS)]>, XS, OpSize32;
def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"tzcnt{q}\t{$src, $dst|$dst, $src}",
@@ -1817,30 +2034,47 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in {
}
multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
- RegisterClass RC, X86MemOperand x86memop, SDNode OpNode,
- PatFrag ld_frag> {
+ RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (OpNode RC:$src)), (implicit EFLAGS)]>, T8, VEX_4V;
+ []>, T8PS, VEX_4V;
+ let mayLoad = 1 in
def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (OpNode (ld_frag addr:$src))), (implicit EFLAGS)]>,
- T8, VEX_4V;
+ []>, T8PS, VEX_4V;
+}
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
- defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem,
- X86blsr, loadi32>;
- defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem,
- X86blsr, loadi64>, VEX_W;
- defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem,
- X86blsmsk, loadi32>;
- defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem,
- X86blsmsk, loadi64>, VEX_W;
- defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem,
- X86blsi, loadi32>;
- defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem,
- X86blsi, loadi64>, VEX_W;
+ defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>;
+ defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W;
+ defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>;
+ defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W;
+ defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>;
+ defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto generate BMI instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasBMI] in {
+ // FIXME: patterns for the load versions are not implemented
+ def : Pat<(and GR32:$src, (add GR32:$src, -1)),
+ (BLSR32rr GR32:$src)>;
+ def : Pat<(and GR64:$src, (add GR64:$src, -1)),
+ (BLSR64rr GR64:$src)>;
+
+ def : Pat<(xor GR32:$src, (add GR32:$src, -1)),
+ (BLSMSK32rr GR32:$src)>;
+ def : Pat<(xor GR64:$src, (add GR64:$src, -1)),
+ (BLSMSK64rr GR64:$src)>;
+
+ def : Pat<(and GR32:$src, (ineg GR32:$src)),
+ (BLSI32rr GR32:$src)>;
+ def : Pat<(and GR64:$src, (ineg GR64:$src)),
+ (BLSI64rr GR64:$src)>;
}
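
For reference, a minimal C sketch (not part of the patch; the helper names are made up) of the scalar identities the three Pat<> fragments above match, i.e. what blsr/blsmsk/blsi compute:

  #include <stdint.h>

  /* x & (x - 1): clear the lowest set bit              -> BLSR   */
  static inline uint32_t blsr32(uint32_t x)   { return x & (x - 1); }

  /* x ^ (x - 1): mask through the lowest set bit       -> BLSMSK */
  static inline uint32_t blsmsk32(uint32_t x) { return x ^ (x - 1); }

  /* x & -x: isolate the lowest set bit                 -> BLSI   */
  static inline uint32_t blsi32(uint32_t x)   { return x & -x; }

The GR64 patterns are the same identities applied to uint64_t.
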
multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
@@ -1849,11 +2083,11 @@ multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
- T8, VEX_4VOp3;
+ T8PS, VEX_4VOp3;
def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
- (implicit EFLAGS)]>, T8, VEX_4VOp3;
+ (implicit EFLAGS)]>, T8PS, VEX_4VOp3;
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
@@ -1930,17 +2164,18 @@ multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
[(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>,
- XOP, XOPA, VEX;
+ XOP, XOPA;
def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst),
(ins x86memop:$src1, immtype:$cntl),
!strconcat(OpcodeStr,
"\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
[(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>,
- XOP, XOPA, VEX;
+ XOP, XOPA;
}
defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32,
int_x86_tbm_bextri_u32, i32imm, imm>;
+let ImmT = Imm32S in
defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64,
int_x86_tbm_bextri_u64, i64i32imm,
i64immSExt32>, VEX_W;
@@ -1951,11 +2186,11 @@ multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
let hasSideEffects = 0 in {
def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src),
!strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
- []>, XOP, XOP9, VEX_4V;
+ []>, XOP_4V, XOP9;
let mayLoad = 1 in
def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
- []>, XOP, XOP9, VEX_4V;
+ []>, XOP_4V, XOP9;
}
}
@@ -2088,6 +2323,7 @@ include "X86InstrCompiler.td"
// Assembler Mnemonic Aliases
//===----------------------------------------------------------------------===//
+def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>;
@@ -2098,17 +2334,20 @@ def : MnemonicAlias<"cdq", "cltd", "att">;
def : MnemonicAlias<"cdqe", "cltq", "att">;
def : MnemonicAlias<"cqo", "cqto", "att">;
-// lret maps to lretl, it is not ambiguous with lretq.
-def : MnemonicAlias<"lret", "lretl", "att">;
+// In 64-bit mode lret maps to lretl; it is not ambiguous with lretq.
+def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>;
-def : MnemonicAlias<"leavel", "leave", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>;
def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"loopz", "loope", "att">;
def : MnemonicAlias<"loopnz", "loopne", "att">;
+def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"popfd", "popfl", "att">;
@@ -2116,21 +2355,33 @@ def : MnemonicAlias<"popfd", "popfl", "att">;
// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in
// all modes. However: "push (addr)" and "push $42" should default to
// pushl/pushq depending on the current mode. Similar for "pop %bx"
+def : MnemonicAlias<"push", "pushw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"pushfd", "pushfl", "att">;
-def : MnemonicAlias<"popad", "popa", "intel">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"pushad", "pusha", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"popa", "popaw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pusha", "pushaw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popa", "popal", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pusha", "pushal", "intel">, Requires<[In32BitMode]>;
+
+def : MnemonicAlias<"popa", "popaw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"repe", "rep", "att">;
def : MnemonicAlias<"repz", "rep", "att">;
def : MnemonicAlias<"repnz", "repne", "att">;
-def : MnemonicAlias<"retl", "ret", "att">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"retq", "ret", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"salb", "shlb", "att">;
def : MnemonicAlias<"salw", "shlw", "att">;
@@ -2146,18 +2397,23 @@ def : MnemonicAlias<"ud2a", "ud2", "att">;
def : MnemonicAlias<"verrw", "verr", "att">;
// System instruction aliases.
-def : MnemonicAlias<"iret", "iretl", "att">;
+def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>;
def : MnemonicAlias<"sysret", "sysretl", "att">;
def : MnemonicAlias<"sysexit", "sysexitl", "att">;
-def : MnemonicAlias<"lgdtl", "lgdt", "att">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"lgdtq", "lgdt", "att">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"lidtl", "lidt", "att">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"lidtq", "lidt", "att">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"sgdtl", "sgdt", "att">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"sgdtq", "sgdt", "att">, Requires<[In64BitMode]>;
-def : MnemonicAlias<"sidtl", "sidt", "att">, Requires<[In32BitMode]>;
-def : MnemonicAlias<"sidtq", "sidt", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>;
// Floating point stack aliases.
@@ -2241,6 +2497,42 @@ def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
+// lods aliases. Accept the destination being omitted because it's implicit
+// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"lodsb $src", (LODSB srcidx8:$src), 0>;
+def : InstAlias<"lodsw $src", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods{l|d} $src", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lodsq $src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"lods {$src, %al|al, $src}", (LODSB srcidx8:$src), 0>;
+def : InstAlias<"lods {$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods {$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lods {$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+
+// stos aliases. Accept the source being omitted because it's implicit in
+// the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the source.
+def : InstAlias<"stosb $dst", (STOSB dstidx8:$dst), 0>;
+def : InstAlias<"stosw $dst", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos{l|d} $dst", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stosq $dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"stos {%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>;
+def : InstAlias<"stos {%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos {%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stos {%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+
+// scas aliases. Accept the destination being omitted because it's implicit
+// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"scasb $dst", (SCASB dstidx8:$dst), 0>;
+def : InstAlias<"scasw $dst", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas{l|d} $dst", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scasq $dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"scas {$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>;
+def : InstAlias<"scas {$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas {$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scas {$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+
// div and idiv aliases for explicit A register.
def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>;
def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>;
@@ -2325,10 +2617,22 @@ def : InstAlias<"fnstsw" , (FNSTSW16r)>;
// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but
// this is compatible with what GAS does.
-def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>;
-def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>;
-def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst)>;
-def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst)>;
+def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst)>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst)>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall *$dst", (FARCALL16m opaque32mem:$dst)>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp *$dst", (FARJMP16m opaque32mem:$dst)>, Requires<[In16BitMode]>;
+
+def : InstAlias<"call *$dst", (CALL64m i16mem:$dst)>, Requires<[In64BitMode]>;
+def : InstAlias<"jmp *$dst", (JMP64m i16mem:$dst)>, Requires<[In64BitMode]>;
+def : InstAlias<"call *$dst", (CALL32m i16mem:$dst)>, Requires<[In32BitMode]>;
+def : InstAlias<"jmp *$dst", (JMP32m i16mem:$dst)>, Requires<[In32BitMode]>;
+def : InstAlias<"call *$dst", (CALL16m i16mem:$dst)>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp *$dst", (JMP16m i16mem:$dst)>, Requires<[In16BitMode]>;
+
// "imul <imm>, B" is an alias for "imul <imm>, B, B".
def : InstAlias<"imulw $imm, $r", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm)>;
@@ -2348,8 +2652,10 @@ def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>;
// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp
-def : InstAlias<"call $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>;
-def : InstAlias<"jmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"call $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"call $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
+def : InstAlias<"jmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>;
def : InstAlias<"jmpw $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>;
def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>;
@@ -2370,10 +2676,6 @@ def : InstAlias<"movq $src, $dst",
def : InstAlias<"movq $src, $dst",
(MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
-// movsd with no operands (as opposed to the SSE scalar move of a double) is an
-// alias for movsl. (as in rep; movsd)
-def : InstAlias<"movsd", (MOVSD), 0>;
-
// movsx aliases
def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
@@ -2462,6 +2764,6 @@ def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:
// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src)>;
-def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src)>, Requires<[Not64BitMode]>;
def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src)>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index ba58143..050ee39 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -524,24 +524,24 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
// -- Conversion Instructions
defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
- MMX_CVT_PS_ITINS, SSEPackedSingle>, TB;
+ MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
- MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
- MMX_CVT_PS_ITINS, SSEPackedSingle>, TB;
+ MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
- MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
- MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
+ MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
let Constraints = "$src1 = $dst" in {
defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
int_x86_sse_cvtpi2ps,
i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, TB;
+ SSEPackedSingle>, PS;
}
// Extract / Insert
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index a5debc0..f2f3967 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -120,6 +120,11 @@ def SSE_DIV_ITINS_P : SizeItins<
SSE_DIV_F32P, SSE_DIV_F64P
>;
+let Sched = WriteVecLogic in
+def SSE_VEC_BIT_ITINS_P : OpndItins<
+ IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
def SSE_BIT_ITINS_P : OpndItins<
IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;
@@ -171,6 +176,7 @@ def SSE_INSERT_ITINS : OpndItins<
IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
>;
+let Sched = WriteMPSAD in
def SSE_MPSADBW_ITINS : OpndItins<
IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
>;
@@ -179,6 +185,44 @@ def SSE_PMULLD_ITINS : OpndItins<
IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
>;
+// Definitions for backward compatibility.
+// The instructions mapped onto these definitions use a different itinerary
+// than the actual scheduling model.
+let Sched = WriteShuffle in
+def DEFAULT_ITINS_SHUFFLESCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteVecIMul in
+def DEFAULT_ITINS_VECIMULSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteShuffle in
+def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+let Sched = WriteMPSAD in
+def DEFAULT_ITINS_MPSADSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteFBlend in
+def DEFAULT_ITINS_FBLENDSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteBlend in
+def DEFAULT_ITINS_BLENDSCHED : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteFBlend in
+def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//
@@ -210,6 +254,7 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
Operand memopr, ComplexPattern mem_cpat,
OpndItins itins,
bit Is2Addr = 1> {
+let isCodeGenOnly = 1 in {
def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
@@ -227,6 +272,7 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
RC:$src1, mem_cpat:$src2))], itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
+}
/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -497,14 +543,14 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
!strconcat(base_opc, asm_opr),
[(set VR128:$dst, (vt (OpNode VR128:$src1,
(scalar_to_vector RC:$src2))))],
- IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>;
+ IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
// For the disassembler
- let isCodeGenOnly = 1, hasSideEffects = 0 in
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, RC:$src2),
!strconcat(base_opc, asm_opr),
- [], IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>;
+ [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
}
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
@@ -800,7 +846,7 @@ multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
let neverHasSideEffects = 1 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
- Sched<[WriteMove]>;
+ Sched<[WriteFShuffle]>;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
@@ -810,41 +856,41 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- TB, VEX;
+ PS, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- TB, OpSize, VEX;
+ PD, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- TB, VEX;
+ PS, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
"movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
- TB, OpSize, VEX;
+ PD, VEX;
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- TB, VEX, VEX_L;
+ PS, VEX, VEX_L;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- TB, OpSize, VEX, VEX_L;
+ PD, VEX, VEX_L;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- TB, VEX, VEX_L;
+ PS, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
"movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
- TB, OpSize, VEX, VEX_L;
+ PD, VEX, VEX_L;
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- TB;
+ PS;
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- TB, OpSize;
+ PD;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- TB;
+ PS;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
"movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
- TB, OpSize;
+ PD;
let SchedRW = [WriteStore] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
@@ -882,7 +928,8 @@ def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
} // SchedRW
// For disassembler
-let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteFShuffle] in {
def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
@@ -958,7 +1005,8 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
} // SchedRW
// For disassembler
-let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteMove] in {
def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>;
@@ -1138,16 +1186,16 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
[(set VR128:$dst,
(psnode VR128:$src1,
(bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
- itin, SSEPackedSingle>, TB,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ itin, SSEPackedSingle>, PS,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
def PDrm : PI<opc, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
!strconcat(base_opc, "d", asm_opr),
[(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))))],
- itin, SSEPackedDouble>, TB, OpSize,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ itin, SSEPackedDouble>, PD,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
@@ -1345,14 +1393,14 @@ let AddedComplexity = 20, Predicates = [UseAVX] in {
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_LH>,
- VEX_4V, Sched<[WriteShuffle]>;
+ VEX_4V, Sched<[WriteFShuffle]>;
def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_LH>,
- VEX_4V, Sched<[WriteShuffle]>;
+ VEX_4V, Sched<[WriteFShuffle]>;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
@@ -1360,13 +1408,13 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
"movlhps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
- IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
+ IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
- IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
+ IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
}
let Predicates = [UseAVX] in {
@@ -1632,40 +1680,43 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
-let Predicates = [UseAVX] in {
-defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
- SSE_CVT_Scalar, 0>, XS, VEX_4V;
-defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
- SSE_CVT_Scalar, 0>, XS, VEX_4V,
- VEX_W;
-defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
- SSE_CVT_Scalar, 0>, XD, VEX_4V;
-defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
- SSE_CVT_Scalar, 0>, XD,
- VEX_4V, VEX_W;
-}
-let Constraints = "$src1 = $dst" in {
- defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse_cvtsi2ss, i32mem, loadi32,
- "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
- defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse_cvtsi642ss, i64mem, loadi64,
- "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
- defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse2_cvtsi2sd, i32mem, loadi32,
- "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
- defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- int_x86_sse2_cvtsi642sd, i64mem, loadi64,
- "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
-}
+let isCodeGenOnly = 1 in {
+ let Predicates = [UseAVX] in {
+ defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
+ SSE_CVT_Scalar, 0>, XS, VEX_4V;
+ defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
+ SSE_CVT_Scalar, 0>, XS, VEX_4V,
+ VEX_W;
+ defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
+ SSE_CVT_Scalar, 0>, XD, VEX_4V;
+ defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
+ SSE_CVT_Scalar, 0>, XD,
+ VEX_4V, VEX_W;
+ }
+ let Constraints = "$src1 = $dst" in {
+ defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse_cvtsi2ss, i32mem, loadi32,
+ "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
+ defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse_cvtsi642ss, i64mem, loadi64,
+ "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
+ defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32,
+ "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
+ defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64,
+ "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
+ }
+} // isCodeGenOnly = 1
/// SSE 1 Only
// Aliases for intrinsics
+let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
ssmem, sse_load_f32, "cvttss2si",
@@ -1694,6 +1745,7 @@ defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
"cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+} // isCodeGenOnly = 1
let Predicates = [UseAVX] in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
@@ -1713,16 +1765,16 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- TB, VEX, Requires<[HasAVX]>;
+ PS, VEX, Requires<[HasAVX]>;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- TB, VEX, VEX_L, Requires<[HasAVX]>;
+ PS, VEX, VEX_L, Requires<[HasAVX]>;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- TB, Requires<[UseSSE2]>;
+ PS, Requires<[UseSSE2]>;
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
@@ -1792,6 +1844,7 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
XD,
Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+let isCodeGenOnly = 1 in {
def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1823,6 +1876,7 @@ def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
+} // isCodeGenOnly = 1
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
@@ -1875,6 +1929,7 @@ def : Pat<(fextend (loadf32 addr:$src)),
def : Pat<(extloadf32 addr:$src),
(CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
+let isCodeGenOnly = 1 in {
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1905,6 +1960,7 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
+} // isCodeGenOnly = 1
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -2116,32 +2172,32 @@ let Predicates = [HasAVX] in {
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>, TB, VEX, Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, TB, VEX, Sched<[WriteCvtF2FLd]>;
+ IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
- IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+ IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}
let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
- IIC_SSE_CVT_PD_RR>, TB, Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, TB, Sched<[WriteCvtF2FLd]>;
+ IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
}
// Convert Packed DW Integers to Packed Double FP
@@ -2287,7 +2343,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
- let neverHasSideEffects = 1 in {
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [],
IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
@@ -2299,23 +2355,23 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
}
}
-defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmpss, f32, loadf32,
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
"cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SSE_ALU_F32S>,
XS, VEX_4V, VEX_LIG;
-defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmpsd, f64, loadf64,
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
"cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SSE_ALU_F32S>, // same latency as 32 bit compare
XD, VEX_4V, VEX_LIG;
let Constraints = "$src1 = $dst" in {
- defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmpss, f32, loadf32,
+ defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
"cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
"cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
XS;
- defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64,
+ defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
"cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
"cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SSE_ALU_F64S>,
@@ -2338,23 +2394,25 @@ multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
-// Aliases to match intrinsics which expect XMM operand(s).
-defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
- "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- SSE_ALU_F32S>,
- XS, VEX_4V;
-defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
- "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- SSE_ALU_F32S>, // same latency as f32
- XD, VEX_4V;
-let Constraints = "$src1 = $dst" in {
- defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
- "cmp${cc}ss\t{$src, $dst|$dst, $src}",
- SSE_ALU_F32S>, XS;
- defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
- "cmp${cc}sd\t{$src, $dst|$dst, $src}",
- SSE_ALU_F64S>,
- XD;
+let isCodeGenOnly = 1 in {
+ // Aliases to match intrinsics which expect XMM operand(s).
+ defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
+ "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ SSE_ALU_F32S>,
+ XS, VEX_4V;
+ defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
+ "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ SSE_ALU_F32S>, // same latency as f32
+ XD, VEX_4V;
+ let Constraints = "$src1 = $dst" in {
+ defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+ SSE_ALU_F32S>, XS;
+ defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+ SSE_ALU_F64S>,
+ XD;
+}
}
@@ -2377,46 +2435,50 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss">, TB, VEX, VEX_LIG;
+ "ucomiss">, PS, VEX, VEX_LIG;
defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd">, TB, OpSize, VEX, VEX_LIG;
+ "ucomisd">, PD, VEX, VEX_LIG;
let Pattern = []<dag> in {
defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
- "comiss">, TB, VEX, VEX_LIG;
+ "comiss">, PS, VEX, VEX_LIG;
defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
- "comisd">, TB, OpSize, VEX, VEX_LIG;
+ "comisd">, PD, VEX, VEX_LIG;
}
- defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss">, TB, VEX;
- defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd">, TB, OpSize, VEX;
+ let isCodeGenOnly = 1 in {
+ defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+ load, "ucomiss">, PS, VEX;
+ defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+ load, "ucomisd">, PD, VEX;
- defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
- load, "comiss">, TB, VEX;
- defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
- load, "comisd">, TB, OpSize, VEX;
+ defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
+ load, "comiss">, PS, VEX;
+ defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
+ load, "comisd">, PD, VEX;
+ }
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss">, TB;
+ "ucomiss">, PS;
defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd">, TB, OpSize;
+ "ucomisd">, PD;
let Pattern = []<dag> in {
defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
- "comiss">, TB;
+ "comiss">, PS;
defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
- "comisd">, TB, OpSize;
+ "comisd">, PD;
}
- defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss">, TB;
- defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd">, TB, OpSize;
+ let isCodeGenOnly = 1 in {
+ defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+ load, "ucomiss">, PS;
+ defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+ load, "ucomisd">, PD;
- defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
- "comiss">, TB;
- defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
- "comisd">, TB, OpSize;
+ defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
+ "comiss">, PS;
+ defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
+ "comisd">, PD;
+ }
} // Defs = [EFLAGS]
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
@@ -2436,7 +2498,7 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
Sched<[WriteFAddLd, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
- let neverHasSideEffects = 1 in {
+ let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rri_alt : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
@@ -2450,28 +2512,28 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle>, TB, VEX_4V;
+ SSEPackedSingle>, PS, VEX_4V;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
+ SSEPackedDouble>, PD, VEX_4V;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle>, TB, VEX_4V, VEX_L;
+ SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+ SSEPackedDouble>, PD, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
"cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
"cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSEPackedSingle, SSE_ALU_F32P>, TB;
+ SSEPackedSingle, SSE_ALU_F32P>, PS;
defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
"cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
"cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSEPackedDouble, SSE_ALU_F64P>, TB, OpSize;
+ SSEPackedDouble, SSE_ALU_F64P>, PD;
}
let Predicates = [HasAVX] in {
@@ -2512,7 +2574,7 @@ def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//
-/// sse12_shuffle - sse 1 & 2 shuffle instructions
+/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
ValueType vt, string asm, PatFrag mem_frag,
Domain d, bit IsConvertibleToThreeAddress = 0> {
@@ -2520,37 +2582,35 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
(ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
(i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, i8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
(i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
- Sched<[WriteShuffle]>;
+ Sched<[WriteFShuffle]>;
}
defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv4f32, SSEPackedSingle>, TB, VEX_4V;
+ loadv4f32, SSEPackedSingle>, PS, VEX_4V;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
+ loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
+ loadv2f64, SSEPackedDouble>, PD, VEX_4V;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+ loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>,
- TB;
+ memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>, PS;
defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>,
- TB, OpSize;
+ memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>, PD;
}
let Predicates = [HasAVX] in {
@@ -2598,10 +2658,10 @@ let Predicates = [UseSSE2] in {
}
//===----------------------------------------------------------------------===//
-// SSE 1 & 2 - Unpack Instructions
+// SSE 1 & 2 - Unpack FP Instructions
//===----------------------------------------------------------------------===//
-/// sse12_unpack_interleave - sse 1 & 2 unpack and interleave
+/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
PatFrag mem_frag, RegisterClass RC,
X86MemOperand x86memop, string asm,
@@ -2610,55 +2670,55 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
asm, [(set RC:$dst,
(vt (OpNode RC:$src1, RC:$src2)))],
- IIC_SSE_UNPCK, d>, Sched<[WriteShuffle]>;
+ IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
def rm : PI<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
asm, [(set RC:$dst,
(vt (OpNode RC:$src1,
(mem_frag addr:$src2))))],
IIC_SSE_UNPCK, d>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, TB, VEX_4V;
+ SSEPackedSingle>, PS, VEX_4V;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
+ SSEPackedDouble>, PD, VEX_4V;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, TB, VEX_4V;
+ SSEPackedSingle>, PS, VEX_4V;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
+ SSEPackedDouble>, PD, VEX_4V;
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, TB, VEX_4V, VEX_L;
+ SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+ SSEPackedDouble>, PD, VEX_4V, VEX_L;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, TB, VEX_4V, VEX_L;
+ SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+ SSEPackedDouble>, PD, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, TB;
+ SSEPackedSingle>, PS;
defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
- SSEPackedDouble>, TB, OpSize;
+ SSEPackedDouble>, PD;
defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, TB;
+ SSEPackedSingle>, PS;
defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
- SSEPackedDouble>, TB, OpSize;
+ SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"
let Predicates = [HasAVX1Only] in {
@@ -2714,16 +2774,15 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
let Predicates = [HasAVX] in {
defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
- "movmskps", SSEPackedSingle>, TB, VEX;
+ "movmskps", SSEPackedSingle>, PS, VEX;
defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
- "movmskpd", SSEPackedDouble>, TB,
- OpSize, VEX;
+ "movmskpd", SSEPackedDouble>, PD, VEX;
defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
- "movmskps", SSEPackedSingle>, TB,
+ "movmskps", SSEPackedSingle>, PS,
VEX, VEX_L;
defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
- "movmskpd", SSEPackedDouble>, TB,
- OpSize, VEX, VEX_L;
+ "movmskpd", SSEPackedDouble>, PD,
+ VEX, VEX_L;
def : Pat<(i32 (X86fgetsign FR32:$src)),
(VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
@@ -2738,9 +2797,9 @@ let Predicates = [HasAVX] in {
}
defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
- SSEPackedSingle>, TB;
+ SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
- SSEPackedDouble>, TB, OpSize;
+ SSEPackedDouble>, PD;
def : Pat<(i32 (X86fgetsign FR32:$src)),
(MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>,
@@ -2807,11 +2866,14 @@ let Predicates = [HasAVX2] in
// These are ordered here for pattern ordering requirements with the fp versions
-defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
-defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
-defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
+defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
+ SSE_VEC_BIT_ITINS_P, 1>;
+defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
+ SSE_VEC_BIT_ITINS_P, 1>;
+defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
+ SSE_VEC_BIT_ITINS_P, 1>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
- SSE_BIT_ITINS_P, 0>;
+ SSE_VEC_BIT_ITINS_P, 0>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
@@ -2823,20 +2885,20 @@ multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
SDNode OpNode, OpndItins itins> {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
- TB, VEX_4V;
+ PS, VEX_4V;
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
- TB, OpSize, VEX_4V;
+ PD, VEX_4V;
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
- TB;
+ PS;
defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
- TB, OpSize;
+ PD;
}
}
@@ -2862,7 +2924,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "ps"), f256mem,
[(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
- (loadv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
+ (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;
defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f256mem,
@@ -2870,7 +2932,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
(bc_v4i64 (v4f64 VR256:$src2))))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
(loadv4i64 addr:$src2)))], 0>,
- TB, OpSize, VEX_4V, VEX_L;
+ PD, VEX_4V, VEX_L;
// In AVX no need to add a pattern for 128-bit logical rr ps, because they
// are all promoted to v2i64, and the patterns are covered by the int
@@ -2879,7 +2941,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem, [],
[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
- (loadv2i64 addr:$src2)))], 0>, TB, VEX_4V;
+ (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;
defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f128mem,
@@ -2887,21 +2949,21 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
(bc_v2i64 (v2f64 VR128:$src2))))],
[(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
(loadv2i64 addr:$src2)))], 0>,
- TB, OpSize, VEX_4V;
+ PD, VEX_4V;
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem,
[(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
- (memopv2i64 addr:$src2)))]>, TB;
+ (memopv2i64 addr:$src2)))]>, PS;
defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f128mem,
[(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
(bc_v2i64 (v2f64 VR128:$src2))))],
[(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
- (memopv2i64 addr:$src2)))]>, TB, OpSize;
+ (memopv2i64 addr:$src2)))]>, PD;
}
}
@@ -2932,25 +2994,25 @@ multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
SDNode OpNode, SizeItins itins> {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
VR128, v4f32, f128mem, loadv4f32,
- SSEPackedSingle, itins.s, 0>, TB, VEX_4V;
+ SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
VR128, v2f64, f128mem, loadv2f64,
- SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V;
+ SSEPackedDouble, itins.d, 0>, PD, VEX_4V;
defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
OpNode, VR256, v8f32, f256mem, loadv8f32,
- SSEPackedSingle, itins.s, 0>, TB, VEX_4V, VEX_L;
+ SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
OpNode, VR256, v4f64, f256mem, loadv4f64,
- SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V, VEX_L;
+ SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
v4f32, f128mem, memopv4f32, SSEPackedSingle,
- itins.s>, TB;
+ itins.s>, PS;
defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
v2f64, f128mem, memopv2f64, SSEPackedDouble,
- itins.d>, TB, OpSize;
+ itins.d>, PD;
}
}
@@ -3017,6 +3079,214 @@ let isCodeGenOnly = 1 in {
basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
}
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// a scalar fp operation followed by a blend.
+//
+// These patterns know, for example, how to select an ADDSS from a
+// float add plus vector insert.
+//
+// The effect is that the backend no longer emits unnecessary vector
+// insert instructions immediately after SSE scalar fp instructions
+// like addss or mulss.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// A[0] += B[0];
+// return A;
+// }
+//
+// previously we generated:
+// addss %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// we now generate:
+// addss %xmm1, %xmm0
+
+let Predicates = [UseSSE1] in {
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))))),
+ (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))))),
+ (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))))),
+ (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))))),
+ (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+}
+
+let Predicates = [UseSSE2] in {
+ // SSE2 patterns to select scalar double-precision fp arithmetic instructions
+
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+}
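
As a usage sketch (hypothetical, mirroring the float example in the comment above), this is the code shape the SSE2 patterns target; with them the backend can select a single addsd instead of addsd plus a movsd copy:

  #include <emmintrin.h>

  /* Scalar double add on element 0; element 1 is left untouched. */
  __m128d add_low(__m128d A, __m128d B) {
    A[0] += B[0];   /* vector element subscripting, a GCC/Clang extension */
    return A;
  }
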
+
+let Predicates = [UseSSE41] in {
+ // If the subtarget has SSE4.1 but not AVX, the vector insert
+  // instruction is lowered into an X86insrtps rather than an X86Movss.
+ // When selecting SSE scalar single-precision fp arithmetic instructions,
+ // make sure that we correctly match the X86insrtps.
+
+ def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (iPTR 0))),
+ (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (iPTR 0))),
+ (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (iPTR 0))),
+ (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (iPTR 0))),
+ (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+}
+
+let Predicates = [HasAVX] in {
+  // The following patterns select AVX scalar single/double-precision fp
+ // arithmetic instructions.
+
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (iPTR 0))),
+ (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (iPTR 0))),
+ (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (iPTR 0))),
+ (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (iPTR 0))),
+ (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+}
+
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// a vector packed single/double fp operation followed by a vector insert.
+//
+// The effect is that the backend converts the packed fp instruction
+// followed by a vector insert into a single SSE scalar fp instruction.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// __m128 C = A + B;
+//     return (__m128) {C[0], A[1], A[2], A[3]};
+// }
+//
+// previously we generated:
+// addps %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// we now generate:
+// addss %xmm1, %xmm0
+
+let Predicates = [UseSSE1] in {
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // SSE2 patterns to select scalar double-precision fp arithmetic instructions
+ // from a packed double-precision fp instruction plus movsd.
+
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  // The following patterns select AVX scalar single/double precision fp
+  // arithmetic instructions from a packed single/double precision fp
+  // operation plus movss/movsd.
+
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
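+
+// Note: for the AVX forms above the effect is analogous. As a rough sketch,
+// the packed example from the comment above, compiled with AVX enabled,
+// becomes a single three-operand instruction:
+//   vaddss %xmm1, %xmm0, %xmm0
+// instead of a vaddps followed by a move of the low lane.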
+
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
@@ -3069,6 +3339,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, VEX_4V, VEX_LIG,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ let isCodeGenOnly = 1 in
def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, ssmem:$src2),
!strconcat("v", OpcodeStr,
@@ -3089,6 +3360,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
[(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
+let isCodeGenOnly = 1 in {
def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (F32Int VR128:$src))], itins.rr>,
@@ -3098,6 +3370,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
[(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>,
Sched<[itins.Sched.Folded]>;
}
+}
/// sse1_fp_unop_s_rw - SSE1 unops where vector form has a read-write operand.
multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3115,6 +3388,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, VEX_4V, VEX_LIG,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ let isCodeGenOnly = 1 in
def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, ssmem:$src2),
!strconcat("v", OpcodeStr,
@@ -3135,7 +3409,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
[(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
- let Constraints = "$src1 = $dst" in {
+ let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
@@ -3188,6 +3462,7 @@ let Predicates = [HasAVX] in {
multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
Intrinsic V4F32Int, Intrinsic V8F32Int,
OpndItins itins> {
+let isCodeGenOnly = 1 in {
let Predicates = [HasAVX] in {
def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat("v", OpcodeStr,
@@ -3220,6 +3495,7 @@ let Predicates = [HasAVX] in {
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
itins.rm>, Sched<[itins.Sched.Folded]>;
+} // isCodeGenOnly = 1
}
/// sse2_fp_unop_s - SSE2 unops in scalar form.
@@ -3238,6 +3514,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, VEX_4V, VEX_LIG,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ let isCodeGenOnly = 1 in
def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, sdmem:$src2),
!strconcat("v", OpcodeStr,
@@ -3256,6 +3533,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
!strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
[(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>;
+let isCodeGenOnly = 1 in {
def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (F64Int VR128:$src))], itins.rr>,
@@ -3265,6 +3543,7 @@ let Predicates = [HasAVX], hasSideEffects = 0 in {
[(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>,
Sched<[itins.Sched.Folded]>;
}
+}
/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
@@ -3455,19 +3734,14 @@ def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movnti{l}\t{$src, $dst|$dst, $src}",
[(nontemporalstore (i32 GR32:$src), addr:$dst)],
IIC_SSE_MOVNT>,
- TB, Requires<[HasSSE2]>;
+ PS, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movnti{q}\t{$src, $dst|$dst, $src}",
[(nontemporalstore (i64 GR64:$src), addr:$dst)],
IIC_SSE_MOVNT>,
- TB, Requires<[HasSSE2]>;
+ PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStore]
-def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
- (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>;
-
-def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
- (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
} // AddedComplexity
//===----------------------------------------------------------------------===//
@@ -3490,17 +3764,23 @@ def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
IIC_SSE_PREFETCH>, TB;
}
-// FIXME: How should these memory instructions be modeled?
+// FIXME: How should the flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
"clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>;
+}
+let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
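+// For example (a sketch, assuming _mm_pause() from <emmintrin.h> is emitted
+// as the llvm.x86.sse2.pause intrinsic), a spin loop such as
+//   void spin(volatile int *flag) { while (!*flag) _mm_pause(); }
+// now selects the PAUSE definition below through its intrinsic pattern.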
-def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", [], IIC_SSE_PAUSE>, REP;
+def PAUSE : I<0x90, RawFrm, (outs), (ins),
+ "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
+ OBXS, Requires<[HasSSE2]>;
+}
+let SchedRW = [WriteFence] in {
// Load, store, and memory fence
def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
"sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
@@ -3557,7 +3837,8 @@ def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
}
// For Disassembler
-let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteMove] in {
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>,
@@ -3621,7 +3902,7 @@ def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
[], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
// For Disassembler
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>;
@@ -4047,11 +4328,14 @@ defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
//===---------------------------------------------------------------------===//
defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128,
- int_x86_avx2_packsswb, SSE_INTALU_ITINS_P, 0>;
+ int_x86_avx2_packsswb,
+ SSE_INTALU_ITINS_SHUFF_P, 0>;
defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128,
- int_x86_avx2_packssdw, SSE_INTALU_ITINS_P, 0>;
+ int_x86_avx2_packssdw,
+ SSE_INTALU_ITINS_SHUFF_P, 0>;
defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
- int_x86_avx2_packuswb, SSE_INTALU_ITINS_P, 0>;
+ int_x86_avx2_packuswb,
+ SSE_INTALU_ITINS_SHUFF_P, 0>;
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
@@ -4111,12 +4395,12 @@ let Predicates = [UseSSE2] in {
[(set VR128:$dst,
(vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
(i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
- Sched<[WriteShuffleLd]>;
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
}
}
} // ExeDomain = SSEPackedInt
-defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, TB, OpSize;
+defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
@@ -4269,7 +4553,7 @@ def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
"vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
- imm:$src2))]>, TB, OpSize, VEX,
+ imm:$src2))]>, PD, VEX,
Sched<[WriteShuffle]>;
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
@@ -4280,10 +4564,10 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
// Insert
let Predicates = [HasAVX] in
-defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
+defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;
let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
-defm PINSRW : sse2_pinsrw, TB, OpSize;
+defm PINSRW : sse2_pinsrw, PD;
} // ExeDomain = SSEPackedInt
@@ -4320,7 +4604,7 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
-let Uses = [EDI], Predicates = [HasAVX,In32BitMode] in
+let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
@@ -4333,7 +4617,7 @@ def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
IIC_SSE_MASKMOV>, VEX;
-let Uses = [EDI], Predicates = [UseSSE2,In32BitMode] in
+let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
@@ -4434,7 +4718,7 @@ def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (vector_extract (v4i32 VR128:$src),
(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
- VEX, Sched<[WriteLoad]>;
+ VEX, Sched<[WriteStore]>;
def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
@@ -4444,7 +4728,7 @@ def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (vector_extract (v4i32 VR128:$src),
(iPTR 0))), addr:$dst)],
- IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
@@ -4638,17 +4922,24 @@ def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
IIC_SSE_MOVDQ>;
} // SchedRW
+// For disassembler only
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+ SchedRW = [WriteVecLogic] in {
+def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
+def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
+}
+
//===---------------------------------------------------------------------===//
// Store / copy lower 64-bits of a XMM register.
//
-def VMOVLQ128mr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX,
- Sched<[WriteStore]>;
-def MOVLQ128mr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)],
- IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+let Predicates = [UseAVX] in
+def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
+ (VMOVPQI2QImr addr:$dst, VR128:$src)>;
+let Predicates = [UseSSE2] in
+def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
+ (MOVPQI2QImr addr:$dst, VR128:$src)>;
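+
+// For example (a sketch, assuming the front end lowers _mm_storel_epi64 from
+// <emmintrin.h> to the llvm.x86.sse2.storel.dq intrinsic), a store such as
+//   void store_low(__m128i *p, __m128i v) { _mm_storel_epi64(p, v); }
+// now goes through MOVPQI2QImr / VMOVPQI2QImr directly, i.e. a plain
+//   movq %xmm0, (%rdi)
+// store of the low 64 bits.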
let isCodeGenOnly = 1, AddedComplexity = 20 in {
def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
@@ -4745,11 +5036,11 @@ multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (vt (OpNode RC:$src)))],
- IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
+ IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (OpNode (mem_frag addr:$src)))],
- IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>;
+ IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
}
let Predicates = [HasAVX] in {
@@ -4805,13 +5096,13 @@ multiclass sse3_replicate_dfp<string OpcodeStr> {
let neverHasSideEffects = 1 in
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [], IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
+ [], IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(v2f64 (X86Movddup
(scalar_to_vector (loadf64 addr:$src)))))],
- IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>;
+ IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
}
// FIXME: Merge with the above class when there are patterns for the ymm version
@@ -4819,13 +5110,13 @@ multiclass sse3_replicate_dfp_y<string OpcodeStr> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
- Sched<[WriteShuffle]>;
+ Sched<[WriteFShuffle]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
(v4f64 (X86Movddup
(scalar_to_vector (loadf64 addr:$src)))))]>,
- Sched<[WriteShuffleLd]>;
+ Sched<[WriteLoad]>;
}
let Predicates = [HasAVX] in {
@@ -4915,24 +5206,24 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
- f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
+ f128mem, SSE_ALU_F32P, 0>, XD, VEX_4V;
defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
- f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V, VEX_L;
+ f256mem, SSE_ALU_F32P, 0>, XD, VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
- f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
+ f128mem, SSE_ALU_F64P, 0>, PD, VEX_4V;
defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
- f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V, VEX_L;
+ f256mem, SSE_ALU_F64P, 0>, PD, VEX_4V, VEX_L;
}
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
let ExeDomain = SSEPackedSingle in
defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
- f128mem, SSE_ALU_F32P>, TB, XD;
+ f128mem, SSE_ALU_F32P>, XD;
let ExeDomain = SSEPackedDouble in
defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
- f128mem, SSE_ALU_F64P>, TB, OpSize;
+ f128mem, SSE_ALU_F64P>, PD;
}
//===---------------------------------------------------------------------===//
@@ -5019,7 +5310,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
(ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
- OpSize, Sched<[WriteVecALU]>;
+ Sched<[WriteVecALU]>;
def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src),
@@ -5027,7 +5318,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
[(set VR128:$dst,
(IntId128
(bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>,
- OpSize, Sched<[WriteVecALULd]>;
+ Sched<[WriteVecALULd]>;
}
/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
@@ -5037,14 +5328,14 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
(ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (IntId256 VR256:$src))]>,
- OpSize, Sched<[WriteVecALU]>;
+ Sched<[WriteVecALU]>;
def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
(IntId256
- (bitconvert (memopv4i64 addr:$src))))]>, OpSize,
+ (bitconvert (memopv4i64 addr:$src))))]>,
Sched<[WriteVecALULd]>;
}
@@ -5164,7 +5455,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
- OpSize, Sched<[itins.Sched]>;
+ Sched<[itins.Sched]>;
def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
@@ -5172,7 +5463,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
(OpVT (OpNode RC:$src1,
- (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize,
+ (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
@@ -5187,7 +5478,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
- OpSize, Sched<[itins.Sched]>;
+ Sched<[itins.Sched]>;
def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!if(Is2Addr,
@@ -5195,24 +5486,25 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst,
(IntId128 VR128:$src1,
- (bitconvert (memopv2i64 addr:$src2))))]>, OpSize,
+ (bitconvert (memopv2i64 addr:$src2))))]>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
- Intrinsic IntId256> {
+ Intrinsic IntId256,
+ X86FoldableSchedWrite Sched> {
let isCommutable = 1 in
def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
- OpSize;
+ Sched<[Sched]>;
def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (IntId256 VR256:$src1,
- (bitconvert (loadv4i64 addr:$src2))))]>, OpSize;
+ (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
+ Sched<[Sched.Folded, ReadAfterLd]>;
}
let ImmT = NoImm, Predicates = [HasAVX] in {
@@ -5281,16 +5573,20 @@ let isCommutable = 0 in {
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ SSE_PSHUFB, 0>, VEX_4V, VEX_L;
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
- int_x86_avx2_phadd_sw>, VEX_4V, VEX_L;
+ int_x86_avx2_phadd_sw,
+ WriteVecALU>, VEX_4V, VEX_L;
defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
- int_x86_avx2_phsub_sw>, VEX_4V, VEX_L;
+ int_x86_avx2_phsub_sw,
+ WriteVecALU>, VEX_4V, VEX_L;
defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
- int_x86_avx2_pmadd_ub_sw>, VEX_4V, VEX_L;
+ int_x86_avx2_pmadd_ub_sw,
+ WriteVecIMul>, VEX_4V, VEX_L;
}
defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
- int_x86_avx2_pmul_hr_sw>, VEX_4V, VEX_L;
+ int_x86_avx2_pmul_hr_sw,
+ WriteVecIMul>, VEX_4V, VEX_L;
}
// None of these have i8 immediate fields.
@@ -5338,7 +5634,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [], IIC_SSE_PALIGNRR>, OpSize, Sched<[WriteShuffle]>;
+ [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
let mayLoad = 1 in
def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
@@ -5346,7 +5642,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [], IIC_SSE_PALIGNRM>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
}
@@ -5356,13 +5652,13 @@ multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, OpSize, Sched<[WriteShuffle]>;
+ []>, Sched<[WriteShuffle]>;
let mayLoad = 1 in
def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2, i8imm:$src3),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
}
@@ -5426,11 +5722,11 @@ def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
TB, Requires<[HasSSE3]>;
} // SchedRW
-def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[In32BitMode]>;
+def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
- Requires<[In32BitMode]>;
+ Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
Requires<[In64BitMode]>;
@@ -5442,63 +5738,82 @@ multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId,
OpndItins itins = DEFAULT_ITINS> {
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize;
+ [(set VR128:$dst, (IntId VR128:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))],
- itins.rm>, OpSize;
+ itins.rm>, Sched<[itins.Sched.Folded]>;
}
multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
- Intrinsic IntId> {
+ Intrinsic IntId, X86FoldableSchedWrite Sched> {
def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
+ [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;
def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (IntId (load addr:$src)))]>,
- OpSize;
+ Sched<[Sched.Folded]>;
}
let Predicates = [HasAVX] in {
defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw",
- int_x86_sse41_pmovsxbw>, VEX;
+ int_x86_sse41_pmovsxbw,
+ DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd",
- int_x86_sse41_pmovsxwd>, VEX;
+ int_x86_sse41_pmovsxwd,
+ DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq",
- int_x86_sse41_pmovsxdq>, VEX;
+ int_x86_sse41_pmovsxdq,
+ DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw",
- int_x86_sse41_pmovzxbw>, VEX;
+ int_x86_sse41_pmovzxbw,
+ DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd",
- int_x86_sse41_pmovzxwd>, VEX;
+ int_x86_sse41_pmovzxwd,
+ DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq",
- int_x86_sse41_pmovzxdq>, VEX;
+ int_x86_sse41_pmovzxdq,
+ DEFAULT_ITINS_SHUFFLESCHED>, VEX;
}
let Predicates = [HasAVX2] in {
defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
- int_x86_avx2_pmovsxbw>, VEX, VEX_L;
+ int_x86_avx2_pmovsxbw,
+ WriteShuffle>, VEX, VEX_L;
defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
- int_x86_avx2_pmovsxwd>, VEX, VEX_L;
+ int_x86_avx2_pmovsxwd,
+ WriteShuffle>, VEX, VEX_L;
defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
- int_x86_avx2_pmovsxdq>, VEX, VEX_L;
+ int_x86_avx2_pmovsxdq,
+ WriteShuffle>, VEX, VEX_L;
defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
- int_x86_avx2_pmovzxbw>, VEX, VEX_L;
+ int_x86_avx2_pmovzxbw,
+ WriteShuffle>, VEX, VEX_L;
defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
- int_x86_avx2_pmovzxwd>, VEX, VEX_L;
+ int_x86_avx2_pmovzxwd,
+ WriteShuffle>, VEX, VEX_L;
defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
- int_x86_avx2_pmovzxdq>, VEX, VEX_L;
-}
-
-defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw, SSE_INTALU_ITINS_P>;
-defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd, SSE_INTALU_ITINS_P>;
-defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq, SSE_INTALU_ITINS_P>;
-defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw, SSE_INTALU_ITINS_P>;
-defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd, SSE_INTALU_ITINS_P>;
-defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq, SSE_INTALU_ITINS_P>;
+ int_x86_avx2_pmovzxdq,
+ WriteShuffle>, VEX, VEX_L;
+}
+
+defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw,
+ SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd,
+ SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq,
+ SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw,
+ SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd,
+ SSE_INTALU_ITINS_SHUFF_P>;
+defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq,
+ SSE_INTALU_ITINS_SHUFF_P>;
let Predicates = [HasAVX] in {
// Common patterns involving scalar load.
@@ -5590,91 +5905,67 @@ let Predicates = [UseSSE41] in {
(PMOVZXDQrm addr:$src)>;
}
-let Predicates = [HasAVX2] in {
- let AddedComplexity = 15 in {
- def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))),
- (VPMOVZXDQYrr VR128:$src)>;
- def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))),
- (VPMOVZXWDYrr VR128:$src)>;
- def : Pat<(v16i16 (X86vzmovly (v16i8 VR128:$src))),
- (VPMOVZXBWYrr VR128:$src)>;
- }
-
- def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
- def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
- def : Pat<(v16i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
-}
-
-let Predicates = [HasAVX] in {
- def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
- def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
- def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
-}
-
-let Predicates = [UseSSE41] in {
- def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
- def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
- def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
-}
-
-
multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId,
OpndItins itins = DEFAULT_ITINS> {
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize;
+ [(set VR128:$dst, (IntId VR128:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))],
- itins.rm>,
- OpSize;
+ itins.rm>, Sched<[itins.Sched.Folded]>;
}
multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr,
- Intrinsic IntId> {
+ Intrinsic IntId, X86FoldableSchedWrite Sched> {
def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
+ [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;
def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
(IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
- OpSize;
+ Sched<[Sched.Folded]>;
}
let Predicates = [HasAVX] in {
-defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>,
- VEX;
-defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>,
- VEX;
-defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd>,
- VEX;
-defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>,
- VEX;
+defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd,
+ DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq,
+ DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd,
+ DEFAULT_ITINS_SHUFFLESCHED>, VEX;
+defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq,
+ DEFAULT_ITINS_SHUFFLESCHED>, VEX;
}
let Predicates = [HasAVX2] in {
defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
- int_x86_avx2_pmovsxbd>, VEX, VEX_L;
+ int_x86_avx2_pmovsxbd, WriteShuffle>,
+ VEX, VEX_L;
defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
- int_x86_avx2_pmovsxwq>, VEX, VEX_L;
+ int_x86_avx2_pmovsxwq, WriteShuffle>,
+ VEX, VEX_L;
defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
- int_x86_avx2_pmovzxbd>, VEX, VEX_L;
+ int_x86_avx2_pmovzxbd, WriteShuffle>,
+ VEX, VEX_L;
defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
- int_x86_avx2_pmovzxwq>, VEX, VEX_L;
+ int_x86_avx2_pmovzxwq, WriteShuffle>,
+ VEX, VEX_L;
}
defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd,
- SSE_INTALU_ITINS_P>;
+ SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq,
- SSE_INTALU_ITINS_P>;
+ SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd,
- SSE_INTALU_ITINS_P>;
+ SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq,
- SSE_INTALU_ITINS_P>;
+ SSE_INTALU_ITINS_SHUFF_P>;
let Predicates = [HasAVX] in {
// Common patterns involving scalar load
@@ -5703,49 +5994,49 @@ let Predicates = [UseSSE41] in {
}
multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId,
- OpndItins itins = DEFAULT_ITINS> {
+ X86FoldableSchedWrite Sched> {
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+ [(set VR128:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;
  // Expecting an i16 load any-extended to an i32 value.
def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (IntId (bitconvert
(v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
- OpSize;
+ Sched<[Sched.Folded]>;
}
multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr,
- Intrinsic IntId> {
+ Intrinsic IntId, X86FoldableSchedWrite Sched> {
def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
+ [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;
  // Expecting an i16 load any-extended to an i32 value.
def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i16mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (IntId (bitconvert
(v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
- OpSize;
+ Sched<[Sched.Folded]>;
}
let Predicates = [HasAVX] in {
-defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>,
- VEX;
-defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>,
- VEX;
+defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq,
+ WriteShuffle>, VEX;
+defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq,
+ WriteShuffle>, VEX;
}
let Predicates = [HasAVX2] in {
-defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq",
- int_x86_avx2_pmovsxbq>, VEX, VEX_L;
-defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
- int_x86_avx2_pmovzxbq>, VEX, VEX_L;
+defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", int_x86_avx2_pmovsxbq,
+ WriteShuffle>, VEX, VEX_L;
+defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", int_x86_avx2_pmovzxbq,
+ WriteShuffle>, VEX, VEX_L;
}
defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq,
- SSE_INTALU_ITINS_P>;
+ WriteShuffle>;
defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq,
- SSE_INTALU_ITINS_P>;
+ WriteShuffle>;
let Predicates = [HasAVX2] in {
def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
@@ -5772,9 +6063,9 @@ let Predicates = [HasAVX2] in {
def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))),
(VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
- def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
+ def : Pat<(v8i32 (X86vsext (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
(VPMOVSXWDYrm addr:$src)>;
- def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
+ def : Pat<(v4i64 (X86vsext (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
(VPMOVSXDQYrm addr:$src)>;
def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64
@@ -6003,16 +6294,15 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
imm:$src2))]>,
- OpSize;
- let neverHasSideEffects = 1, mayStore = 1 in
+ Sched<[WriteShuffle]>;
+ let neverHasSideEffects = 1, mayStore = 1,
+ SchedRW = [WriteShuffleLd, WriteRMW] in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, OpSize;
-// FIXME:
-// There's an AssertZext in the way of writing the store pattern
-// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+ [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
+ imm:$src2)))), addr:$dst)]>;
}
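+
+// For example (a sketch, assuming SSE4.1 intrinsics from <smmintrin.h>), a
+// truncating byte store such as
+//   void store_byte(unsigned char *p, __m128i v) {
+//     *p = (unsigned char)_mm_extract_epi8(v, 3);
+//   }
+// can now be selected as a single "pextrb $3, %xmm0, (%rdi)" thanks to the
+// store pattern above.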
let Predicates = [HasAVX] in
@@ -6023,22 +6313,21 @@ defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
- let isCodeGenOnly = 1, hasSideEffects = 0 in
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
(ins VR128:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, OpSize;
+ []>, Sched<[WriteShuffle]>;
- let neverHasSideEffects = 1, mayStore = 1 in
+ let neverHasSideEffects = 1, mayStore = 1,
+ SchedRW = [WriteShuffleLd, WriteRMW] in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, OpSize;
-// FIXME:
-// There's an AssertZext in the way of writing the store pattern
-// (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+ [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2)))), addr:$dst)]>;
}
let Predicates = [HasAVX] in
@@ -6054,13 +6343,15 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR32:$dst,
- (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
+ (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
+ Sched<[WriteShuffle]>;
+ let SchedRW = [WriteShuffleLd, WriteRMW] in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (extractelt (v4i32 VR128:$src1), imm:$src2),
- addr:$dst)]>, OpSize;
+ addr:$dst)]>;
}
let Predicates = [HasAVX] in
@@ -6075,13 +6366,15 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR64:$dst,
- (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W;
+ (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
+ Sched<[WriteShuffle]>, REX_W;
+ let SchedRW = [WriteShuffleLd, WriteRMW] in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (extractelt (v2i64 VR128:$src1), imm:$src2),
- addr:$dst)]>, OpSize, REX_W;
+ addr:$dst)]>, REX_W;
}
let Predicates = [HasAVX] in
@@ -6099,14 +6392,14 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR32orGR64:$dst,
(extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
- itins.rr>,
- OpSize;
+ itins.rr>, Sched<[WriteFBlend]>;
+ let SchedRW = [WriteFBlendLd, WriteRMW] in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
- addr:$dst)], itins.rm>, OpSize;
+ addr:$dst)], itins.rm>;
}
let ExeDomain = SSEPackedSingle in {
@@ -6139,7 +6432,8 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, OpSize;
+ (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
+ Sched<[WriteShuffle]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
!if(Is2Addr,
@@ -6148,7 +6442,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
- imm:$src3))]>, OpSize;
+ imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
let Predicates = [HasAVX] in
@@ -6165,7 +6459,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
- OpSize;
+ Sched<[WriteShuffle]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
!if(Is2Addr,
@@ -6174,7 +6468,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
- imm:$src3)))]>, OpSize;
+ imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
let Predicates = [HasAVX] in
@@ -6191,7 +6485,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
- OpSize;
+ Sched<[WriteShuffle]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
!if(Is2Addr,
@@ -6200,7 +6494,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
- imm:$src3)))]>, OpSize;
+ imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
let Predicates = [HasAVX] in
@@ -6222,7 +6516,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(X86insrtps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
- OpSize;
+ Sched<[WriteFShuffle]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
!if(Is2Addr,
@@ -6232,7 +6526,8 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
[(set VR128:$dst,
(X86insrtps VR128:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
- imm:$src3))], itins.rm>, OpSize;
+ imm:$src3))], itins.rm>,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
let ExeDomain = SSEPackedSingle in {
@@ -6258,8 +6553,7 @@ let ExeDomain = SSEPackedSingle in {
!strconcat(OpcodeStr,
"ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
- IIC_SSE_ROUNDPS_REG>,
- OpSize;
+ IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
// Vector intrinsic operation, mem
def PSm : SS4AIi8<opcps, MRMSrcMem,
@@ -6268,8 +6562,7 @@ let ExeDomain = SSEPackedSingle in {
"ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
- IIC_SSE_ROUNDPS_MEM>,
- OpSize;
+ IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedSingle
let ExeDomain = SSEPackedDouble in {
@@ -6279,8 +6572,7 @@ let ExeDomain = SSEPackedDouble in {
!strconcat(OpcodeStr,
"pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
- IIC_SSE_ROUNDPS_REG>,
- OpSize;
+ IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
// Vector intrinsic operation, mem
def PDm : SS4AIi8<opcpd, MRMSrcMem,
@@ -6289,8 +6581,7 @@ let ExeDomain = SSEPackedDouble in {
"pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
(V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
- IIC_SSE_ROUNDPS_REG>,
- OpSize;
+ IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedDouble
}
@@ -6308,9 +6599,10 @@ let ExeDomain = GenericDomain in {
"ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- []>, OpSize;
+ []>, Sched<[WriteFAdd]>;
// Intrinsic operation, reg.
+ let isCodeGenOnly = 1 in
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
!if(Is2Addr,
@@ -6319,7 +6611,7 @@ let ExeDomain = GenericDomain in {
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
- OpSize;
+ Sched<[WriteFAdd]>;
// Intrinsic operation, mem.
def SSm : SS4AIi8<opcss, MRMSrcMem,
@@ -6331,7 +6623,7 @@ let ExeDomain = GenericDomain in {
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
- OpSize;
+ Sched<[WriteFAddLd, ReadAfterLd]>;
// Operation, reg.
let hasSideEffects = 0 in
@@ -6342,9 +6634,10 @@ let ExeDomain = GenericDomain in {
"sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- []>, OpSize;
+ []>, Sched<[WriteFAdd]>;
// Intrinsic operation, reg.
+ let isCodeGenOnly = 1 in
def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
!if(Is2Addr,
@@ -6353,7 +6646,7 @@ let ExeDomain = GenericDomain in {
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
- OpSize;
+ Sched<[WriteFAdd]>;
// Intrinsic operation, mem.
def SDm : SS4AIi8<opcsd, MRMSrcMem,
@@ -6365,7 +6658,7 @@ let ExeDomain = GenericDomain in {
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
- OpSize;
+ Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = GenericDomain
}
@@ -6512,31 +6805,31 @@ let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
- OpSize, VEX;
+ Sched<[WriteVecLogic]>, VEX;
def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
- OpSize, VEX;
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
- OpSize, VEX, VEX_L;
+ Sched<[WriteVecLogic]>, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
- OpSize, VEX, VEX_L;
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
}
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
"ptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
- OpSize;
+ Sched<[WriteVecLogic]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
"ptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
- OpSize;
+ Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
// The bit test instructions below are AVX only
@@ -6544,11 +6837,12 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
- [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, OpSize, VEX;
+ [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
+ Sched<[WriteVecLogic]>, VEX;
def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
- OpSize, VEX;
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
}
let Defs = [EFLAGS], Predicates = [HasAVX] in {
@@ -6572,56 +6866,65 @@ let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"popcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
- IIC_SSE_POPCNT_RR>,
- OpSize, XS;
+ IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
+ OpSize16, XS;
def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"popcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctpop (loadi16 addr:$src))),
- (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, OpSize, XS;
+ (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+ Sched<[WriteFAddLd]>, OpSize16, XS;
def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"popcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
- IIC_SSE_POPCNT_RR>,
- XS;
+ IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
+ OpSize32, XS;
+
def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"popcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctpop (loadi32 addr:$src))),
- (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS;
+ (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+ Sched<[WriteFAddLd]>, OpSize32, XS;
def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"popcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
- IIC_SSE_POPCNT_RR>,
- XS;
+ IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"popcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctpop (loadi64 addr:$src))),
- (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS;
+ (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+ Sched<[WriteFAddLd]>, XS;
}
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
- Intrinsic IntId128> {
+ Intrinsic IntId128,
+ X86FoldableSchedWrite Sched> {
def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize;
+ [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ Sched<[Sched]>;
def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (IntId128
- (bitconvert (memopv2i64 addr:$src))))]>, OpSize;
+ (IntId128 (bitconvert (memopv2i64 addr:$src))))]>,
+ Sched<[Sched.Folded]>;
}
+// PHMIN has the same profile as PSAD, thus we use the same scheduling
+// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
- int_x86_sse41_phminposuw>, VEX;
+ int_x86_sse41_phminposuw,
+ WriteVecIMul>, VEX;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
- int_x86_sse41_phminposuw>;
+ int_x86_sse41_phminposuw,
+ WriteVecIMul>;
/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
@@ -6634,32 +6937,33 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))],
- itins.rr>, OpSize;
+ itins.rr>, Sched<[itins.Sched]>;
def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst,
- (IntId128 VR128:$src1,
- (bitconvert (memopv2i64 addr:$src2))))],
- itins.rm>, OpSize;
+ (IntId128 VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
- Intrinsic IntId256> {
+ Intrinsic IntId256,
+ X86FoldableSchedWrite Sched> {
let isCommutable = 1 in
def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, OpSize;
+ [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
+ Sched<[Sched]>;
def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (IntId256 VR256:$src1,
- (bitconvert (loadv4i64 addr:$src2))))]>, OpSize;
+ (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
+ Sched<[Sched.Folded, ReadAfterLd]>;
}
@@ -6667,75 +6971,95 @@ multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop, bit Is2Addr = 1,
- OpndItins itins = DEFAULT_ITINS> {
+ OpndItins itins = SSE_INTALU_ITINS_P> {
let isCommutable = 1 in
def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, OpSize;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+ Sched<[itins.Sched]>;
def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1,
- (bitconvert (memop_frag addr:$src2)))))]>, OpSize;
+ (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
let isCommutable = 0 in
defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
- 0>, VEX_4V;
+ 0, DEFAULT_ITINS_SHUFFLESCHED>, VEX_4V;
defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
- loadv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
- loadv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
- loadv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
- loadv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
- loadv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
- loadv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
- loadv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
- loadv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq,
- 0>, VEX_4V;
+ 0, DEFAULT_ITINS_VECIMULSCHED>, VEX_4V;
}
let Predicates = [HasAVX2] in {
let isCommutable = 0 in
defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
- int_x86_avx2_packusdw>, VEX_4V, VEX_L;
+ int_x86_avx2_packusdw, WriteShuffle>,
+ VEX_4V, VEX_L;
defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
- loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
- loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
- loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
- loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
- loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
- loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
- loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
- loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq",
- int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
+ int_x86_avx2_pmul_dq, WriteVecIMul>,
+ VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in
- defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
+ defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw,
+ 1, DEFAULT_ITINS_SHUFFLESCHED>;
defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
@@ -6758,15 +7082,19 @@ let Constraints = "$src1 = $dst" in {
let Predicates = [HasAVX] in {
defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
- memopv2i64, i128mem, 0>, VEX_4V;
+ memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
- memopv2i64, i128mem, 0>, VEX_4V;
+ memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V;
}
let Predicates = [HasAVX2] in {
defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
- memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
- memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+ VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -6790,7 +7118,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
- OpSize;
+ Sched<[itins.Sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
!if(Is2Addr,
@@ -6801,47 +7129,58 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
[(set RC:$dst,
(IntId RC:$src1,
(bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
- OpSize;
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
let isCommutable = 0 in {
let ExeDomain = SSEPackedSingle in {
defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
- VR128, loadv4f32, f128mem, 0>, VEX_4V;
+ VR128, loadv4f32, f128mem, 0,
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
int_x86_avx_blend_ps_256, VR256, loadv8f32,
- f256mem, 0>, VEX_4V, VEX_L;
+ f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
+ VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
- VR128, loadv2f64, f128mem, 0>, VEX_4V;
+ VR128, loadv2f64, f128mem, 0,
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
int_x86_avx_blend_pd_256,VR256, loadv4f64,
- f256mem, 0>, VEX_4V, VEX_L;
+ f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
+ VEX_4V, VEX_L;
}
defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
- VR128, loadv2i64, i128mem, 0>, VEX_4V;
+ VR128, loadv2i64, i128mem, 0,
+ DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
- VR128, loadv2i64, i128mem, 0>, VEX_4V;
+ VR128, loadv2i64, i128mem, 0,
+ DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
}
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
- VR128, loadv4f32, f128mem, 0>, VEX_4V;
+ VR128, loadv4f32, f128mem, 0,
+ SSE_DPPS_ITINS>, VEX_4V;
let ExeDomain = SSEPackedDouble in
defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
- VR128, loadv2f64, f128mem, 0>, VEX_4V;
+ VR128, loadv2f64, f128mem, 0,
+ SSE_DPPS_ITINS>, VEX_4V;
let ExeDomain = SSEPackedSingle in
defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
- VR256, loadv8f32, i256mem, 0>, VEX_4V, VEX_L;
+ VR256, loadv8f32, i256mem, 0,
+ SSE_DPPS_ITINS>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
- VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ VR256, loadv4i64, i256mem, 0,
+ DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
- VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ VR256, loadv4i64, i256mem, 0,
+ DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
}
}
@@ -6850,17 +7189,17 @@ let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
VR128, memopv4f32, f128mem,
- 1, SSE_INTALU_ITINS_P>;
+ 1, SSE_INTALU_ITINS_FBLEND_P>;
let ExeDomain = SSEPackedDouble in
defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
VR128, memopv2f64, f128mem,
- 1, SSE_INTALU_ITINS_P>;
+ 1, SSE_INTALU_ITINS_FBLEND_P>;
defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
VR128, memopv2i64, i128mem,
- 1, SSE_INTALU_ITINS_P>;
+ 1, SSE_INTALU_ITINS_FBLEND_P>;
defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
VR128, memopv2i64, i128mem,
- 1, SSE_INTMUL_ITINS_P>;
+ 1, SSE_MPSADBW_ITINS>;
}
let ExeDomain = SSEPackedSingle in
defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
@@ -6875,13 +7214,15 @@ let Constraints = "$src1 = $dst" in {
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop,
- PatFrag mem_frag, Intrinsic IntId> {
+ PatFrag mem_frag, Intrinsic IntId,
+ X86FoldableSchedWrite Sched> {
def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
- NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+ NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
+ Sched<[Sched]>;
def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
@@ -6890,29 +7231,36 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
[(set RC:$dst,
(IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
RC:$src3))],
- NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+ NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
+ Sched<[Sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
- loadv2f64, int_x86_sse41_blendvpd>;
+ loadv2f64, int_x86_sse41_blendvpd,
+ WriteFVarBlend>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
- loadv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
+ loadv4f64, int_x86_avx_blendv_pd_256,
+ WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
- loadv4f32, int_x86_sse41_blendvps>;
+ loadv4f32, int_x86_sse41_blendvps,
+ WriteFVarBlend>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
- loadv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
+ loadv8f32, int_x86_avx_blendv_ps_256,
+ WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
- loadv2i64, int_x86_sse41_pblendvb>;
+ loadv2i64, int_x86_sse41_pblendvb,
+ WriteVarBlend>;
}
let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
- loadv4i64, int_x86_avx2_pblendvb>, VEX_L;
+ loadv4i64, int_x86_avx2_pblendvb,
+ WriteVarBlend>, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -6981,7 +7329,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
!strconcat(OpcodeStr,
"\t{$src2, $dst|$dst, $src2}"),
[(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
- itins.rr>, OpSize;
+ itins.rr>;
def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, x86memop:$src2),
@@ -6990,7 +7338,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
[(set VR128:$dst,
(IntId VR128:$src1,
(bitconvert (mem_frag addr:$src2)), XMM0))],
- itins.rm>, OpSize;
+ itins.rm>;
}
}
@@ -7050,16 +7398,15 @@ let Predicates = [HasAVX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
- OpSize, VEX;
+ VEX;
let Predicates = [HasAVX2] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
- OpSize, VEX, VEX_L;
+ VEX, VEX_L;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movntdqa\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
- OpSize;
+ [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
@@ -7074,15 +7421,14 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
- OpSize;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, OpSize;
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
}
let Predicates = [HasAVX] in
@@ -7122,12 +7468,12 @@ multiclass pcmpistrm_SS42AI<string asm> {
def rr : SS42AI<0x62, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, OpSize;
+ []>, Sched<[WritePCmpIStrM]>;
let mayLoad = 1 in
def rm :SS42AI<0x62, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, OpSize;
+ []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
}
let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
@@ -7157,12 +7503,12 @@ multiclass SS42AI_pcmpestrm<string asm> {
def rr : SS42AI<0x60, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src3, i8imm:$src5),
!strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, OpSize;
+ []>, Sched<[WritePCmpEStrM]>;
let mayLoad = 1 in
def rm : SS42AI<0x60, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src3, i8imm:$src5),
!strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, OpSize;
+ []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
}
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
@@ -7192,12 +7538,12 @@ multiclass SS42AI_pcmpistri<string asm> {
def rr : SS42AI<0x63, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, OpSize;
+ []>, Sched<[WritePCmpIStrI]>;
let mayLoad = 1 in
def rm : SS42AI<0x63, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, OpSize;
+ []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
}
let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
@@ -7228,12 +7574,12 @@ multiclass SS42AI_pcmpestri<string asm> {
def rr : SS42AI<0x61, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src3, i8imm:$src5),
!strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, OpSize;
+ []>, Sched<[WritePCmpEStrI]>;
let mayLoad = 1 in
def rm : SS42AI<0x61, MRMSrcMem, (outs),
(ins VR128:$src1, i128mem:$src3, i8imm:$src5),
!strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, OpSize;
+ []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
}
let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
@@ -7255,14 +7601,15 @@ class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
RegisterClass RCIn, SDPatternOperator Int> :
SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
!strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
- [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>;
+ [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
+ Sched<[WriteFAdd]>;
class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
X86MemOperand x86memop, SDPatternOperator Int> :
SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
!strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
[(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
- IIC_CRC32_MEM>;
+ IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
@@ -7270,13 +7617,13 @@ let Constraints = "$src1 = $dst" in {
def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
int_x86_sse42_crc32_32_8>;
def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
- int_x86_sse42_crc32_32_16>, OpSize;
+ int_x86_sse42_crc32_32_16>, OpSize16;
def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
- int_x86_sse42_crc32_32_16>, OpSize;
+ int_x86_sse42_crc32_32_16>, OpSize16;
def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
- int_x86_sse42_crc32_32_32>;
+ int_x86_sse42_crc32_32_32>, OpSize32;
def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
- int_x86_sse42_crc32_32_32>;
+ int_x86_sse42_crc32_32_32>, OpSize32;
def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
int_x86_sse42_crc32_64_64>, REX_W;
def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
@@ -7357,14 +7704,15 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
- OpSize;
+ Sched<[WriteAESDecEnc]>;
def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst,
- (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize;
+ (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>,
+ Sched<[WriteAESDecEncLd, ReadAfterLd]>;
}
// Perform One Round of an AES Encryption/Decryption Flow
@@ -7396,25 +7744,24 @@ let Predicates = [HasAVX, HasAES] in {
(ins VR128:$src1),
"vaesimc\t{$src1, $dst|$dst, $src1}",
[(set VR128:$dst,
- (int_x86_aesni_aesimc VR128:$src1))]>,
- OpSize, VEX;
+ (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
+ VEX;
def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1),
"vaesimc\t{$src1, $dst|$dst, $src1}",
[(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
- OpSize, VEX;
+ Sched<[WriteAESIMCLd]>, VEX;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1),
"aesimc\t{$src1, $dst|$dst, $src1}",
[(set VR128:$dst,
- (int_x86_aesni_aesimc VR128:$src1))]>,
- OpSize;
+ (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1),
"aesimc\t{$src1, $dst|$dst, $src1}",
[(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
- OpSize;
+ Sched<[WriteAESIMCLd]>;
// AES Round Key Generation Assist
let Predicates = [HasAVX, HasAES] in {
@@ -7423,26 +7770,26 @@ let Predicates = [HasAVX, HasAES] in {
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
- OpSize, VEX;
+ Sched<[WriteAESKeyGen]>, VEX;
def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, i8imm:$src2),
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
- OpSize, VEX;
+ Sched<[WriteAESKeyGenLd]>, VEX;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, i8imm:$src2),
"aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
- OpSize;
+ Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, i8imm:$src2),
"aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
- OpSize;
+ Sched<[WriteAESKeyGenLd]>;
//===----------------------------------------------------------------------===//
// PCLMUL Instructions
@@ -7453,13 +7800,15 @@ def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128:$dst,
- (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
+ Sched<[WriteCLMul]>;
def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
- (loadv2i64 addr:$src2), imm:$src3))]>;
+ (loadv2i64 addr:$src2), imm:$src3))]>,
+ Sched<[WriteCLMulLd, ReadAfterLd]>;
// Carry-less Multiplication instructions
let Constraints = "$src1 = $dst" in {
@@ -7468,14 +7817,15 @@ def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
- IIC_SSE_PCLMULQDQ_RR>;
+ IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
(memopv2i64 addr:$src2), imm:$src3))],
- IIC_SSE_PCLMULQDQ_RM>;
+ IIC_SSE_PCLMULQDQ_RM>,
+ Sched<[WriteCLMulLd, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"
@@ -7506,16 +7856,16 @@ defm : pclmul_alias<"lqlq", 0x00>;
let Predicates = [HasSSE4A] in {
let Constraints = "$src = $dst" in {
-def EXTRQI : Ii8<0x78, MRM0r, (outs VR128:$dst),
+def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
(ins VR128:$src, i8imm:$len, i8imm:$idx),
"extrq\t{$idx, $len, $src|$src, $len, $idx}",
[(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
- imm:$idx))]>, TB, OpSize;
+ imm:$idx))]>, PD;
def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
"extrq\t{$mask, $src|$src, $mask}",
[(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
- VR128:$mask))]>, TB, OpSize;
+ VR128:$mask))]>, PD;
def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx),
@@ -7547,43 +7897,50 @@ def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
// destination operand
//
class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
- X86MemOperand x86memop, Intrinsic Int> :
+ X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> :
AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (Int addr:$src))]>, VEX;
+ [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;
// AVX2 adds register forms
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
- Intrinsic Int> :
+ Intrinsic Int, SchedWrite Sched> :
AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (Int VR128:$src))]>, VEX;
+ [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;
let ExeDomain = SSEPackedSingle in {
def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
- int_x86_avx_vbroadcast_ss>;
+ int_x86_avx_vbroadcast_ss, WriteLoad>;
def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
- int_x86_avx_vbroadcast_ss_256>, VEX_L;
+ int_x86_avx_vbroadcast_ss_256,
+ WriteFShuffleLd>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
- int_x86_avx_vbroadcast_sd_256>, VEX_L;
+ int_x86_avx_vbroadcast_sd_256,
+ WriteFShuffleLd>, VEX_L;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
- int_x86_avx_vbroadcastf128_pd_256>, VEX_L;
+ int_x86_avx_vbroadcastf128_pd_256,
+ WriteFShuffleLd>, VEX_L;
let ExeDomain = SSEPackedSingle in {
def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
- int_x86_avx2_vbroadcast_ss_ps>;
+ int_x86_avx2_vbroadcast_ss_ps,
+ WriteFShuffle>;
def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
- int_x86_avx2_vbroadcast_ss_ps_256>, VEX_L;
+ int_x86_avx2_vbroadcast_ss_ps_256,
+ WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
- int_x86_avx2_vbroadcast_sd_pd_256>, VEX_L;
+ int_x86_avx2_vbroadcast_sd_pd_256,
+ WriteFShuffle256>, VEX_L;
let Predicates = [HasAVX2] in
def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
- int_x86_avx2_vbroadcasti128>, VEX_L;
+ int_x86_avx2_vbroadcasti128, WriteLoad>,
+ VEX_L;
let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
@@ -7597,12 +7954,12 @@ let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, i8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V, VEX_L;
+ []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f128mem:$src2, i8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V, VEX_L;
+ []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -7671,12 +8028,12 @@ let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
(ins VR256:$src1, i8imm:$src2),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, VEX, VEX_L;
+ []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
(ins f128mem:$dst, VR256:$src1, i8imm:$src2),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, VEX, VEX_L;
+ []>, Sched<[WriteStore]>, VEX, VEX_L;
}
// AVX1 patterns
@@ -7785,22 +8142,26 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V;
+ [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V,
+ Sched<[WriteFShuffle]>;
def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop_i:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (IntVar RC:$src1,
- (bitconvert (i_frag addr:$src2))))]>, VEX_4V;
+ (bitconvert (i_frag addr:$src2))))]>, VEX_4V,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, i8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX;
+ [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX,
+ Sched<[WriteFShuffle]>;
def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
(ins x86memop_f:$src1, i8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
- (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX;
+ (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX,
+ Sched<[WriteFShuffleLd]>;
}
let ExeDomain = SSEPackedSingle in {
@@ -7841,12 +8202,14 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
- (i8 imm:$src3))))]>, VEX_4V, VEX_L;
+ (i8 imm:$src3))))]>, VEX_4V, VEX_L,
+ Sched<[WriteFShuffle]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, i8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
- (i8 imm:$src3)))]>, VEX_4V, VEX_L;
+ (i8 imm:$src3)))]>, VEX_4V, VEX_L,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
@@ -7888,11 +8251,11 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
// Zero All YMM registers
def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
- [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>;
+ [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
// Zero Upper bits of YMM registers
def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
- [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>;
+ [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
}
//===----------------------------------------------------------------------===//
@@ -7902,10 +8265,11 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
[(set RC:$dst, (Int VR128:$src))]>,
- T8, OpSize, VEX;
+ T8PD, VEX, Sched<[WriteCvtF2F]>;
let neverHasSideEffects = 1, mayLoad = 1 in
def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
+ "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
+ Sched<[WriteCvtF2FLd]>;
}
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
@@ -7913,12 +8277,13 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
(ins RC:$src1, i32i8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
- TA, OpSize, VEX;
- let neverHasSideEffects = 1, mayStore = 1 in
+ TAPD, VEX, Sched<[WriteCvtF2F]>;
+ let neverHasSideEffects = 1, mayStore = 1,
+ SchedRW = [WriteCvtF2FLd, WriteRMW] in
def mr : Ii8<0x1D, MRMDestMem, (outs),
(ins x86memop:$dst, RC:$src1, i32i8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- TA, OpSize, VEX;
+ TAPD, VEX;
}
let Predicates = [HasF16C] in {
@@ -7942,7 +8307,7 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
- VEX_4V;
+ Sched<[WriteBlend]>, VEX_4V;
def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
!strconcat(OpcodeStr,
@@ -7950,7 +8315,7 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
[(set RC:$dst,
(IntId RC:$src1,
(bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
- VEX_4V;
+ Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
}
let isCommutable = 0 in {
@@ -7976,19 +8341,22 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
Intrinsic Int128, Intrinsic Int256> {
def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int128 VR128:$src))]>, VEX;
+ [(set VR128:$dst, (Int128 VR128:$src))]>,
+ Sched<[WriteShuffle]>, VEX;
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX;
+ (Int128 (scalar_to_vector (ld_frag addr:$src))))]>,
+ Sched<[WriteLoad]>, VEX;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int256 VR128:$src))]>, VEX, VEX_L;
+ [(set VR256:$dst, (Int256 VR128:$src))]>,
+ Sched<[WriteShuffle256]>, VEX, VEX_L;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
(Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
- VEX, VEX_L;
+ Sched<[WriteLoad]>, VEX, VEX_L;
}
defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
@@ -8063,6 +8431,31 @@ let Predicates = [HasAVX2] in {
(VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
(VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+
+ def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
+ (VPBROADCASTBrr (COPY_TO_REGCLASS
+ (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+ VR128))>;
+ def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
+ (VPBROADCASTBYrr (COPY_TO_REGCLASS
+ (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+ VR128))>;
+
+ def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
+ (VPBROADCASTWrr (COPY_TO_REGCLASS
+ (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+ VR128))>;
+ def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
+ (VPBROADCASTWYrr (COPY_TO_REGCLASS
+ (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+ VR128))>;
+
+ // The patterns for VPBROADCASTD are not needed because they would match
+ // the exact same thing as VBROADCASTSS patterns.
+
+ def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
+ (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
}
}
@@ -8124,7 +8517,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
- VEX_4V, VEX_L;
+ Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr,
@@ -8132,7 +8525,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
[(set VR256:$dst,
(OpVT (X86VPermv VR256:$src1,
(bitconvert (mem_frag addr:$src2)))))]>,
- VEX_4V, VEX_L;
+ Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}
defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>;
@@ -8147,14 +8540,15 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
- VEX, VEX_L;
+ Sched<[WriteShuffle256]>, VEX, VEX_L;
def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(OpVT (X86VPermi (mem_frag addr:$src1),
- (i8 imm:$src2))))]>, VEX, VEX_L;
+ (i8 imm:$src2))))]>,
+ Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX, VEX_L;
}
defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W;
@@ -8168,12 +8562,14 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
- (i8 imm:$src3))))]>, VEX_4V, VEX_L;
+ (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
+ VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, i8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
- (i8 imm:$src3)))]>, VEX_4V, VEX_L;
+ (i8 imm:$src3)))]>,
+ Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
@@ -8202,12 +8598,12 @@ let neverHasSideEffects = 1 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, i8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V, VEX_L;
+ []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i128mem:$src2, i8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V, VEX_L;
+ []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX2] in {
@@ -8257,12 +8653,12 @@ def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
"vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
- VEX, VEX_L;
+ Sched<[WriteShuffle256]>, VEX, VEX_L;
let neverHasSideEffects = 1, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
(ins i128mem:$dst, VR256:$src1, i8imm:$src2),
"vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- VEX, VEX_L;
+ Sched<[WriteStore]>, VEX, VEX_L;
let Predicates = [HasAVX2] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
@@ -8347,27 +8743,27 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
- VEX_4V;
+ VEX_4V, Sched<[WriteVarVecShift]>;
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1,
(vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
- VEX_4V;
+ VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1,
(vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
}
defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
@@ -8395,12 +8791,18 @@ multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
let mayLoad = 1, Constraints
= "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
in {
- defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
- defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
- defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
- defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;
+
+ let ExeDomain = SSEPackedDouble in {
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
+ }
+
+ let ExeDomain = SSEPackedSingle in {
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
+ }
}
diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td
index 0191c01..c847be7 100644
--- a/lib/Target/X86/X86InstrSVM.td
+++ b/lib/Target/X86/X86InstrSVM.td
@@ -31,7 +31,7 @@ def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB;
// 0F 01 D8
let Uses = [EAX] in
def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
- "vmrun\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>;
+ "vmrun\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
let Uses = [RAX] in
def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
"vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
@@ -39,7 +39,7 @@ def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
// 0F 01 DA
let Uses = [EAX] in
def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
- "vmload\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>;
+ "vmload\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
let Uses = [RAX] in
def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
"vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
@@ -47,7 +47,7 @@ def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
// 0F 01 DB
let Uses = [EAX] in
def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
- "vmsave\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>;
+ "vmsave\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
let Uses = [RAX] in
def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
"vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
@@ -55,7 +55,7 @@ def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
// 0F 01 DF
let Uses = [EAX, ECX] in
def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[In32BitMode]>;
+ "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[Not64BitMode]>;
let Uses = [RAX, ECX] in
def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
"invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 1937770..d0bb523 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -22,10 +22,10 @@ def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
[(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
"shl{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize;
+ [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize16;
def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
"shl{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>;
+ [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>, OpSize32;
def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
"shl{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
@@ -39,10 +39,11 @@ let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
"shl{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>,
- OpSize;
+ OpSize16;
def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"shl{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>;
+ [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>,
+ OpSize32;
def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
(ins GR64:$src1, i8imm:$src2),
"shl{q}\t{$src2, $dst|$dst, $src2}",
@@ -55,9 +56,9 @@ let hasSideEffects = 0 in {
def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
"shl{b}\t$dst", [], IIC_SR>;
def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
- "shl{w}\t$dst", [], IIC_SR>, OpSize;
+ "shl{w}\t$dst", [], IIC_SR>, OpSize16;
def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
- "shl{l}\t$dst", [], IIC_SR>;
+ "shl{l}\t$dst", [], IIC_SR>, OpSize32;
def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
"shl{q}\t$dst", [], IIC_SR>;
} // hasSideEffects = 0
@@ -75,10 +76,11 @@ def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
"shl{w}\t{%cl, $dst|$dst, cl}",
[(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
- OpSize;
+ OpSize16;
def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
"shl{l}\t{%cl, $dst|$dst, cl}",
- [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
+ [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize32;
def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
"shl{q}\t{%cl, $dst|$dst, cl}",
[(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
@@ -90,12 +92,11 @@ def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src),
"shl{w}\t{$src, $dst|$dst, $src}",
[(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>,
- OpSize;
+ IIC_SR>, OpSize16;
def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src),
"shl{l}\t{$src, $dst|$dst, $src}",
[(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src),
"shl{q}\t{$src, $dst|$dst, $src}",
[(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
@@ -109,12 +110,11 @@ def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
"shl{w}\t$dst",
[(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>,
- OpSize;
+ IIC_SR>, OpSize16;
def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
"shl{l}\t$dst",
[(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
"shl{q}\t$dst",
[(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
@@ -128,10 +128,10 @@ def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
[(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
"shr{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize;
+ [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize16;
def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
"shr{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>;
+ [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>, OpSize32;
def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
"shr{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
@@ -143,11 +143,11 @@ def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
"shr{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))],
- IIC_SR>, OpSize;
+ IIC_SR>, OpSize16;
def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"shr{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
"shr{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>;
@@ -158,10 +158,10 @@ def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
[(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>;
def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
"shr{w}\t$dst",
- [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize;
+ [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize16;
def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
"shr{l}\t$dst",
- [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>;
+ [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>, OpSize32;
def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
"shr{q}\t$dst",
[(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>;
@@ -176,10 +176,11 @@ def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
"shr{w}\t{%cl, $dst|$dst, cl}",
[(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
- OpSize;
+ OpSize16;
def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
"shr{l}\t{%cl, $dst|$dst, cl}",
- [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
+ [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ OpSize32;
def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
"shr{q}\t{%cl, $dst|$dst, cl}",
[(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
@@ -191,12 +192,11 @@ def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src),
"shr{w}\t{$src, $dst|$dst, $src}",
[(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>,
- OpSize;
+ IIC_SR>, OpSize16;
def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src),
"shr{l}\t{$src, $dst|$dst, $src}",
[(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src),
"shr{q}\t{$src, $dst|$dst, $src}",
[(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
@@ -210,11 +210,11 @@ def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
"shr{w}\t$dst",
[(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>,OpSize;
+ IIC_SR>, OpSize16;
def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
"shr{l}\t$dst",
[(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
"shr{q}\t$dst",
[(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
@@ -230,11 +230,11 @@ def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
"sar{w}\t{%cl, $dst|$dst, cl}",
[(set GR16:$dst, (sra GR16:$src1, CL))],
- IIC_SR>, OpSize;
+ IIC_SR>, OpSize16;
def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
"sar{l}\t{%cl, $dst|$dst, cl}",
[(set GR32:$dst, (sra GR32:$src1, CL))],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
"sar{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (sra GR64:$src1, CL))],
@@ -248,12 +248,11 @@ def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
"sar{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))],
- IIC_SR>,
- OpSize;
+ IIC_SR>, OpSize16;
def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"sar{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst),
(ins GR64:$src1, i8imm:$src2),
"sar{q}\t{$src2, $dst|$dst, $src2}",
@@ -268,11 +267,11 @@ def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
"sar{w}\t$dst",
[(set GR16:$dst, (sra GR16:$src1, (i8 1)))],
- IIC_SR>, OpSize;
+ IIC_SR>, OpSize16;
def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
"sar{l}\t$dst",
[(set GR32:$dst, (sra GR32:$src1, (i8 1)))],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
"sar{q}\t$dst",
[(set GR64:$dst, (sra GR64:$src1, (i8 1)))],
@@ -289,11 +288,11 @@ def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
"sar{w}\t{%cl, $dst|$dst, cl}",
[(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize;
+ IIC_SR>, OpSize16;
def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
"sar{l}\t{%cl, $dst|$dst, cl}",
[(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
"sar{q}\t{%cl, $dst|$dst, cl}",
[(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
@@ -306,12 +305,11 @@ def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src),
def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src),
"sar{w}\t{$src, $dst|$dst, $src}",
[(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>,
- OpSize;
+ IIC_SR>, OpSize16;
def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src),
"sar{l}\t{$src, $dst|$dst, $src}",
[(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src),
"sar{q}\t{$src, $dst|$dst, $src}",
[(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
@@ -325,12 +323,11 @@ def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
"sar{w}\t$dst",
[(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>,
- OpSize;
+ IIC_SR>, OpSize16;
def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
"sar{l}\t$dst",
[(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
"sar{q}\t$dst",
[(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)],
@@ -352,20 +349,20 @@ def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
"rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
- "rcl{w}\t$dst", [], IIC_SR>, OpSize;
+ "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
- "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
let Uses = [CL] in
def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
- "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
+ "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
- "rcl{l}\t$dst", [], IIC_SR>;
+ "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
- "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
let Uses = [CL] in
def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
- "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
@@ -386,20 +383,20 @@ def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
"rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
- "rcr{w}\t$dst", [], IIC_SR>, OpSize;
+ "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
- "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
let Uses = [CL] in
def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
- "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
+ "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
- "rcr{l}\t$dst", [], IIC_SR>;
+ "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
- "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
let Uses = [CL] in
def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
- "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
"rcr{q}\t$dst", [], IIC_SR>;
@@ -417,13 +414,13 @@ def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt),
"rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
- "rcl{w}\t$dst", [], IIC_SR>, OpSize;
+ "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt),
- "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
- "rcl{l}\t$dst", [], IIC_SR>;
+ "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt),
- "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
"rcl{q}\t$dst", [], IIC_SR>;
def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt),
@@ -434,13 +431,13 @@ def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt),
"rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
- "rcr{w}\t$dst", [], IIC_SR>, OpSize;
+ "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt),
- "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
- "rcr{l}\t$dst", [], IIC_SR>;
+ "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt),
- "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
"rcr{q}\t$dst", [], IIC_SR>;
def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt),
@@ -450,18 +447,18 @@ let Uses = [CL] in {
def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
"rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
- "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
+ "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
- "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
"rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
"rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
- "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
+ "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
- "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
"rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
}
@@ -476,10 +473,10 @@ def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
[(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"rol{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize;
+ [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize16;
def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"rol{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>;
+ [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>, OpSize32;
def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
"rol{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
@@ -491,12 +488,11 @@ def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
"rol{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))],
- IIC_SR>,
- OpSize;
+ IIC_SR>, OpSize16;
def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"rol{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
(ins GR64:$src1, i8imm:$src2),
"rol{q}\t{$src2, $dst|$dst, $src2}",
@@ -511,11 +507,11 @@ def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"rol{w}\t$dst",
[(set GR16:$dst, (rotl GR16:$src1, (i8 1)))],
- IIC_SR>, OpSize;
+ IIC_SR>, OpSize16;
def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"rol{l}\t$dst",
[(set GR32:$dst, (rotl GR32:$src1, (i8 1)))],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
"rol{q}\t$dst",
[(set GR64:$dst, (rotl GR64:$src1, (i8 1)))],
@@ -531,11 +527,11 @@ def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
"rol{w}\t{%cl, $dst|$dst, cl}",
[(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize;
+ IIC_SR>, OpSize16;
def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
"rol{l}\t{%cl, $dst|$dst, cl}",
[(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
"rol{q}\t{%cl, $dst|$dst, cl}",
[(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
@@ -548,12 +544,11 @@ def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1),
def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1),
"rol{w}\t{$src1, $dst|$dst, $src1}",
[(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)],
- IIC_SR>,
- OpSize;
+ IIC_SR>, OpSize16;
def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1),
"rol{l}\t{$src1, $dst|$dst, $src1}",
[(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1),
"rol{q}\t{$src1, $dst|$dst, $src1}",
[(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)],
@@ -567,12 +562,11 @@ def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
"rol{w}\t$dst",
[(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>,
- OpSize;
+ IIC_SR>, OpSize16;
def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
"rol{l}\t$dst",
[(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
"rol{q}\t$dst",
[(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
@@ -586,10 +580,10 @@ def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
[(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t{%cl, $dst|$dst, cl}",
- [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize;
+ [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize16;
def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t{%cl, $dst|$dst, cl}",
- [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>;
+ [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>, OpSize32;
def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t{%cl, $dst|$dst, cl}",
[(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
@@ -601,12 +595,11 @@ def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
"ror{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))],
- IIC_SR>,
- OpSize;
+ IIC_SR>, OpSize16;
def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"ror{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
(ins GR64:$src1, i8imm:$src2),
"ror{q}\t{$src2, $dst|$dst, $src2}",
@@ -621,11 +614,11 @@ def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t$dst",
[(set GR16:$dst, (rotr GR16:$src1, (i8 1)))],
- IIC_SR>, OpSize;
+ IIC_SR>, OpSize16;
def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t$dst",
[(set GR32:$dst, (rotr GR32:$src1, (i8 1)))],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t$dst",
[(set GR64:$dst, (rotr GR64:$src1, (i8 1)))],
@@ -641,11 +634,11 @@ def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t{%cl, $dst|$dst, cl}",
[(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
- IIC_SR>, OpSize;
+ IIC_SR>, OpSize16;
def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t{%cl, $dst|$dst, cl}",
[(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t{%cl, $dst|$dst, cl}",
[(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
@@ -658,12 +651,11 @@ def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src),
"ror{w}\t{$src, $dst|$dst, $src}",
[(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>,
- OpSize;
+ IIC_SR>, OpSize16;
def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src),
"ror{l}\t{$src, $dst|$dst, $src}",
[(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src),
"ror{q}\t{$src, $dst|$dst, $src}",
[(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
@@ -677,12 +669,11 @@ def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t$dst",
[(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>,
- OpSize;
+ IIC_SR>, OpSize16;
def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t$dst",
[(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, OpSize32;
def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t$dst",
[(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)],
@@ -702,23 +693,23 @@ def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
IIC_SHD16_REG_CL>,
- TB, OpSize;
+ TB, OpSize16;
def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
IIC_SHD16_REG_CL>,
- TB, OpSize;
+ TB, OpSize16;
def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
- IIC_SHD32_REG_CL>, TB;
+ IIC_SHD32_REG_CL>, TB, OpSize32;
def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
- IIC_SHD32_REG_CL>, TB;
+ IIC_SHD32_REG_CL>, TB, OpSize32;
def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
@@ -740,28 +731,28 @@ def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
(i8 imm:$src3)))], IIC_SHD16_REG_IM>,
- TB, OpSize;
+ TB, OpSize16;
def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
(outs GR16:$dst),
(ins GR16:$src1, GR16:$src2, i8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
(i8 imm:$src3)))], IIC_SHD16_REG_IM>,
- TB, OpSize;
+ TB, OpSize16;
def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, i8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
(i8 imm:$src3)))], IIC_SHD32_REG_IM>,
- TB;
+ TB, OpSize32;
def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, i8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
(i8 imm:$src3)))], IIC_SHD32_REG_IM>,
- TB;
+ TB, OpSize32;
def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, i8imm:$src3),
@@ -784,20 +775,20 @@ let Uses = [CL] in {
def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
- addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
+ addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
- addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
+ addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
- addr:$dst)], IIC_SHD32_MEM_CL>, TB;
+ addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
- addr:$dst)], IIC_SHD32_MEM_CL>, TB;
+ addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
@@ -815,14 +806,14 @@ def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
[(store (X86shld (loadi16 addr:$dst), GR16:$src2,
(i8 imm:$src3)), addr:$dst)],
IIC_SHD16_MEM_IM>,
- TB, OpSize;
+ TB, OpSize16;
def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
(i8 imm:$src3)), addr:$dst)],
IIC_SHD16_MEM_IM>,
- TB, OpSize;
+ TB, OpSize16;
def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
@@ -830,14 +821,14 @@ def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
[(store (X86shld (loadi32 addr:$dst), GR32:$src2,
(i8 imm:$src3)), addr:$dst)],
IIC_SHD32_MEM_IM>,
- TB;
+ TB, OpSize32;
def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
(i8 imm:$src3)), addr:$dst)],
IIC_SHD32_MEM_IM>,
- TB;
+ TB, OpSize32;
def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
@@ -905,8 +896,8 @@ let Predicates = [HasBMI2] in {
defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W;
defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD;
defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W;
- defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8, OpSize;
- defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, OpSize, VEX_W;
+ defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8PD;
+ defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8PD, VEX_W;
// Prefer RORX which is non-destructive and doesn't update EFLAGS.
let AddedComplexity = 10 in {
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 2196dc3..9d3aa1c 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -61,11 +61,12 @@ def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [],
def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [],
IIC_SYS_ENTER_EXIT>, TB;
-def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", []>, TB,
- Requires<[In64BitMode]>;
+def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [],
+ IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>;
-def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize;
-def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>;
+def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize16;
+def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>,
+ OpSize32;
def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>,
Requires<[In64BitMode]>;
} // SchedRW
@@ -80,44 +81,41 @@ def IN8rr : I<0xEC, RawFrm, (outs), (ins),
"in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>;
let Defs = [AX], Uses = [DX] in
def IN16rr : I<0xED, RawFrm, (outs), (ins),
- "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize;
+ "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize16;
let Defs = [EAX], Uses = [DX] in
def IN32rr : I<0xED, RawFrm, (outs), (ins),
- "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>;
+ "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32;
let Defs = [AL] in
def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port),
"in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>;
let Defs = [AX] in
def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
- "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize;
+ "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16;
let Defs = [EAX] in
def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
- "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>;
+ "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32;
let Uses = [DX, AL] in
def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
"out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>;
let Uses = [DX, AX] in
def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
- "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize;
+ "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize16;
let Uses = [DX, EAX] in
def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
- "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>;
+ "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32;
let Uses = [AL] in
def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port),
"out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>;
let Uses = [AX] in
def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
- "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize;
+ "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16;
let Uses = [EAX] in
def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
- "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>;
+ "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32;
-def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>;
-def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>, OpSize;
-def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", [], IIC_INS>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -125,14 +123,18 @@ def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", [], IIC_INS>;
let SchedRW = [WriteSystem] in {
def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+ Requires<[Not64BitMode]>;
def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB;
-
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+ Requires<[In64BitMode]>;
+
def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+ Requires<[Not64BitMode]>;
def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+ Requires<[In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -140,14 +142,18 @@ def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
let SchedRW = [WriteSystem] in {
def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+ Requires<[Not64BitMode]>;
def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB;
-
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+ Requires<[In64BitMode]>;
+
def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+ Requires<[Not64BitMode]>;
def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+ Requires<[In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -167,30 +173,30 @@ def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
let SchedRW = [WriteMove] in {
def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize16;
def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize32;
def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16;
def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32;
def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16;
def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize32;
def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize16;
def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize32;
def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
} // SchedRW
@@ -202,15 +208,19 @@ let SchedRW = [WriteSystem] in {
def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize;
+ "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
+ OpSize16;
def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, OpSize;
+ "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
+ OpSize16;
// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo.
def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
- "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
+ "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
+ OpSize32;
def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+ "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
+ OpSize32;
// i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo.
def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
"lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
@@ -218,13 +228,17 @@ def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
"lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, OpSize;
+ "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
+ OpSize16;
def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, OpSize;
+ "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
+ OpSize16;
def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
+ "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
+ OpSize32;
def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
+ "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
+ OpSize32;
def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
@@ -234,9 +248,9 @@ def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr",
[], IIC_INVLPG>, TB;
def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
- "str{w}\t$dst", [], IIC_STR>, TB, OpSize;
+ "str{w}\t$dst", [], IIC_STR>, TB, OpSize16;
def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
- "str{l}\t$dst", [], IIC_STR>, TB;
+ "str{l}\t$dst", [], IIC_STR>, TB, OpSize32;
def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
"str{q}\t$dst", [], IIC_STR>, TB;
def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins),
@@ -248,105 +262,115 @@ def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
"ltr{w}\t$src", [], IIC_LTR>, TB;
def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
- "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
- OpSize;
+ "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
- "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>;
+ "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>,
+ OpSize32, Requires<[Not64BitMode]>;
def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
- "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
- OpSize;
+ "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
- "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+ "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
- "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
- OpSize;
+ "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
- "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+ "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
- "push{w}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
- OpSize;
+ "push{w}\t{%es|es}", [], IIC_PUSH_SR>,
+ OpSize16, Requires<[Not64BitMode]>;
def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
- "push{l}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
-
+ "push{l}\t{%es|es}", [], IIC_PUSH_SR>,
+ OpSize32, Requires<[Not64BitMode]>;
def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
- "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize, TB;
+ "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize16, TB;
def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
- "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
+ "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
- "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize, TB;
+ "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize16, TB;
def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
- "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
-
+ "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
- "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB;
+ "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
+ OpSize32, Requires<[In64BitMode]>;
def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
- "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB;
+ "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
+ OpSize32, Requires<[In64BitMode]>;
// No "pop cs" instruction.
def POPSS16 : I<0x17, RawFrm, (outs), (ins),
"pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>,
- OpSize, Requires<[In32BitMode]>;
+ OpSize16, Requires<[Not64BitMode]>;
def POPSS32 : I<0x17, RawFrm, (outs), (ins),
"pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>,
- Requires<[In32BitMode]>;
-
+ OpSize32, Requires<[Not64BitMode]>;
+
def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
"pop{w}\t{%ds|ds}", [], IIC_POP_SR>,
- OpSize, Requires<[In32BitMode]>;
+ OpSize16, Requires<[Not64BitMode]>;
def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
"pop{l}\t{%ds|ds}", [], IIC_POP_SR>,
- Requires<[In32BitMode]>;
-
+ OpSize32, Requires<[Not64BitMode]>;
+
def POPES16 : I<0x07, RawFrm, (outs), (ins),
"pop{w}\t{%es|es}", [], IIC_POP_SR>,
- OpSize, Requires<[In32BitMode]>;
+ OpSize16, Requires<[Not64BitMode]>;
def POPES32 : I<0x07, RawFrm, (outs), (ins),
"pop{l}\t{%es|es}", [], IIC_POP_SR>,
- Requires<[In32BitMode]>;
-
+ OpSize32, Requires<[Not64BitMode]>;
+
def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
- "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize, TB;
+ "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize16, TB;
def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
- "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
+ "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
- "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB;
-
+ "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
- "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize, TB;
+ "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize16, TB;
def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
- "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
+ "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB,
+ OpSize32, Requires<[Not64BitMode]>;
def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
- "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB;
-
+ "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB,
+ OpSize32, Requires<[In64BitMode]>;
+
def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize;
+ "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16;
def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>;
+ "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32;
def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
+ "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+ "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
"lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize;
+ "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16;
def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>;
+ "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32;
def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
+ "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+ "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
"lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
- "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
+ "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
- "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+ "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
"lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
@@ -367,19 +391,23 @@ def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
let SchedRW = [WriteSystem] in {
def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
- "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>;
-def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
- "sgdt\t$dst", [], IIC_SGDT>, TB;
+ "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SGDT32m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+ "sgdt{l}\t$dst", [], IIC_SGDT>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SGDT64m : I<0x01, MRM0m, (outs opaque80mem:$dst), (ins),
+ "sgdt{q}\t$dst", [], IIC_SGDT>, TB, Requires <[In64BitMode]>;
def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
- "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize, Requires<[In32BitMode]>;
-def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
- "sidt\t$dst", []>, TB;
+ "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SIDT32m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+ "sidt{l}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SIDT64m : I<0x01, MRM1m, (outs opaque80mem:$dst), (ins),
+ "sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
- "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize;
+ "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16;
def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins),
"sldt{w}\t$dst", [], IIC_SLDT>, TB;
def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
- "sldt{l}\t$dst", [], IIC_SLDT>, TB;
+ "sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB;
// LLDT is not interpreted specially in 64-bit mode because there is no sign
// extension.
@@ -389,13 +417,17 @@ def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins),
"sldt{q}\t$dst", [], IIC_SLDT>, TB;
def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
- "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize, Requires<[In32BitMode]>;
-def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
- "lgdt\t$src", [], IIC_LGDT>, TB;
+ "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LGDT32m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+ "lgdt{l}\t$src", [], IIC_LGDT>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LGDT64m : I<0x01, MRM2m, (outs), (ins opaque80mem:$src),
+ "lgdt{q}\t$src", [], IIC_LGDT>, TB, Requires<[In64BitMode]>;
def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
- "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize, Requires<[In32BitMode]>;
-def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
- "lidt\t$src", [], IIC_LIDT>, TB;
+ "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LIDT32m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+ "lidt{l}\t$src", [], IIC_LIDT>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LIDT64m : I<0x01, MRM3m, (outs), (ins opaque80mem:$src),
+ "lidt{q}\t$src", [], IIC_LIDT>, TB, Requires<[In64BitMode]>;
def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
"lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
@@ -410,9 +442,9 @@ def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB;
def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
- "smsw{w}\t$dst", [], IIC_SMSW>, OpSize, TB;
+ "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB;
def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins),
- "smsw{l}\t$dst", [], IIC_SMSW>, TB;
+ "smsw{l}\t$dst", [], IIC_SMSW>, OpSize32, TB;
// no m form encodable; use SMSW16m
def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
"smsw{q}\t$dst", [], IIC_SMSW>, TB;
@@ -425,8 +457,13 @@ def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
"lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB;
def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
"lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
-
-def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB;
+
+let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in
+ def CPUID32 : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB,
+ Requires<[Not64BitMode]>;
+let Defs = [RAX, RBX, RCX, RDX], Uses = [RAX, RCX] in
+ def CPUID64 : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB,
+ Requires<[In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -448,77 +485,77 @@ let Uses = [RDX, RAX, RCX] in
let Uses = [RDX, RAX] in {
def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
"xsave\t$dst", []>, TB;
- def XSAVE64 : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
- "xsave{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+ def XSAVE64 : RI<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
+ "xsave{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>;
def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
"xrstor\t$dst", []>, TB;
- def XRSTOR64 : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
- "xrstor{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+ def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xrstor{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>;
def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
"xsaveopt\t$dst", []>, TB;
- def XSAVEOPT64 : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
- "xsaveopt{q|64}\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+ def XSAVEOPT64 : RI<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
+ "xsaveopt{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>;
}
} // SchedRW
//===----------------------------------------------------------------------===//
// VIA PadLock crypto instructions
let Defs = [RAX, RDI], Uses = [RDX, RDI] in
- def XSTORE : I<0xc0, RawFrm, (outs), (ins), "xstore", []>, A7;
+ def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB;
def : InstAlias<"xstorerng", (XSTORE)>;
let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in {
- def XCRYPTECB : I<0xc8, RawFrm, (outs), (ins), "xcryptecb", []>, A7;
- def XCRYPTCBC : I<0xd0, RawFrm, (outs), (ins), "xcryptcbc", []>, A7;
- def XCRYPTCTR : I<0xd8, RawFrm, (outs), (ins), "xcryptctr", []>, A7;
- def XCRYPTCFB : I<0xe0, RawFrm, (outs), (ins), "xcryptcfb", []>, A7;
- def XCRYPTOFB : I<0xe8, RawFrm, (outs), (ins), "xcryptofb", []>, A7;
+ def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB;
+ def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB;
+ def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB;
+ def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB;
+ def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB;
}
let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
- def XSHA1 : I<0xc8, RawFrm, (outs), (ins), "xsha1", []>, A6;
- def XSHA256 : I<0xd0, RawFrm, (outs), (ins), "xsha256", []>, A6;
+ def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB;
+ def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB;
}
let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
- def MONTMUL : I<0xc0, RawFrm, (outs), (ins), "montmul", []>, A6;
+ def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB;
//===----------------------------------------------------------------------===//
// FS/GS Base Instructions
let Predicates = [HasFSGSBase, In64BitMode] in {
def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
"rdfsbase{l}\t$dst",
- [(set GR32:$dst, (int_x86_rdfsbase_32))]>, TB, XS;
+ [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS;
def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
"rdfsbase{q}\t$dst",
- [(set GR64:$dst, (int_x86_rdfsbase_64))]>, TB, XS;
+ [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS;
def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
"rdgsbase{l}\t$dst",
- [(set GR32:$dst, (int_x86_rdgsbase_32))]>, TB, XS;
+ [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS;
def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
"rdgsbase{q}\t$dst",
- [(set GR64:$dst, (int_x86_rdgsbase_64))]>, TB, XS;
+ [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS;
def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
"wrfsbase{l}\t$src",
- [(int_x86_wrfsbase_32 GR32:$src)]>, TB, XS;
+ [(int_x86_wrfsbase_32 GR32:$src)]>, XS;
def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
"wrfsbase{q}\t$src",
- [(int_x86_wrfsbase_64 GR64:$src)]>, TB, XS;
+ [(int_x86_wrfsbase_64 GR64:$src)]>, XS;
def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
"wrgsbase{l}\t$src",
- [(int_x86_wrgsbase_32 GR32:$src)]>, TB, XS;
+ [(int_x86_wrgsbase_32 GR32:$src)]>, XS;
def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
"wrgsbase{q}\t$src",
- [(int_x86_wrgsbase_64 GR64:$src)]>, TB, XS;
+ [(int_x86_wrgsbase_64 GR64:$src)]>, XS;
}
//===----------------------------------------------------------------------===//
// INVPCID Instruction
def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invpcid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
- Requires<[In32BitMode]>;
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[Not64BitMode]>;
def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invpcid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index 59a6f1e..4940efc 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -40,7 +40,8 @@ def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
// HLE prefixes
+let isAsmParserOnly = 1 in {
def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>;
-
def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>;
+}
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index 6d3548f..79afe9a 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -17,22 +17,22 @@
// 66 0F 38 80
def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
- Requires<[In32BitMode]>;
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[Not64BitMode]>;
def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
Requires<[In64BitMode]>;
// 66 0F 38 81
def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
- Requires<[In32BitMode]>;
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ Requires<[Not64BitMode]>;
def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
Requires<[In64BitMode]>;
// 0F 01 C1
def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
- "vmclear\t$vmcs", []>, OpSize, TB;
+ "vmclear\t$vmcs", []>, PD;
// OF 01 D4
def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
// 0F 01 C2
@@ -40,25 +40,25 @@ def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
// 0F 01 C3
def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
- "vmptrld\t$vmcs", []>, TB;
+ "vmptrld\t$vmcs", []>, PS;
def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins),
"vmptrst\t$vmcs", []>, TB;
def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src),
- "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), (ins GR32:$src),
- "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>;
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>;
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>;
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>;
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
// 0F 01 C4
def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 2b6ee5c..45e2ff0 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -14,10 +14,10 @@
multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
}
defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>;
@@ -41,10 +41,10 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
Operand memop, ComplexPattern mem_cpat> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, VEX;
+ [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP;
}
defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
@@ -56,10 +56,10 @@ multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
PatFrag memop> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
}
defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
@@ -69,10 +69,10 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
PatFrag memop> {
def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L;
+ [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L;
def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX, VEX_L;
+ [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L;
}
defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, memopv8f32>;
@@ -82,19 +82,19 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX_4VOp3;
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, XOP_4VOp3;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>,
- VEX_4V, VEX_W;
+ XOP_4V, VEX_W;
def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>,
- VEX_4VOp3;
+ XOP_4VOp3;
}
defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
@@ -114,12 +114,12 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, i8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, VEX;
+ [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, XOP;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, i8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, VEX;
+ (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, XOP;
}
defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
@@ -134,14 +134,14 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_4V, VEX_I8IMM;
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V, VEX_I8IMM;
def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
- VR128:$src3))]>, VEX_4V, VEX_I8IMM;
+ VR128:$src3))]>, XOP_4V, VEX_I8IMM;
}
defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
@@ -164,14 +164,14 @@ multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst, (Int VR128:$src1, VR128:$src2, imm:$src3))]>,
- VEX_4V;
+ XOP_4V;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
- imm:$src3))]>, VEX_4V;
+ imm:$src3))]>, XOP_4V;
}
defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
@@ -190,7 +190,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>,
- VEX_4V, VEX_I8IMM;
+ XOP_4V, VEX_I8IMM;
def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr,
@@ -198,7 +198,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR128:$dst,
(Int VR128:$src1, VR128:$src2,
(bitconvert (memopv2i64 addr:$src3))))]>,
- VEX_4V, VEX_I8IMM, VEX_W, MemOp4;
+ XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -206,7 +206,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR128:$dst,
(Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
VR128:$src3))]>,
- VEX_4V, VEX_I8IMM;
+ XOP_4V, VEX_I8IMM;
}
defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
@@ -218,7 +218,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>,
- VEX_4V, VEX_I8IMM, VEX_L;
+ XOP_4V, VEX_I8IMM, VEX_L;
def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3),
!strconcat(OpcodeStr,
@@ -226,7 +226,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR256:$dst,
(Int VR256:$src1, VR256:$src2,
(bitconvert (memopv4i64 addr:$src3))))]>,
- VEX_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
+ XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
@@ -234,7 +234,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR256:$dst,
(Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)),
VR256:$src3))]>,
- VEX_4V, VEX_I8IMM, VEX_L;
+ XOP_4V, VEX_I8IMM, VEX_L;
}
defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
index f916327..4d279de 100644
--- a/lib/Target/X86/X86JITInfo.h
+++ b/lib/Target/X86/X86JITInfo.h
@@ -35,41 +35,41 @@ namespace llvm {
/// overwriting OLD with a branch to NEW. This is used for self-modifying
/// code.
///
- virtual void replaceMachineCodeForFunction(void *Old, void *New);
+ void replaceMachineCodeForFunction(void *Old, void *New) override;
/// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
/// to emit an indirect symbol which contains the address of the specified
/// ptr.
- virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
- JITCodeEmitter &JCE);
+ void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+ JITCodeEmitter &JCE) override;
// getStubLayout - Returns the size and alignment of the largest call stub
// on X86.
- virtual StubLayout getStubLayout();
+ StubLayout getStubLayout() override;
/// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
/// small native function that simply calls the function at the specified
/// address.
- virtual void *emitFunctionStub(const Function* F, void *Target,
- JITCodeEmitter &JCE);
+ void *emitFunctionStub(const Function* F, void *Target,
+ JITCodeEmitter &JCE) override;
/// getPICJumpTableEntry - Returns the value of the jumptable entry for the
/// specific basic block.
- virtual uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase);
+ uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase) override;
/// getLazyResolverFunction - Expose the lazy resolver to the JIT.
- virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+ LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
/// relocate - Before the JIT can run a block of code that has been emitted,
/// it must rewrite the code to contain the actual addresses of any
/// referenced global symbols.
- virtual void relocate(void *Function, MachineRelocation *MR,
- unsigned NumRelocs, unsigned char* GOTBase);
+ void relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase) override;
/// allocateThreadLocalMemory - Each target has its own way of
/// handling thread local variables. This method returns a value only
/// meaningful to the target.
- virtual char* allocateThreadLocalMemory(size_t size);
+ char* allocateThreadLocalMemory(size_t size) override;
/// setPICBase / getPICBase - Getter / setter of PICBase, used to compute
/// PIC jumptable entry.
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 6649c82..6d7f3cb 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -14,11 +14,14 @@
#include "X86AsmPrinter.h"
#include "InstPrinter/X86ATTInstPrinter.h"
-#include "X86COFFMachineModuleInfo.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/StackMaps.h"
-#include "llvm/IR/Type.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -26,8 +29,6 @@
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/Mangler.h"
using namespace llvm;
namespace {
@@ -70,42 +71,53 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
/// operand to an MCSymbol.
MCSymbol *X86MCInstLower::
GetSymbolFromOperand(const MachineOperand &MO) const {
+ const DataLayout *DL = TM.getDataLayout();
assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
SmallString<128> Name;
+ StringRef Suffix;
+
+ switch (MO.getTargetFlags()) {
+ case X86II::MO_DLLIMPORT:
+ // Handle dllimport linkage.
+ Name += "__imp_";
+ break;
+ case X86II::MO_DARWIN_STUB:
+ Suffix = "$stub";
+ break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ Suffix = "$non_lazy_ptr";
+ break;
+ }
+
+ if (!Suffix.empty())
+ Name += DL->getPrivateGlobalPrefix();
+
+ unsigned PrefixLen = Name.size();
if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
- bool isImplicitlyPrivate = false;
- if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB ||
- MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
- MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
- MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
- isImplicitlyPrivate = true;
-
- getMang()->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+ AsmPrinter.getNameWithPrefix(Name, GV);
} else if (MO.isSymbol()) {
- Name += MAI.getGlobalPrefix();
- Name += MO.getSymbolName();
+ getMang()->getNameWithPrefix(Name, MO.getSymbolName());
} else if (MO.isMBB()) {
Name += MO.getMBB()->getSymbol()->getName();
}
+ unsigned OrigLen = Name.size() - PrefixLen;
+
+ Name += Suffix;
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
+
+ StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen);
// If the target flags on the operand changes the name of the symbol, do that
// before we return the symbol.
switch (MO.getTargetFlags()) {
default: break;
- case X86II::MO_DLLIMPORT: {
- // Handle dllimport linkage.
- const char *Prefix = "__imp_";
- Name.insert(Name.begin(), Prefix, Prefix+strlen(Prefix));
- break;
- }
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
- Name += "$non_lazy_ptr";
- MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
-
MachineModuleInfoImpl::StubValueTy &StubSym =
getMachOMMI().getGVStubEntry(Sym);
if (StubSym.getPointer() == 0) {
@@ -115,11 +127,9 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
!MO.getGlobal()->hasInternalLinkage());
}
- return Sym;
+ break;
}
case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: {
- Name += "$non_lazy_ptr";
- MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
MachineModuleInfoImpl::StubValueTy &StubSym =
getMachOMMI().getHiddenGVStubEntry(Sym);
if (StubSym.getPointer() == 0) {
@@ -129,11 +139,9 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
!MO.getGlobal()->hasInternalLinkage());
}
- return Sym;
+ break;
}
case X86II::MO_DARWIN_STUB: {
- Name += "$stub";
- MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
MachineModuleInfoImpl::StubValueTy &StubSym =
getMachOMMI().getFnStubEntry(Sym);
if (StubSym.getPointer())
@@ -145,16 +153,15 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
!MO.getGlobal()->hasInternalLinkage());
} else {
- Name.erase(Name.end()-5, Name.end());
StubSym =
MachineModuleInfoImpl::
- StubValueTy(Ctx.GetOrCreateSymbol(Name.str()), false);
+ StubValueTy(Ctx.GetOrCreateSymbol(OrigName), false);
}
- return Sym;
+ break;
}
}
- return Ctx.GetOrCreateSymbol(Name.str());
+ return Sym;
}
MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
@@ -227,13 +234,6 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
}
-/// LowerUnaryToTwoAddr - R = setb -> R = sbb R, R
-static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) {
- OutMI.setOpcode(NewOpc);
- OutMI.addOperand(OutMI.getOperand(0));
- OutMI.addOperand(OutMI.getOperand(0));
-}
-
/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with
/// a short fixed-register form.
static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
@@ -297,12 +297,12 @@ static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
unsigned RegOp = IsStore ? 0 : 5;
unsigned AddrOp = AddrBase + 3;
assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
- Inst.getOperand(AddrBase + 0).isReg() && // base
- Inst.getOperand(AddrBase + 1).isImm() && // scale
- Inst.getOperand(AddrBase + 2).isReg() && // index register
- (Inst.getOperand(AddrOp).isExpr() || // address
- Inst.getOperand(AddrOp).isImm())&&
- Inst.getOperand(AddrBase + 4).isReg() && // segment
+ Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
+ Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
+ Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
+ (Inst.getOperand(AddrOp).isExpr() ||
+ Inst.getOperand(AddrOp).isImm()) &&
"Unexpected instruction!");
// Check whether the destination register can be fixed.
@@ -322,17 +322,23 @@ static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
}
if (Absolute &&
- (Inst.getOperand(AddrBase + 0).getReg() != 0 ||
- Inst.getOperand(AddrBase + 2).getReg() != 0 ||
- Inst.getOperand(AddrBase + 4).getReg() != 0 ||
- Inst.getOperand(AddrBase + 1).getImm() != 1))
+ (Inst.getOperand(AddrBase + X86::AddrBaseReg).getReg() != 0 ||
+ Inst.getOperand(AddrBase + X86::AddrScaleAmt).getImm() != 1 ||
+ Inst.getOperand(AddrBase + X86::AddrIndexReg).getReg() != 0))
return;
// If so, rewrite the instruction.
MCOperand Saved = Inst.getOperand(AddrOp);
+ MCOperand Seg = Inst.getOperand(AddrBase + X86::AddrSegmentReg);
Inst = MCInst();
Inst.setOpcode(Opcode);
Inst.addOperand(Saved);
+ Inst.addOperand(Seg);
+}
+
+static unsigned getRetOpcode(const X86Subtarget &Subtarget)
+{
+ return Subtarget.is64Bit() ? X86::RETQ : X86::RETL;
}
void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
@@ -390,7 +396,6 @@ ReSimplify:
assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
"LEA has segment specified!");
break;
- case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
case X86::MOV32ri64:
OutMI.setOpcode(X86::MOV32ri);
@@ -464,7 +469,7 @@ ReSimplify:
case X86::EH_RETURN:
case X86::EH_RETURN64: {
OutMI = MCInst();
- OutMI.setOpcode(X86::RET);
+ OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
break;
}
@@ -598,7 +603,8 @@ ReSimplify:
static void LowerTlsAddr(MCStreamer &OutStreamer,
X86MCInstLower &MCInstLowering,
- const MachineInstr &MI) {
+ const MachineInstr &MI,
+ const MCSubtargetInfo& STI) {
bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
MI.getOpcode() == X86::TLS_base_addr64;
@@ -608,7 +614,7 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
MCContext &context = OutStreamer.getContext();
if (needsPadding)
- OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI);
MCSymbolRefExpr::VariantKind SRVK;
switch (MI.getOpcode()) {
@@ -655,12 +661,12 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp
LEA.addOperand(MCOperand::CreateReg(0)); // seg
}
- OutStreamer.EmitInstruction(LEA);
+ OutStreamer.EmitInstruction(LEA, STI);
if (needsPadding) {
- OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX));
- OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX));
- OutStreamer.EmitInstruction(MCInstBuilder(X86::REX64_PREFIX));
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI);
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI);
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::REX64_PREFIX), STI);
}
StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
@@ -672,114 +678,79 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
OutStreamer.EmitInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
: X86::CALLpcrel32)
- .addExpr(tlsRef));
-}
-
-static std::pair<StackMaps::Location, MachineInstr::const_mop_iterator>
-parseMemoryOperand(StackMaps::Location::LocationType LocTy, unsigned Size,
- MachineInstr::const_mop_iterator MOI,
- MachineInstr::const_mop_iterator MOE) {
-
- typedef StackMaps::Location Location;
-
- assert(std::distance(MOI, MOE) >= 5 && "Too few operands to encode mem op.");
-
- const MachineOperand &Base = *MOI;
- const MachineOperand &Scale = *(++MOI);
- const MachineOperand &Index = *(++MOI);
- const MachineOperand &Disp = *(++MOI);
- const MachineOperand &ZeroReg = *(++MOI);
-
- // Sanity check for supported operand format.
- assert(Base.isReg() &&
- Scale.isImm() && Scale.getImm() == 1 &&
- Index.isReg() && Index.getReg() == 0 &&
- Disp.isImm() && ZeroReg.isReg() && (ZeroReg.getReg() == 0) &&
- "Unsupported x86 memory operand sequence.");
- (void)Scale;
- (void)Index;
- (void)ZeroReg;
-
- return std::make_pair(
- Location(LocTy, Size, Base.getReg(), Disp.getImm()), ++MOI);
+ .addExpr(tlsRef), STI);
}
-std::pair<StackMaps::Location, MachineInstr::const_mop_iterator>
-X86AsmPrinter::stackmapOperandParser(MachineInstr::const_mop_iterator MOI,
- MachineInstr::const_mop_iterator MOE,
- const TargetMachine &TM) {
-
- typedef StackMaps::Location Location;
-
- const MachineOperand &MOP = *MOI;
- assert(!MOP.isRegMask() && (!MOP.isReg() || !MOP.isImplicit()) &&
- "Register mask and implicit operands should not be processed.");
-
- if (MOP.isImm()) {
- // Verify anyregcc
- // [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
-
- switch (MOP.getImm()) {
- default: llvm_unreachable("Unrecognized operand type.");
- case StackMaps::DirectMemRefOp: {
- unsigned Size = TM.getDataLayout()->getPointerSizeInBits();
- assert((Size % 8) == 0 && "Need pointer size in bytes.");
- Size /= 8;
- return parseMemoryOperand(StackMaps::Location::Direct, Size,
- llvm::next(MOI), MOE);
+/// \brief Emit the optimal amount of multi-byte nops on X86.
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, const MCSubtargetInfo &STI) {
+ // This works only for 64-bit. For 32-bit we would have to do additional
+ // checking that the CPU supports multi-byte nops.
+ assert(Is64Bit && "EmitNops only supports X86-64");
+ while (NumBytes) {
+ unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
+ Opc = IndexReg = Displacement = SegmentReg = 0;
+ BaseReg = X86::RAX; ScaleVal = 1;
+ switch (NumBytes) {
+ case 0: llvm_unreachable("Zero nops?"); break;
+ case 1: NumBytes -= 1; Opc = X86::NOOP; break;
+ case 2: NumBytes -= 2; Opc = X86::XCHG16ar; break;
+ case 3: NumBytes -= 3; Opc = X86::NOOPL; break;
+ case 4: NumBytes -= 4; Opc = X86::NOOPL; Displacement = 8; break;
+ case 5: NumBytes -= 5; Opc = X86::NOOPL; Displacement = 8;
+ IndexReg = X86::RAX; break;
+ case 6: NumBytes -= 6; Opc = X86::NOOPW; Displacement = 8;
+ IndexReg = X86::RAX; break;
+ case 7: NumBytes -= 7; Opc = X86::NOOPL; Displacement = 512; break;
+ case 8: NumBytes -= 8; Opc = X86::NOOPL; Displacement = 512;
+ IndexReg = X86::RAX; break;
+ case 9: NumBytes -= 9; Opc = X86::NOOPW; Displacement = 512;
+ IndexReg = X86::RAX; break;
+ default: NumBytes -= 10; Opc = X86::NOOPW; Displacement = 512;
+ IndexReg = X86::RAX; SegmentReg = X86::CS; break;
}
- case StackMaps::IndirectMemRefOp: {
- ++MOI;
- int64_t Size = MOI->getImm();
- assert(Size > 0 && "Need a valid size for indirect memory locations.");
- return parseMemoryOperand(StackMaps::Location::Indirect, Size,
- llvm::next(MOI), MOE);
- }
- case StackMaps::ConstantOp: {
- ++MOI;
- assert(MOI->isImm() && "Expected constant operand.");
- int64_t Imm = MOI->getImm();
- return std::make_pair(
- Location(Location::Constant, sizeof(int64_t), 0, Imm), ++MOI);
- }
- }
- }
- // Otherwise this is a reg operand. The physical register number will
- // ultimately be encoded as a DWARF regno. The stack map also records the size
- // of a spill slot that can hold the register content. (The runtime can
- // track the actual size of the data type if it needs to.)
- assert(MOP.isReg() && "Expected register operand here.");
- assert(TargetRegisterInfo::isPhysicalRegister(MOP.getReg()) &&
- "Virtreg operands should have been rewritten before now.");
- const TargetRegisterClass *RC =
- TM.getRegisterInfo()->getMinimalPhysRegClass(MOP.getReg());
- assert(!MOP.getSubReg() && "Physical subreg still around.");
- return std::make_pair(
- Location(Location::Register, RC->getSize(), MOP.getReg(), 0), ++MOI);
+ unsigned NumPrefixes = std::min(NumBytes, 5U);
+ NumBytes -= NumPrefixes;
+ for (unsigned i = 0; i != NumPrefixes; ++i)
+ OS.EmitBytes("\x66");
+
+ switch (Opc) {
+ default: llvm_unreachable("Unexpected opcode"); break;
+ case X86::NOOP:
+ OS.EmitInstruction(MCInstBuilder(Opc), STI);
+ break;
+ case X86::XCHG16ar:
+ OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI);
+ break;
+ case X86::NOOPL:
+ case X86::NOOPW:
+ OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg).addImm(ScaleVal)
+ .addReg(IndexReg)
+ .addImm(Displacement)
+ .addReg(SegmentReg), STI);
+ break;
+ }
+ } // while (NumBytes)
}
// Lower a stackmap of the form:
// <id>, <shadowBytes>, ...
-static void LowerSTACKMAP(MCStreamer &OutStreamer,
- StackMaps &SM,
- const MachineInstr &MI)
-{
- unsigned NumNOPBytes = MI.getOperand(1).getImm();
+static void LowerSTACKMAP(MCStreamer &OS, StackMaps &SM,
+ const MachineInstr &MI, bool Is64Bit, const MCSubtargetInfo& STI) {
+ unsigned NumBytes = MI.getOperand(1).getImm();
SM.recordStackMap(MI);
// Emit padding.
// FIXME: These nops ensure that the stackmap's shadow is covered by
// instructions from the same basic block, but the nops should not be
// necessary if instructions from the same block follow the stackmap.
- for (unsigned i = 0; i < NumNOPBytes; ++i)
- OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP));
+ EmitNops(OS, NumBytes, Is64Bit, STI);
}
// Lower a patchpoint of the form:
// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
-static void LowerPATCHPOINT(MCStreamer &OutStreamer,
- StackMaps &SM,
- const MachineInstr &MI) {
+static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM,
+ const MachineInstr &MI, bool Is64Bit, const MCSubtargetInfo& STI) {
+ assert(Is64Bit && "Patchpoint currently only supports X86-64");
SM.recordPatchPoint(MI);
PatchPointOpers opers(&MI);
@@ -789,22 +760,21 @@ static void LowerPATCHPOINT(MCStreamer &OutStreamer,
if (CallTarget) {
// Emit MOV to materialize the target address and the CALL to target.
// This is encoded with 12-13 bytes, depending on which register is used.
- // We conservatively assume that it is 12 bytes and emit in worst case one
- // extra NOP byte.
- EncodedBytes = 12;
- OutStreamer.EmitInstruction(MCInstBuilder(X86::MOV64ri)
- .addReg(MI.getOperand(ScratchIdx).getReg())
- .addImm(CallTarget));
- OutStreamer.EmitInstruction(MCInstBuilder(X86::CALL64r)
- .addReg(MI.getOperand(ScratchIdx).getReg()));
+ unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg();
+ if (X86II::isX86_64ExtendedReg(ScratchReg))
+ EncodedBytes = 13;
+ else
+ EncodedBytes = 12;
+ OS.EmitInstruction(MCInstBuilder(X86::MOV64ri).addReg(ScratchReg)
+ .addImm(CallTarget), STI);
+ OS.EmitInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg), STI);
}
// Emit padding.
unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
assert(NumBytes >= EncodedBytes &&
"Patchpoint can't request size less than the length of a call.");
- for (unsigned i = EncodedBytes; i < NumBytes; ++i)
- OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP));
+ EmitNops(OS, NumBytes - EncodedBytes, Is64Bit, STI);
}
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
@@ -815,8 +785,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Emit nothing here but a comment if we can.
case X86::Int_MemBarrier:
- if (OutStreamer.hasRawTextSupport())
- OutStreamer.EmitRawText(StringRef("\t#MEMBARRIER"));
+ OutStreamer.emitRawComment("MEMBARRIER");
return;
@@ -839,7 +808,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::TLS_addr64:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
- return LowerTlsAddr(OutStreamer, MCInstLowering, *MI);
+ return LowerTlsAddr(OutStreamer, MCInstLowering, *MI, getSubtargetInfo());
case X86::MOVPC32r: {
// This is a pseudo op for a two instruction sequence with a label, which
@@ -852,14 +821,14 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbol *PICBase = MF->getPICBaseSymbol();
// FIXME: We would like an efficient form for this, so we don't have to do a
// lot of extra uniquing.
- OutStreamer.EmitInstruction(MCInstBuilder(X86::CALLpcrel32)
+ EmitToStreamer(OutStreamer, MCInstBuilder(X86::CALLpcrel32)
.addExpr(MCSymbolRefExpr::Create(PICBase, OutContext)));
// Emit the label.
OutStreamer.EmitLabel(PICBase);
// popl $reg
- OutStreamer.EmitInstruction(MCInstBuilder(X86::POP32r)
+ EmitToStreamer(OutStreamer, MCInstBuilder(X86::POP32r)
.addReg(MI->getOperand(0).getReg()));
return;
}
@@ -890,7 +859,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext),
DotExpr, OutContext);
- OutStreamer.EmitInstruction(MCInstBuilder(X86::ADD32ri)
+ EmitToStreamer(OutStreamer, MCInstBuilder(X86::ADD32ri)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addExpr(DotExpr));
@@ -898,19 +867,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
case TargetOpcode::STACKMAP:
- return LowerSTACKMAP(OutStreamer, SM, *MI);
+ return LowerSTACKMAP(OutStreamer, SM, *MI, Subtarget->is64Bit(), getSubtargetInfo());
case TargetOpcode::PATCHPOINT:
- return LowerPATCHPOINT(OutStreamer, SM, *MI);
+ return LowerPATCHPOINT(OutStreamer, SM, *MI, Subtarget->is64Bit(), getSubtargetInfo());
case X86::MORESTACK_RET:
- OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
+ EmitToStreamer(OutStreamer, MCInstBuilder(getRetOpcode(*Subtarget)));
return;
case X86::MORESTACK_RET_RESTORE_R10:
// Return, then restore R10.
- OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
- OutStreamer.EmitInstruction(MCInstBuilder(X86::MOV64rr)
+ EmitToStreamer(OutStreamer, MCInstBuilder(getRetOpcode(*Subtarget)));
+ EmitToStreamer(OutStreamer, MCInstBuilder(X86::MOV64rr)
.addReg(X86::R10)
.addReg(X86::RAX));
return;
@@ -918,5 +887,5 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
}
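
The padding loop in EmitNops above covers a stackmap or patchpoint shadow with as few instructions as possible by emitting the longest encodable nop on each iteration: up to a 10-byte NOOPW plus at most five 0x66 prefixes, i.e. at most 15 bytes per instruction. A minimal standalone sketch of that greedy partition (illustrative only; planNopPadding and the 15-byte cap are assumptions of the sketch, not LLVM API):

#include <algorithm>
#include <vector>

// Greedily split `numBytes` of padding into chunks of at most MaxNop bytes,
// mirroring the loop above that prefers the largest multi-byte nop each time.
std::vector<unsigned> planNopPadding(unsigned numBytes) {
  const unsigned MaxNop = 15; // assumed cap: 10-byte nop + five 0x66 prefixes
  std::vector<unsigned> lengths;
  while (numBytes) {
    unsigned chunk = std::min(numBytes, MaxNop);
    lengths.push_back(chunk);
    numBytes -= chunk;
  }
  return lengths;
}

// For example, a 37-byte shadow would be covered by nops of 15, 15 and 7 bytes.
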
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 83e75ea..746d0d6 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -51,9 +51,9 @@ namespace {
PadShortFunc() : MachineFunctionPass(ID)
, Threshold(4), TM(0), TII(0) {}
- virtual bool runOnMachineFunction(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &MF) override;
- virtual const char *getPassName() const {
+ const char *getPassName() const override {
return "X86 Atom pad short functions";
}
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index dbda556..85aa9b5 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -14,7 +14,6 @@
//===----------------------------------------------------------------------===//
#include "X86RegisterInfo.h"
-#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
@@ -27,7 +26,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
@@ -234,19 +233,25 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
const uint16_t *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+
+ assert(MF && "MachineFunction required");
switch (MF->getFunction()->getCallingConv()) {
case CallingConv::GHC:
case CallingConv::HiPE:
return CSR_NoRegs_SaveList;
-
- case CallingConv::WebKit_JS:
- return CSR_64_SaveList;
case CallingConv::AnyReg:
- return CSR_MostRegs_64_SaveList;
-
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_SaveList;
+ return CSR_64_AllRegs_SaveList;
+ case CallingConv::PreserveMost:
+ return CSR_64_RT_MostRegs_SaveList;
+ case CallingConv::PreserveAll:
+ if (HasAVX)
+ return CSR_64_RT_AllRegs_AVX_SaveList;
+ return CSR_64_RT_AllRegs_SaveList;
case CallingConv::Intel_OCL_BI: {
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
- bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
if (HasAVX512 && IsWin64)
return CSR_Win64_Intel_OCL_BI_AVX512_SaveList;
if (HasAVX512 && Is64Bit)
@@ -259,12 +264,10 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_64_Intel_OCL_BI_SaveList;
break;
}
-
case CallingConv::Cold:
if (Is64Bit)
- return CSR_MostRegs_64_SaveList;
+ return CSR_64_MostRegs_SaveList;
break;
-
default:
break;
}
@@ -287,29 +290,49 @@ X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
- if (CC == CallingConv::Intel_OCL_BI) {
- if (IsWin64 && HasAVX512)
+ switch (CC) {
+ case CallingConv::GHC:
+ case CallingConv::HiPE:
+ return CSR_NoRegs_RegMask;
+ case CallingConv::AnyReg:
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_RegMask;
+ return CSR_64_AllRegs_RegMask;
+ case CallingConv::PreserveMost:
+ return CSR_64_RT_MostRegs_RegMask;
+ case CallingConv::PreserveAll:
+ if (HasAVX)
+ return CSR_64_RT_AllRegs_AVX_RegMask;
+ return CSR_64_RT_AllRegs_RegMask;
+ case CallingConv::Intel_OCL_BI: {
+ if (HasAVX512 && IsWin64)
return CSR_Win64_Intel_OCL_BI_AVX512_RegMask;
- if (Is64Bit && HasAVX512)
+ if (HasAVX512 && Is64Bit)
return CSR_64_Intel_OCL_BI_AVX512_RegMask;
- if (IsWin64 && HasAVX)
+ if (HasAVX && IsWin64)
return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
- if (Is64Bit && HasAVX)
+ if (HasAVX && Is64Bit)
return CSR_64_Intel_OCL_BI_AVX_RegMask;
if (!HasAVX && !IsWin64 && Is64Bit)
return CSR_64_Intel_OCL_BI_RegMask;
+ break;
}
- if (CC == CallingConv::GHC || CC == CallingConv::HiPE)
- return CSR_NoRegs_RegMask;
- if (CC == CallingConv::WebKit_JS || CC == CallingConv::AnyReg)
- return CSR_MostRegs_64_RegMask;
- if (!Is64Bit)
- return CSR_32_RegMask;
- if (CC == CallingConv::Cold)
- return CSR_MostRegs_64_RegMask;
- if (IsWin64)
- return CSR_Win64_RegMask;
- return CSR_64_RegMask;
+ case CallingConv::Cold:
+ if (Is64Bit)
+ return CSR_64_MostRegs_RegMask;
+ break;
+ default:
+ break;
+ }
+
+ // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check
+ // callsEHReturn().
+ if (Is64Bit) {
+ if (IsWin64)
+ return CSR_Win64_RegMask;
+ return CSR_64_RegMask;
+ }
+ return CSR_32_RegMask;
}
const uint32_t*
@@ -403,18 +426,15 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
if (!EnableBasePointer)
return false;
- // When we need stack realignment and there are dynamic allocas, we can't
- // reference off of the stack pointer, so we reserve a base pointer.
- //
- // This is also true if the function contain MS-style inline assembly. We
- // do this because if any stack changes occur in the inline assembly, e.g.,
- // "pusha", then any C local variable or C argument references in the
- // inline assembly will be wrong because the SP is not properly tracked.
- if ((needsStackRealignment(MF) && MFI->hasVarSizedObjects()) ||
- MF.hasMSInlineAsm())
- return true;
-
- return false;
+ // When we need stack realignment, we can't address the stack from the frame
+ // pointer. When we have dynamic allocas or stack-adjusting inline asm, we
+ // can't address variables from the stack pointer. MS inline asm can
+ // reference locals while also adjusting the stack pointer. When we can't
+ // use both the SP and the FP, we need a separate base pointer register.
+ bool CantUseFP = needsStackRealignment(MF);
+ bool CantUseSP =
+ MFI->hasVarSizedObjects() || MFI->hasInlineAsmWithSPAdjust();
+ return CantUseFP && CantUseSP;
}
bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
@@ -499,6 +519,15 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
} else
FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
+ // The frame index format for stackmaps and patchpoints is different from the
+ // X86 format. It only has a FI and an offset.
+ if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
+ assert(BasePtr == FramePtr && "Expected the FP as base register");
+ int64_t Offset = MI.getOperand(FIOperandNum + 1).getImm() + FIOffset;
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ return;
+ }
+
if (MI.getOperand(FIOperandNum+3).isImm()) {
// Offset is a 32-bit integer.
int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
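
The hasBasePointer rewrite above reduces the decision to two independent facts: the frame pointer is unusable when the stack must be realigned, and the stack pointer is unusable when dynamic allocas or SP-adjusting inline asm move it; a base pointer is reserved only when both hold. A compressed sketch of that predicate (the struct and field names below are hypothetical, not LLVM types):

// Illustrative model of the base-pointer decision shown in the hunk above.
struct FrameFacts {
  bool NeedsStackRealignment;    // FP is busy realigning, so it can't address locals
  bool HasVarSizedObjects;       // dynamic allocas move the SP
  bool HasInlineAsmWithSPAdjust; // inline asm (e.g. MS-style "pusha") moves the SP
};

bool needsBasePointer(const FrameFacts &F) {
  bool CantUseFP = F.NeedsStackRealignment;
  bool CantUseSP = F.HasVarSizedObjects || F.HasInlineAsmWithSPAdjust;
  return CantUseFP && CantUseSP;
}
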
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 22251b2..6a71113 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -24,7 +24,7 @@ namespace llvm {
class TargetInstrInfo;
class X86TargetMachine;
-class X86RegisterInfo : public X86GenRegisterInfo {
+class X86RegisterInfo final : public X86GenRegisterInfo {
public:
X86TargetMachine &TM;
@@ -62,66 +62,70 @@ public:
/// getCompactUnwindRegNum - This function maps the register to the number for
/// compact unwind encoding. Return -1 if the register isn't valid.
- int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const;
+ int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const override;
/// Code Generation virtual methods...
///
- virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
/// getMatchingSuperRegClass - Return a subclass of the specified register
/// class A so that each register in it has a sub-register of the
/// specified sub-register index which is in the specified register class B.
- virtual const TargetRegisterClass *
+ const TargetRegisterClass *
getMatchingSuperRegClass(const TargetRegisterClass *A,
- const TargetRegisterClass *B, unsigned Idx) const;
+ const TargetRegisterClass *B,
+ unsigned Idx) const override;
- virtual const TargetRegisterClass *
- getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const;
+ const TargetRegisterClass *
+ getSubClassWithSubReg(const TargetRegisterClass *RC,
+ unsigned Idx) const override;
const TargetRegisterClass*
- getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+ getLargestLegalSuperClass(const TargetRegisterClass *RC) const override;
/// getPointerRegClass - Returns a TargetRegisterClass used for pointer
/// values.
const TargetRegisterClass *
- getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
/// getCrossCopyRegClass - Returns a legal register class to copy a register
/// in the specified class to or from. Returns NULL if it is possible to copy
/// between a two registers of the specified class.
const TargetRegisterClass *
- getCrossCopyRegClass(const TargetRegisterClass *RC) const;
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
- MachineFunction &MF) const;
+ MachineFunction &MF) const override;
/// getCalleeSavedRegs - Return a null-terminated list of all of the
/// callee-save registers on this target.
- const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
- const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+ const uint16_t *
+ getCalleeSavedRegs(const MachineFunction* MF) const override;
+ const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
const uint32_t *getNoPreservedMask() const;
/// getReservedRegs - Returns a bitset indexed by physical register number
/// indicating if a register is a special register that has particular uses and
/// should be considered unavailable at all times, e.g. SP, RA. This is used by
/// register scavenger to determine what registers are free.
- BitVector getReservedRegs(const MachineFunction &MF) const;
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
bool hasBasePointer(const MachineFunction &MF) const;
bool canRealignStack(const MachineFunction &MF) const;
- bool needsStackRealignment(const MachineFunction &MF) const;
+ bool needsStackRealignment(const MachineFunction &MF) const override;
bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
- int &FrameIdx) const;
+ int &FrameIdx) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
- RegScavenger *RS = NULL) const;
+ RegScavenger *RS = NULL) const override;
// Debug information queries.
- unsigned getFrameRegister(const MachineFunction &MF) const;
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
unsigned getStackRegister() const { return StackPtr; }
unsigned getBaseRegister() const { return BasePtr; }
// FIXME: Move to FrameInfok
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index b802728..33c402b 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -463,9 +463,13 @@ def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
256, (sequence "YMM%u", 0, 31)>;
-def VK8 : RegisterClass<"X86", [v8i1], 8, (sequence "K%u", 0, 7)>;
-def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)>;
-
-def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)>;
+// The size of all mask registers is 16 bits, because KMOVW is the only
+// instruction that can store a mask register to memory, and it writes 2 bytes.
+def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)>;
+def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK1)> {let Size = 16;}
+def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
+
+def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;}
+def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;}
def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>;
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 9748261..f5b51ee 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -29,7 +29,7 @@ let SchedModel = HaswellModel in {
// Haswell can issue micro-ops to 8 different ports in one cycle.
-// Ports 0, 1, 5, 6 and 7 handle all computation.
+// Ports 0, 1, 5, and 6 handle all computation.
// Port 4 gets the data half of stores. Store data can be available later than
// the store address, but since we don't model the latency of stores, we can
// ignore that.
@@ -48,8 +48,9 @@ def HWPort7 : ProcResource<1>;
def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>;
def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>;
def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>;
-def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>;
+def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>;
def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>;
+def HWPort16 : ProcResGroup<[HWPort1, HWPort6]>;
def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;
def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>;
@@ -88,6 +89,8 @@ multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
// need an extra port 2/3 cycle to recompute the address.
def : WriteRes<WriteRMW, [HWPort4]>;
+// Store_addr on 237.
+// Store_data on 4.
def : WriteRes<WriteStore, [HWPort237, HWPort4]>;
def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 4; }
def : WriteRes<WriteMove, [HWPort0156]>;
@@ -96,8 +99,8 @@ def : WriteRes<WriteZero, []>;
defm : HWWriteResPair<WriteALU, HWPort0156, 1>;
defm : HWWriteResPair<WriteIMul, HWPort1, 3>;
def : WriteRes<WriteIMulH, []> { let Latency = 3; }
-defm : HWWriteResPair<WriteShift, HWPort056, 1>;
-defm : HWWriteResPair<WriteJump, HWPort5, 1>;
+defm : HWWriteResPair<WriteShift, HWPort06, 1>;
+defm : HWWriteResPair<WriteJump, HWPort06, 1>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
@@ -123,14 +126,136 @@ defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>;
+defm : HWWriteResPair<WriteFShuffle, HWPort5, 1>;
+defm : HWWriteResPair<WriteFBlend, HWPort015, 1>;
+defm : HWWriteResPair<WriteFShuffle256, HWPort5, 3>;
+
+def : WriteRes<WriteFVarBlend, [HWPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteFVarBlendLd, [HWPort5, HWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
// Vector integer operations.
-defm : HWWriteResPair<WriteVecShift, HWPort05, 1>;
+defm : HWWriteResPair<WriteVecShift, HWPort0, 1>;
defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
defm : HWWriteResPair<WriteVecALU, HWPort15, 1>;
defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>;
-defm : HWWriteResPair<WriteShuffle, HWPort15, 1>;
+defm : HWWriteResPair<WriteShuffle, HWPort5, 1>;
+defm : HWWriteResPair<WriteBlend, HWPort15, 1>;
+defm : HWWriteResPair<WriteShuffle256, HWPort5, 3>;
+
+def : WriteRes<WriteVarBlend, [HWPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVarBlendLd, [HWPort5, HWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteVarVecShift, [HWPort0, HWPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteVarVecShiftLd, [HWPort0, HWPort5, HWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1, 1];
+}
+
+def : WriteRes<WriteMPSAD, [HWPort0, HWPort5]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteMPSADLd, [HWPort23, HWPort0, HWPort5]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 2];
+}
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [HWPort0]> {
+ let Latency = 10;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [HWPort0, HWPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort16, HWPort5]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 2, 4];
+}
+def : WriteRes<WritePCmpEStrMLd, [HWPort05, HWPort16, HWPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [6, 2, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [HWPort0]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [HWPort0, HWPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [HWPort05, HWPort16]> {
+ let Latency = 11;
+ let ResourceCycles = [6, 2];
+}
+def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort16, HWPort5, HWPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 2, 2, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [HWPort5]> {
+ let Latency = 7;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [HWPort5, HWPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteAESIMC, [HWPort5]> {
+ let Latency = 14;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [HWPort5, HWPort23]> {
+ let Latency = 14;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [HWPort0, HWPort5]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 8];
+}
+def : WriteRes<WriteAESKeyGenLd, [HWPort0, HWPort5, HWPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 7, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [HWPort0, HWPort5]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteCLMulLd, [HWPort0, HWPort5, HWPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 1, 1];
+}
def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteFence, [HWPort23, HWPort4]>;
+def : WriteRes<WriteNop, []>;
} // SchedModel
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index 3011c6d..a58859a 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -118,6 +118,16 @@ defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>;
+defm : SBWriteResPair<WriteFShuffle, SBPort5, 1>;
+defm : SBWriteResPair<WriteFBlend, SBPort05, 1>;
+def : WriteRes<WriteFVarBlend, [SBPort0, SBPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 1];
+}
// Vector integer operations.
defm : SBWriteResPair<WriteVecShift, SBPort05, 1>;
@@ -125,7 +135,112 @@ defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>;
defm : SBWriteResPair<WriteVecALU, SBPort15, 1>;
defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>;
defm : SBWriteResPair<WriteShuffle, SBPort15, 1>;
+defm : SBWriteResPair<WriteBlend, SBPort15, 1>;
+def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 1];
+}
+def : WriteRes<WriteMPSAD, [SBPort0, SBPort1, SBPort5]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 1];
+}
+def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 1, 1];
+}
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [SBPort015]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [SBPort015, SBPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [SBPort015]> {
+ let Latency = 11;
+ let ResourceCycles = [8];
+}
+def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [7, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [SBPort015]> {
+ let Latency = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [SBPort015, SBPort23]> {
+ let Latency = 3;
+ let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [SBPort015]> {
+ let Latency = 4;
+ let ResourceCycles = [8];
+}
+def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> {
+ let Latency = 4;
+ let ResourceCycles = [7, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESDecEncLd, [SBPort015, SBPort23]> {
+ let Latency = 8;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESIMC, [SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [SBPort015, SBPort23]> {
+ let Latency = 8;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [SBPort015]> {
+ let Latency = 8;
+ let ResourceCycles = [11];
+}
+def : WriteRes<WriteAESKeyGenLd, [SBPort015, SBPort23]> {
+ let Latency = 8;
+ let ResourceCycles = [10, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [SBPort015]> {
+ let Latency = 14;
+ let ResourceCycles = [18];
+}
+def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> {
+ let Latency = 14;
+ let ResourceCycles = [17, 1];
+}
+
def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteFence, [SBPort23, SBPort4]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX2 is not supported on this architecture, but we should define the basic
+// scheduling resources anyway.
+defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>;
+defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>;
+defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>;
} // SchedModel
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 0556437..25c5a6b 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -69,6 +69,9 @@ defm WriteFDiv : X86SchedWritePair; // Floating point division.
defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal.
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
+defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
+defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
// FMA Scheduling helper class.
class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
@@ -77,23 +80,55 @@ class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply.
+defm WriteShuffle : X86SchedWritePair; // Vector shuffles.
+defm WriteBlend : X86SchedWritePair; // Vector blends.
+defm WriteVarBlend : X86SchedWritePair; // Vector variable blends.
+defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD.
// Vector bitwise operations.
// These are often used on both floating point and integer vectors.
defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor.
-defm WriteShuffle : X86SchedWritePair; // Vector shuffles and blends.
// Conversion between integer and float.
defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+// Strings instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm WritePCmpIStrM : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Mask
+defm WritePCmpEStrM : X86SchedWritePair;
+// Packed Compare Implicit Length Strings, Return Index
+defm WritePCmpIStrI : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Index
+defm WritePCmpEStrI : X86SchedWritePair;
+
+// AES instructions.
+defm WriteAESDecEnc : X86SchedWritePair; // Decryption, encryption.
+defm WriteAESIMC : X86SchedWritePair; // InvMixColumn.
+defm WriteAESKeyGen : X86SchedWritePair; // Key Generation.
+
+// Carry-less multiplication instructions.
+defm WriteCLMul : X86SchedWritePair;
+
// Catch-all for expensive system instructions.
def WriteSystem : SchedWrite;
+// AVX2.
+defm WriteFShuffle256 : X86SchedWritePair; // Fp 256-bit width vector shuffles.
+defm WriteShuffle256 : X86SchedWritePair; // 256-bit width vector shuffles.
+defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts.
+
// Old microcoded instructions that nobody use.
def WriteMicrocoded : SchedWrite;
+// Fence instructions.
+def WriteFence : SchedWrite;
+
+// Nop: not very useful except that it provides a model for nops!
+def WriteNop : SchedWrite;
+
//===----------------------------------------------------------------------===//
// Instruction Itinerary classes used for X86
def IIC_ALU_MEM : InstrItinClass;
@@ -577,7 +612,7 @@ def IIC_NOP : InstrItinClass;
//===----------------------------------------------------------------------===//
// Processor instruction itineraries.
-// IssueWidth is analagous to the number of decode units. Core and its
+// IssueWidth is analogous to the number of decode units. Core and its
// descendents, including Nehalem and SandyBridge have 4 decoders.
// Resources beyond the decoder operate on micro-ops and are bufferred
// so adjacent micro-ops don't directly compete.
diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h
index d728af5..0d5dc38 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/lib/Target/X86/X86SelectionDAGInfo.h
@@ -33,22 +33,20 @@ public:
explicit X86SelectionDAGInfo(const X86TargetMachine &TM);
~X86SelectionDAGInfo();
- virtual
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
bool isVolatile,
- MachinePointerInfo DstPtrInfo) const;
+ MachinePointerInfo DstPtrInfo) const override;
- virtual
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const;
+ MachinePointerInfo SrcPtrInfo) const override;
};
}
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 01353b2..207d0ba 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -55,7 +55,7 @@ unsigned char X86Subtarget::
ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
// DLLImport only exists on windows, it is implemented as a load from a
// DLLIMPORT stub.
- if (GV->hasDLLImportLinkage())
+ if (GV->hasDLLImportStorageClass())
return X86II::MO_DLLIMPORT;
// Determine whether this is a reference to a definition or a declaration.
@@ -165,7 +165,10 @@ bool X86Subtarget::hasSinCos() const {
/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
/// to immediate address.
bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
- if (In64BitMode)
+ // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
+ // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does,
+ // the following check for Win32 should be removed.
+ if (In64BitMode || isTargetWin32())
return false;
return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
}
@@ -263,6 +266,15 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
ToggleFeature(X86::FeatureSlowBTMem);
}
+ // Determine if SHLD/SHRD instructions have higher latency than the
+ // equivalent series of shifts/or instructions.
+ // FIXME: Add Intel's processors that have SHLD instructions with very
+ // poor latency.
+ if (IsAMD) {
+ IsSHLDSlow = true;
+ ToggleFeature(X86::FeatureSlowSHLD);
+ }
+
// If it's an Intel chip since Nehalem and not an Atom chip, unaligned
// memory access is fast. We hard code model numbers here because they
// aren't strictly increasing for Intel chips it seems.
@@ -473,6 +485,12 @@ void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
// target data structure which is shared with MC code emitter, etc.
if (In64BitMode)
ToggleFeature(X86::Mode64Bit);
+ else if (In32BitMode)
+ ToggleFeature(X86::Mode32Bit);
+ else if (In16BitMode)
+ ToggleFeature(X86::Mode16Bit);
+ else
+ llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!");
DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
<< ", 3DNowLevel " << X863DNowLevel
@@ -519,6 +537,7 @@ void X86Subtarget::initializeEnvironment() {
HasPRFCHW = false;
HasRDSEED = false;
IsBTMemSlow = false;
+ IsSHLDSlow = false;
IsUAMemFast = false;
HasVectorUAMem = false;
HasCmpxchg16b = false;
@@ -535,13 +554,17 @@ void X86Subtarget::initializeEnvironment() {
X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
const std::string &FS,
- unsigned StackAlignOverride, bool is64Bit)
+ unsigned StackAlignOverride)
: X86GenSubtargetInfo(TT, CPU, FS)
, X86ProcFamily(Others)
, PICStyle(PICStyles::None)
, TargetTriple(TT)
, StackAlignOverride(StackAlignOverride)
- , In64BitMode(is64Bit) {
+ , In64BitMode(TargetTriple.getArch() == Triple::x86_64)
+ , In32BitMode(TargetTriple.getArch() == Triple::x86 &&
+ TargetTriple.getEnvironment() != Triple::CODE16)
+ , In16BitMode(TargetTriple.getArch() == Triple::x86 &&
+ TargetTriple.getEnvironment() == Triple::CODE16) {
initializeEnvironment();
resetSubtargetFeatures(CPU, FS);
}
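
With this change the subtarget no longer takes an is64Bit flag; the 16-, 32- and 64-bit code-generation modes are derived from the target triple alone (x86_64 means 64-bit, x86 with the CODE16 environment means 16-bit, any other x86 triple means 32-bit). A simplified sketch of that classification, using plain strings instead of llvm::Triple (the string spellings are assumptions of the sketch):

#include <string>

enum class X86Mode { Bit16, Bit32, Bit64 };

// Mirrors the constructor logic above: the arch picks 64-bit, and the
// environment distinguishes 16-bit from 32-bit code on plain x86.
X86Mode classifyMode(const std::string &Arch, const std::string &Env) {
  if (Arch == "x86_64")
    return X86Mode::Bit64;
  return Env == "code16" ? X86Mode::Bit16 : X86Mode::Bit32;
}
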
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index dd8c081..52986b9 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -39,7 +39,7 @@ enum Style {
};
}
-class X86Subtarget : public X86GenSubtargetInfo {
+class X86Subtarget final : public X86GenSubtargetInfo {
protected:
enum X86SSEEnum {
NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
@@ -142,6 +142,9 @@ protected:
/// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
bool IsBTMemSlow;
+ /// IsSHLDSlow - True if SHLD instructions are slow.
+ bool IsSHLDSlow;
+
/// IsUAMemFast - True if unaligned memory access is fast.
bool IsUAMemFast;
@@ -202,16 +205,22 @@ private:
/// StackAlignOverride - Override the stack alignment.
unsigned StackAlignOverride;
- /// In64BitMode - True if compiling for 64-bit, false for 32-bit.
+ /// In64BitMode - True if compiling for 64-bit, false for 16-bit or 32-bit.
bool In64BitMode;
+ /// In32BitMode - True if compiling for 32-bit, false for 16-bit or 64-bit.
+ bool In32BitMode;
+
+ /// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit.
+ bool In16BitMode;
+
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
X86Subtarget(const std::string &TT, const std::string &CPU,
const std::string &FS,
- unsigned StackAlignOverride, bool is64Bit);
+ unsigned StackAlignOverride);
/// getStackAlignment - Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
@@ -231,7 +240,7 @@ public:
void AutoDetectSubtargetFeatures();
/// \brief Reset the features for the X86 target.
- virtual void resetSubtargetFeatures(const MachineFunction *MF);
+ void resetSubtargetFeatures(const MachineFunction *MF) override;
private:
void initializeEnvironment();
void resetSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -241,9 +250,18 @@ public:
return In64BitMode;
}
+ bool is32Bit() const {
+ return In32BitMode;
+ }
+
+ bool is16Bit() const {
+ return In16BitMode;
+ }
+
/// Is this x86_64 with the ILP32 programming model (x32 ABI)?
bool isTarget64BitILP32() const {
- return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32);
+ return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 ||
+ TargetTriple.getOS() == Triple::NaCl);
}
/// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
@@ -292,6 +310,7 @@ public:
bool hasPRFCHW() const { return HasPRFCHW; }
bool hasRDSEED() const { return HasRDSEED; }
bool isBTMemSlow() const { return IsBTMemSlow; }
+ bool isSHLDSlow() const { return IsSHLDSlow; }
bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
bool hasVectorUAMem() const { return HasVectorUAMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
@@ -315,23 +334,33 @@ public:
bool isTargetSolaris() const {
return TargetTriple.getOS() == Triple::Solaris;
}
- bool isTargetELF() const {
- return (TargetTriple.getEnvironment() == Triple::ELF ||
- TargetTriple.isOSBinFormatELF());
- }
+
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+ bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+ bool isTargetMacho() const { return TargetTriple.isOSBinFormatMachO(); }
+
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
- bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; }
- bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; }
- bool isTargetCygwin() const { return TargetTriple.getOS() == Triple::Cygwin; }
- bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
- bool isTargetCOFF() const {
- return (TargetTriple.getEnvironment() != Triple::ELF &&
- TargetTriple.isOSBinFormatCOFF());
+
+ bool isTargetWindowsMSVC() const {
+ return TargetTriple.isWindowsMSVCEnvironment();
+ }
+
+ bool isTargetKnownWindowsMSVC() const {
+ return TargetTriple.isKnownWindowsMSVCEnvironment();
+ }
+
+ bool isTargetWindowsCygwin() const {
+ return TargetTriple.isWindowsCygwinEnvironment();
+ }
+
+ bool isTargetWindowsGNU() const {
+ return TargetTriple.isWindowsGNUEnvironment();
}
- bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); }
+
+ bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
bool isOSWindows() const { return TargetTriple.isOSWindows(); }
@@ -340,7 +369,7 @@ public:
}
bool isTargetWin32() const {
- return !In64BitMode && (isTargetCygMing() || isTargetWindows());
+ return !In64BitMode && (isTargetCygMing() || isTargetKnownWindowsMSVC());
}
bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
@@ -391,12 +420,12 @@ public:
bool hasSinCos() const;
/// Enable the MachineScheduler pass for all X86 subtargets.
- bool enableMachineScheduler() const LLVM_OVERRIDE { return true; }
+ bool enableMachineScheduler() const override { return true; }
/// enablePostRAScheduler - run for Atom optimization.
bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
- RegClassVector& CriticalPathRCs) const;
+ RegClassVector& CriticalPathRCs) const override;
bool postRAScheduler() const { return PostRAScheduler; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index ddf580f..6f09ccf 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -13,7 +13,6 @@
#include "X86TargetMachine.h"
#include "X86.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/PassManager.h"
#include "llvm/Support/CommandLine.h"
@@ -24,53 +23,49 @@ using namespace llvm;
extern "C" void LLVMInitializeX86Target() {
// Register the target.
- RegisterTargetMachine<X86_32TargetMachine> X(TheX86_32Target);
- RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target);
+ RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target);
+ RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target);
}
-void X86_32TargetMachine::anchor() { }
-
-X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false),
- DL(getSubtargetImpl()->isTargetDarwin() ?
- "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-"
- "n8:16:32-S128" :
- (getSubtargetImpl()->isTargetCygMing() ||
- getSubtargetImpl()->isTargetWindows()) ?
- "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-"
- "n8:16:32-S32" :
- "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-"
- "n8:16:32-S128"),
- InstrInfo(*this),
- TLInfo(*this),
- TSInfo(*this),
- JITInfo(*this) {
- initAsmInfo();
-}
-
-void X86_64TargetMachine::anchor() { }
-
-X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true),
- // The x32 ABI dictates the ILP32 programming model for x64.
- DL(getSubtargetImpl()->isTarget64BitILP32() ?
- "e-p:32:32-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
- "n8:16:32:64-S128" :
- "e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
- "n8:16:32:64-S128"),
- InstrInfo(*this),
- TLInfo(*this),
- TSInfo(*this),
- JITInfo(*this) {
- initAsmInfo();
+void X86TargetMachine::anchor() { }
+
+static std::string computeDataLayout(const X86Subtarget &ST) {
+ // X86 is little endian
+ std::string Ret = "e";
+
+ Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
+ // X86 and x32 have 32 bit pointers.
+ if (ST.isTarget64BitILP32() || !ST.is64Bit())
+ Ret += "-p:32:32";
+
+ // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
+ if (ST.is64Bit() || ST.isTargetCygMing() || ST.isTargetKnownWindowsMSVC() ||
+ ST.isTargetNaCl())
+ Ret += "-i64:64";
+ else
+ Ret += "-f64:32:64";
+
+ // Some ABIs align long double to 128 bits, others to 32.
+ if (ST.isTargetNaCl())
+ ; // No f80
+ else if (ST.is64Bit() || ST.isTargetDarwin())
+ Ret += "-f80:128";
+ else
+ Ret += "-f80:32";
+
+ // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
+ if (ST.is64Bit())
+ Ret += "-n8:16:32:64";
+ else
+ Ret += "-n8:16:32";
+
+ // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
+ if (!ST.is64Bit() && (ST.isTargetCygMing() || ST.isTargetKnownWindowsMSVC()))
+ Ret += "-S32";
+ else
+ Ret += "-S128";
+
+ return Ret;
}
/// X86TargetMachine ctor - Create an X86 target.
@@ -79,12 +74,16 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool is64Bit)
+ CodeGenOpt::Level OL)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, Options.StackAlignmentOverride, is64Bit),
+ Subtarget(TT, CPU, FS, Options.StackAlignmentOverride),
FrameLowering(*this, Subtarget),
- InstrItins(Subtarget.getInstrItineraryData()){
+ InstrItins(Subtarget.getInstrItineraryData()),
+ DL(computeDataLayout(*getSubtargetImpl())),
+ InstrInfo(*this),
+ TLInfo(*this),
+ TSInfo(*this),
+ JITInfo(*this) {
// Determine the PICStyle based on the target selected.
if (getRelocationModel() == Reloc::Static) {
// Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
@@ -108,6 +107,8 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
// default to hard float ABI
if (Options.FloatABIType == FloatABI::Default)
this->Options.FloatABIType = FloatABI::Hard;
+
+ initAsmInfo();
}
//===----------------------------------------------------------------------===//
@@ -156,11 +157,11 @@ public:
return *getX86TargetMachine().getSubtargetImpl();
}
- virtual bool addInstSelector();
- virtual bool addILPOpts();
- virtual bool addPreRegAlloc();
- virtual bool addPostRegAlloc();
- virtual bool addPreEmitPass();
+ bool addInstSelector() override;
+ bool addILPOpts() override;
+ bool addPreRegAlloc() override;
+ bool addPostRegAlloc() override;
+ bool addPreEmitPass() override;
};
} // namespace
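
For reference, the decision tree in the new computeDataLayout above can be reproduced with a minimal standalone sketch; computeDataLayoutSketch and its boolean parameters are illustrative stand-ins for the X86Subtarget queries, and the mangling component is left out.

#include <iostream>
#include <string>

// Standalone sketch of the data layout decision tree; the booleans stand in
// for the X86Subtarget queries and the mangling component is omitted.
static std::string computeDataLayoutSketch(bool Is64Bit, bool IsILP32,
                                           bool IsDarwin, bool IsWinOrCygMing,
                                           bool IsNaCl) {
  std::string Ret = "e";                        // X86 is little endian
  if (IsILP32 || !Is64Bit)
    Ret += "-p:32:32";                          // 32-bit pointers (x86-32 and x32)
  if (Is64Bit || IsWinOrCygMing || IsNaCl)
    Ret += "-i64:64";                           // 64-bit i64/f64 alignment
  else
    Ret += "-f64:32:64";
  if (!IsNaCl)                                  // NaCl has no f80
    Ret += (Is64Bit || IsDarwin) ? "-f80:128" : "-f80:32";
  Ret += Is64Bit ? "-n8:16:32:64" : "-n8:16:32";
  Ret += (!Is64Bit && IsWinOrCygMing) ? "-S32" : "-S128";
  return Ret;
}

int main() {
  // A 64-bit ELF-style target and a 32-bit Windows-style target.
  std::cout << computeDataLayoutSketch(true, false, false, false, false) << '\n';
  // Prints: e-i64:64-f80:128-n8:16:32:64-S128
  std::cout << computeDataLayoutSketch(false, false, false, true, false) << '\n';
  // Prints: e-p:32:32-i64:64-f80:32-n8:16:32-S32
}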
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 174d391..57e6eda 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -14,7 +14,6 @@
#ifndef X86TARGETMACHINE_H
#define X86TARGETMACHINE_H
-#include "X86.h"
#include "X86FrameLowering.h"
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
@@ -22,114 +21,60 @@
#include "X86SelectionDAGInfo.h"
#include "X86Subtarget.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
class StringRef;
-class X86TargetMachine : public LLVMTargetMachine {
+class X86TargetMachine final : public LLVMTargetMachine {
+ virtual void anchor();
X86Subtarget Subtarget;
X86FrameLowering FrameLowering;
InstrItineraryData InstrItins;
+ const DataLayout DL; // Calculates type size & alignment
+ X86InstrInfo InstrInfo;
+ X86TargetLowering TLInfo;
+ X86SelectionDAGInfo TSInfo;
+ X86JITInfo JITInfo;
public:
X86TargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool is64Bit);
+ CodeGenOpt::Level OL);
- virtual const X86InstrInfo *getInstrInfo() const {
- llvm_unreachable("getInstrInfo not implemented");
+ const DataLayout *getDataLayout() const override { return &DL; }
+ const X86InstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
}
- virtual const TargetFrameLowering *getFrameLowering() const {
+ const TargetFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
- virtual X86JITInfo *getJITInfo() {
- llvm_unreachable("getJITInfo not implemented");
+ X86JITInfo *getJITInfo() override {
+ return &JITInfo;
}
- virtual const X86Subtarget *getSubtargetImpl() const{ return &Subtarget; }
- virtual const X86TargetLowering *getTargetLowering() const {
- llvm_unreachable("getTargetLowering not implemented");
+ const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; }
+ const X86TargetLowering *getTargetLowering() const override {
+ return &TLInfo;
}
- virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
- llvm_unreachable("getSelectionDAGInfo not implemented");
+ const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
}
- virtual const X86RegisterInfo *getRegisterInfo() const {
+ const X86RegisterInfo *getRegisterInfo() const override {
return &getInstrInfo()->getRegisterInfo();
}
- virtual const InstrItineraryData *getInstrItineraryData() const {
+ const InstrItineraryData *getInstrItineraryData() const override {
return &InstrItins;
}
/// \brief Register X86 analysis passes with a pass manager.
- virtual void addAnalysisPasses(PassManagerBase &PM);
+ void addAnalysisPasses(PassManagerBase &PM) override;
// Set up the pass pipeline.
- virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
-
- virtual bool addCodeEmitter(PassManagerBase &PM,
- JITCodeEmitter &JCE);
-};
-
-/// X86_32TargetMachine - X86 32-bit target machine.
-///
-class X86_32TargetMachine : public X86TargetMachine {
- virtual void anchor();
- const DataLayout DL; // Calculates type size & alignment
- X86InstrInfo InstrInfo;
- X86TargetLowering TLInfo;
- X86SelectionDAGInfo TSInfo;
- X86JITInfo JITInfo;
-public:
- X86_32TargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
- virtual const DataLayout *getDataLayout() const { return &DL; }
- virtual const X86TargetLowering *getTargetLowering() const {
- return &TLInfo;
- }
- virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
- return &TSInfo;
- }
- virtual const X86InstrInfo *getInstrInfo() const {
- return &InstrInfo;
- }
- virtual X86JITInfo *getJITInfo() {
- return &JITInfo;
- }
-};
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
-/// X86_64TargetMachine - X86 64-bit target machine.
-///
-class X86_64TargetMachine : public X86TargetMachine {
- virtual void anchor();
- const DataLayout DL; // Calculates type size & alignment
- X86InstrInfo InstrInfo;
- X86TargetLowering TLInfo;
- X86SelectionDAGInfo TSInfo;
- X86JITInfo JITInfo;
-public:
- X86_64TargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
- virtual const DataLayout *getDataLayout() const { return &DL; }
- virtual const X86TargetLowering *getTargetLowering() const {
- return &TLInfo;
- }
- virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
- return &TSInfo;
- }
- virtual const X86InstrInfo *getInstrInfo() const {
- return &InstrInfo;
- }
- virtual X86JITInfo *getJITInfo() {
- return &JITInfo;
- }
+ bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override;
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 086cd4d..0a88e98 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -8,38 +8,40 @@
//===----------------------------------------------------------------------===//
#include "X86TargetObjectFile.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Operator.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/Dwarf.h"
-#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetLowering.h"
using namespace llvm;
using namespace dwarf;
-const MCExpr *X86_64MachoTargetObjectFile::
-getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
- MachineModuleInfo *MMI, unsigned Encoding,
- MCStreamer &Streamer) const {
+const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+ const TargetMachine &TM, MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const {
// On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
// is an indirect pc-relative reference.
if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
- const MCSymbol *Sym = getSymbol(*Mang, GV);
+ const MCSymbol *Sym = TM.getSymbol(GV, Mang);
const MCExpr *Res =
MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
const MCExpr *Four = MCConstantExpr::Create(4, getContext());
return MCBinaryExpr::CreateAdd(Res, Four, getContext());
}
- return TargetLoweringObjectFileMachO::
- getTTypeGlobalReference(GV, Mang, MMI, Encoding, Streamer);
+ return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+ GV, Encoding, Mang, TM, MMI, Streamer);
}
-MCSymbol *X86_64MachoTargetObjectFile::
-getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
- MachineModuleInfo *MMI) const {
- return getSymbol(*Mang, GV);
+MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol(
+ const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
+ MachineModuleInfo *MMI) const {
+ return TM.getSymbol(GV, Mang);
}
void
@@ -53,3 +55,54 @@ X86LinuxTargetObjectFile::getDebugThreadLocalSymbol(
const MCSymbol *Sym) const {
return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
}
+
+const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
+ const ConstantExpr *CE, Mangler &Mang, const TargetMachine &TM) const {
+  // We are looking for the difference of two symbols; we need a subtraction
+ // operation.
+ const SubOperator *Sub = dyn_cast<SubOperator>(CE);
+ if (!Sub)
+ return 0;
+
+  // Symbols must first be numbers before we can subtract them; we need to see a
+ // ptrtoint on both subtraction operands.
+ const PtrToIntOperator *SubLHS =
+ dyn_cast<PtrToIntOperator>(Sub->getOperand(0));
+ const PtrToIntOperator *SubRHS =
+ dyn_cast<PtrToIntOperator>(Sub->getOperand(1));
+ if (!SubLHS || !SubRHS)
+ return 0;
+
+  // Our symbols should exist in address space zero; cowardly no-op if
+ // otherwise.
+ if (SubLHS->getPointerAddressSpace() != 0 ||
+ SubRHS->getPointerAddressSpace() != 0)
+ return 0;
+
+ // Both ptrtoint instructions must wrap global variables:
+ // - Only global variables are eligible for image relative relocations.
+ // - The subtrahend refers to the special symbol __ImageBase, a global.
+ const GlobalVariable *GVLHS =
+ dyn_cast<GlobalVariable>(SubLHS->getPointerOperand());
+ const GlobalVariable *GVRHS =
+ dyn_cast<GlobalVariable>(SubRHS->getPointerOperand());
+ if (!GVLHS || !GVRHS)
+ return 0;
+
+ // We expect __ImageBase to be a global variable without a section, externally
+ // defined.
+ //
+ // It should look something like this: @__ImageBase = external constant i8
+ if (GVRHS->isThreadLocal() || GVRHS->getName() != "__ImageBase" ||
+ !GVRHS->hasExternalLinkage() || GVRHS->hasInitializer() ||
+ GVRHS->hasSection())
+ return 0;
+
+  // An image-relative, thread-local symbol makes no sense.
+ if (GVLHS->isThreadLocal())
+ return 0;
+
+ return MCSymbolRefExpr::Create(TM.getSymbol(GVLHS, Mang),
+ MCSymbolRefExpr::VK_COFF_IMGREL32,
+ getContext());
+}
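
As a rough illustration of the pattern being matched, the following toy model (the struct names are invented for the example and are not LLVM classes) walks the same checks over a hand-built "ptrtoint(@G) - ptrtoint(@__ImageBase)" expression.

#include <iostream>
#include <string>

// Toy model of the checks above: both subtraction operands are ptrtoint of
// globals in address space 0, the right-hand one being the externally
// defined, section-less, non-TLS __ImageBase symbol.
struct Global {
  std::string Name;
  bool ThreadLocal, ExternallyDefined, HasSection;
  unsigned AddrSpace;
};
struct PtrToInt { const Global *Ptr; };
struct Sub { const PtrToInt *LHS, *RHS; };

static const Global *matchImageRelative(const Sub &S) {
  if (!S.LHS || !S.RHS)
    return nullptr;
  if (S.LHS->Ptr->AddrSpace != 0 || S.RHS->Ptr->AddrSpace != 0)
    return nullptr;
  const Global &Base = *S.RHS->Ptr;
  if (Base.ThreadLocal || Base.Name != "__ImageBase" ||
      !Base.ExternallyDefined || Base.HasSection)
    return nullptr;
  if (S.LHS->Ptr->ThreadLocal)            // image-relative TLS makes no sense
    return nullptr;
  return S.LHS->Ptr;                       // candidate for VK_COFF_IMGREL32
}

int main() {
  Global ImageBase{"__ImageBase", false, true, false, 0};
  Global Table{"MyTable", false, true, false, 0};
  PtrToInt L{&Table}, R{&ImageBase};
  Sub S{&L, &R};
  if (const Global *G = matchImageRelative(S))
    std::cout << "image-relative reference to " << G->Name << '\n';
}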
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 79c861d..a08ed09 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -12,7 +12,6 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -20,25 +19,33 @@ namespace llvm {
/// x86-64.
class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
public:
- virtual const MCExpr *
- getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
- MachineModuleInfo *MMI, unsigned Encoding,
- MCStreamer &Streamer) const;
+ const MCExpr *
+ getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding,
+ Mangler &Mang, const TargetMachine &TM,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
// getCFIPersonalitySymbol - The symbol that gets passed to
// .cfi_personality.
- virtual MCSymbol *
- getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
- MachineModuleInfo *MMI) const;
+ MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI) const override;
};
/// X86LinuxTargetObjectFile - This implementation is used for linux x86
/// and x86-64.
class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
- virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
/// \brief Describe a TLS variable address within debug info.
- virtual const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const;
+ const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+ };
+
+ /// \brief This implementation is used for Windows targets on x86 and x86-64.
+ class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
+ const MCExpr *
+ getExecutableRelativeSymbol(const ConstantExpr *CE, Mangler &Mang,
+ const TargetMachine &TM) const override;
};
} // end namespace llvm
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index f88a666..c04964d 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -17,10 +17,14 @@
#define DEBUG_TYPE "x86tti"
#include "X86.h"
#include "X86TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
using namespace llvm;
// Declare the pass initialization routine locally as target-specific passes
@@ -30,9 +34,20 @@ namespace llvm {
void initializeX86TTIPass(PassRegistry &);
}
+static cl::opt<bool>
+UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
+ cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
+static cl::opt<unsigned>
+PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
+ cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
+static cl::opt<unsigned>
+PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
+ cl::desc("Threshold for taken branches in X86 partial unrolling"),
+ cl::Hidden);
+
namespace {
-class X86TTI : public ImmutablePass, public TargetTransformInfo {
+class X86TTI final : public ImmutablePass, public TargetTransformInfo {
const X86Subtarget *ST;
const X86TargetLowering *TLI;
@@ -46,20 +61,16 @@ public:
}
X86TTI(const X86TargetMachine *TM)
- : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
- TLI(TM->getTargetLowering()) {
+ : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
+ TLI(TM->getTargetLowering()) {
initializeX86TTIPass(*PassRegistry::getPassRegistry());
}
- virtual void initializePass() {
+ void initializePass() override {
pushTTIStack(this);
}
- virtual void finalizePass() {
- popTTIStack();
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
TargetTransformInfo::getAnalysisUsage(AU);
}
@@ -67,7 +78,7 @@ public:
static char ID;
/// Provide necessary pointer adjustments for the two base classes.
- virtual void *getAdjustedAnalysisPointer(const void *ID) {
+ void *getAdjustedAnalysisPointer(const void *ID) override {
if (ID == &TargetTransformInfo::ID)
return (TargetTransformInfo*)this;
return this;
@@ -75,35 +86,43 @@ public:
/// \name Scalar TTI Implementations
/// @{
- virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+ PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+ void getUnrollingPreferences(Loop *L,
+ UnrollingPreferences &UP) const override;
/// @}
/// \name Vector TTI Implementations
/// @{
- virtual unsigned getNumberOfRegisters(bool Vector) const;
- virtual unsigned getRegisterBitWidth(bool Vector) const;
- virtual unsigned getMaximumUnrollFactor() const;
- virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- OperandValueKind,
- OperandValueKind) const;
- virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
- int Index, Type *SubTp) const;
- virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) const;
- virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) const;
- virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) const;
- virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
- unsigned Alignment,
- unsigned AddressSpace) const;
-
- virtual unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex) const;
-
- virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
- bool IsPairwiseForm) const;
+ unsigned getNumberOfRegisters(bool Vector) const override;
+ unsigned getRegisterBitWidth(bool Vector) const override;
+ unsigned getMaximumUnrollFactor() const override;
+ unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
+ OperandValueKind) const override;
+ unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+ int Index, Type *SubTp) const override;
+ unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const override;
+ unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const override;
+ unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const override;
+ unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) const override;
+
+ unsigned getAddressComputationCost(Type *PtrTy,
+ bool IsComplex) const override;
+
+ unsigned getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwiseForm) const override;
+
+ unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+
+ unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) const override;
+ unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty) const override;
/// @}
};
@@ -134,6 +153,93 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
}
+void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
+ if (!UsePartialUnrolling)
+ return;
+ // According to the Intel 64 and IA-32 Architectures Optimization Reference
+ // Manual, Intel Core models and later have a loop stream detector
+ // (and associated uop queue) that can benefit from partial unrolling.
+ // The relevant requirements are:
+ // - The loop must have no more than 4 (8 for Nehalem and later) branches
+ // taken, and none of them may be calls.
+ // - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+ // According to the Software Optimization Guide for AMD Family 15h Processors,
+ // models 30h-4fh (Steamroller and later) have a loop predictor and loop
+ // buffer which can benefit from partial unrolling.
+ // The relevant requirements are:
+ // - The loop must have fewer than 16 branches
+ // - The loop must have less than 40 uops in all executed loop branches
+
+ unsigned MaxBranches, MaxOps;
+ if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
+ MaxBranches = PartialUnrollingMaxBranches;
+ MaxOps = PartialUnrollingThreshold;
+ } else if (ST->isAtom()) {
+ // On the Atom, the throughput for taken branches is 2 cycles. For small
+ // simple loops, expand by a small factor to hide the backedge cost.
+ MaxBranches = 2;
+ MaxOps = 10;
+ } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
+ MaxBranches = 16;
+ MaxOps = 40;
+ } else if (ST->hasFMA4() /* Any other recent AMD */) {
+ return;
+ } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
+ MaxBranches = 8;
+ MaxOps = 28;
+ } else if (ST->hasSSSE3() /* Intel Core */) {
+ MaxBranches = 4;
+ MaxOps = 18;
+ } else {
+ return;
+ }
+
+ // Scan the loop: don't unroll loops with calls, and count the potential
+ // number of taken branches (this is somewhat conservative because we're
+ // counting all block transitions as potential branches while in reality some
+ // of these will become implicit via block placement).
+ unsigned MaxDepth = 0;
+ for (df_iterator<BasicBlock*> DI = df_begin(L->getHeader()),
+ DE = df_end(L->getHeader()); DI != DE;) {
+ if (!L->contains(*DI)) {
+ DI.skipChildren();
+ continue;
+ }
+
+ MaxDepth = std::max(MaxDepth, DI.getPathLength());
+ if (MaxDepth > MaxBranches)
+ return;
+
+ for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
+ if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+ ImmutableCallSite CS(I);
+ if (const Function *F = CS.getCalledFunction()) {
+ if (!isLoweredToCall(F))
+ continue;
+ }
+
+ return;
+ }
+
+ ++DI;
+ }
+
+ // Enable runtime and partial unrolling up to the specified size.
+ UP.Partial = UP.Runtime = true;
+ UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+
+ // Set the maximum count based on the loop depth. The maximum number of
+ // branches taken in a loop (including the backedge) is equal to the maximum
+ // loop depth (the DFS path length from the loop header to any block in the
+ // loop). When the loop is unrolled, this depth (except for the backedge
+ // itself) is multiplied by the unrolling factor. This new unrolled depth
+ // must be less than the target-specific maximum branch count (which limits
+ // the number of taken branches in the uop buffer).
+ if (MaxDepth > 1)
+ UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
+}
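
A quick worked example of the MaxCount cap above (the numbers are illustrative): with the Nehalem-class budget of 8 taken branches and a loop body whose deepest DFS path from the header is 3 blocks, the unroll count is limited to (8-1)/(3-1) = 3.

#include <iostream>

// Worked example of the cap computed above:
//   MaxCount = (MaxBranches - 1) / (MaxDepth - 1)   (integer division)
int main() {
  unsigned MaxBranches = 8; // Nehalem-and-later taken-branch budget
  unsigned MaxDepth = 3;    // e.g. a loop body containing one if/else diamond
  if (MaxDepth > 1)
    std::cout << "MaxCount = " << (MaxBranches - 1) / (MaxDepth - 1) << '\n'; // 3
}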
+
unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
if (Vector && !ST->hasSSE1())
return 0;
@@ -214,6 +320,13 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// Look for AVX2 lowering tricks.
if (ST->hasAVX2()) {
+ if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ // On AVX2, a packed v16i16 shift left by a constant build_vector
+ // is lowered into a vector multiply (vpmullw).
+ return LT.first;
+
int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
if (Idx != -1)
return LT.first * AVX2CostTable[Idx].Cost;
@@ -246,6 +359,20 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * SSE2UniformConstCostTable[Idx].Cost;
}
+ if (ISD == ISD::SHL &&
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
+ EVT VT = LT.second;
+ if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
+ (VT == MVT::v4i32 && ST->hasSSE41()))
+ // Vector shift left by non uniform constant can be lowered
+ // into vector multiply (pmullw/pmulld).
+ return LT.first;
+ if (VT == MVT::v4i32 && ST->hasSSE2())
+ // A vector shift left by non uniform constant is converted
+ // into a vector multiply; the new multiply is eventually
+ // lowered into a sequence of shuffles and 2 x pmuludq.
+ ISD = ISD::MUL;
+ }
static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
@@ -260,6 +387,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized.
{ ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
{ ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized.
+ { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized.
{ ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized.
{ ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized.
@@ -297,6 +425,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v16i16, 4 },
{ ISD::MUL, MVT::v8i32, 4 },
{ ISD::SUB, MVT::v8i32, 4 },
{ ISD::ADD, MVT::v8i32, 4 },
@@ -312,7 +441,15 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// Look for AVX1 lowering tricks.
if (ST->hasAVX() && !ST->hasAVX2()) {
- int Idx = CostTableLookup(AVX1CostTable, ISD, LT.second);
+ EVT VT = LT.second;
+
+ // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
+ // sequence of extract + two vector multiply + insert.
+ if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) &&
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)
+ ISD = ISD::MUL;
+
+ int Idx = CostTableLookup(AVX1CostTable, ISD, VT);
if (Idx != -1)
return LT.first * AVX1CostTable[Idx].Cost;
}
@@ -332,7 +469,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// 2x pmuludq, 2x shuffle.
if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
!ST->hasSSE41())
- return 6;
+ return LT.first * 6;
// Fallback to the default implementation.
return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
@@ -400,16 +537,58 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ AVX2ConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 },
+ };
+
+ static const TypeConversionCostTblEntry<MVT::SimpleValueType>
AVXConversionTbl[] = {
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
@@ -436,17 +615,32 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
-
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
+ // The generic code to compute the scalar overhead is currently broken.
+ // Workaround this limitation by estimating the scalarization overhead
+ // here. We have roughly 10 instructions per scalar element.
+ // Multiply that by the vector width.
+ // FIXME: remove that when PR19268 is fixed.
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
+ // This node is expanded into scalarized operations but BasicTTI is overly
+    // optimistic in estimating its cost. It computes 3 per element (one
+ // vector-extract, one scalar conversion and one vector-insert). The
+ // problem is that the inserts form a read-modify-write chain so latency
+ // should be factored in too. Inflating the cost per element by 1.
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
};
+ if (ST->hasAVX2()) {
+ int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
+ DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+ if (Idx != -1)
+ return AVX2ConversionTbl[Idx].Cost;
+ }
+
if (ST->hasAVX()) {
int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(),
SrcTy.getSimpleVT());
@@ -555,7 +749,7 @@ unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert,
unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace) const {
- // Handle non power of two vectors such as <3 x float>
+ // Handle non-power-of-two vectors such as <3 x float>
if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
unsigned NumElem = VTy->getVectorNumElements();
@@ -570,7 +764,7 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
// Cost = 128 bit store + unpack + 64 bit store.
return 3;
- // Assume that all other non power-of-two numbers are scalarized.
+ // Assume that all other non-power-of-two numbers are scalarized.
if (!isPowerOf2_32(NumElem)) {
unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
VTy->getScalarType(),
@@ -692,3 +886,109 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
}
+unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return ~0U;
+
+ if (Imm == 0)
+ return TCC_Free;
+
+ if (Imm.getBitWidth() <= 64 &&
+ (isInt<32>(Imm.getSExtValue()) || isUInt<32>(Imm.getZExtValue())))
+ return TCC_Basic;
+ else
+ return 2 * TCC_Basic;
+}
+
+unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return ~0U;
+
+ unsigned ImmIdx = ~0U;
+ switch (Opcode) {
+ default: return TCC_Free;
+ case Instruction::GetElementPtr:
+ // Always hoist the base address of a GetElementPtr. This prevents the
+ // creation of new constants for every base constant that gets constant
+ // folded with the offset.
+ if (Idx == 0)
+ return 2 * TCC_Basic;
+ return TCC_Free;
+ case Instruction::Store:
+ ImmIdx = 0;
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ ImmIdx = 1;
+ break;
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ case Instruction::BitCast:
+ case Instruction::PHI:
+ case Instruction::Call:
+ case Instruction::Select:
+ case Instruction::Ret:
+ case Instruction::Load:
+ break;
+ }
+
+ if ((Idx == ImmIdx) &&
+ Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+ return TCC_Free;
+
+ return X86TTI::getIntImmCost(Imm, Ty);
+}
+
+unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return ~0U;
+
+ switch (IID) {
+ default: return TCC_Free;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+ return TCC_Free;
+ break;
+ case Intrinsic::experimental_stackmap:
+ if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TCC_Free;
+ break;
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TCC_Free;
+ break;
+ }
+ return X86TTI::getIntImmCost(Imm, Ty);
+}
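
To make the classification in getIntImmCost concrete, here is a minimal sketch; the TCC_* values and the 32-bit range test are stand-ins for the TargetTransformInfo cost constants and the isInt/isUInt helpers.

#include <cstdint>
#include <iostream>

// Sketch of the immediate-cost buckets above: zero is free, anything that fits
// a 32-bit field costs one basic unit, larger immediates cost two.
enum { TCC_Free = 0, TCC_Basic = 1 };

static unsigned immCost(int64_t Imm) {
  if (Imm == 0)
    return TCC_Free;
  bool FitsIn32 = (Imm >= INT32_MIN && Imm <= INT32_MAX) ||
                  (Imm >= 0 && static_cast<uint64_t>(Imm) <= UINT32_MAX);
  return FitsIn32 ? TCC_Basic : 2 * TCC_Basic;
}

int main() {
  std::cout << immCost(0) << ' '            // 0: free
            << immCost(42) << ' '           // 1: fits a 32-bit field
            << immCost(1LL << 40) << '\n';  // 2: needs a full 64-bit materialization
}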
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 66ae9c2..d4341b9 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -17,6 +17,7 @@
#define DEBUG_TYPE "x86-vzeroupper"
#include "X86.h"
#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -30,73 +31,59 @@ using namespace llvm;
STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
namespace {
- struct VZeroUpperInserter : public MachineFunctionPass {
- static char ID;
- VZeroUpperInserter() : MachineFunctionPass(ID) {}
-
- virtual bool runOnMachineFunction(MachineFunction &MF);
- bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+ class VZeroUpperInserter : public MachineFunctionPass {
+ public:
- virtual const char *getPassName() const { return "X86 vzeroupper inserter";}
+ VZeroUpperInserter() : MachineFunctionPass(ID) {}
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ const char *getPassName() const override {return "X86 vzeroupper inserter";}
private:
- const TargetInstrInfo *TII; // Machine instruction info.
-
- // Any YMM register live-in to this function?
- bool FnHasLiveInYmm;
-
- // BBState - Contains the state of each MBB: unknown, clean, dirty
- SmallVector<uint8_t, 8> BBState;
- // BBSolved - Keep track of all MBB which had been already analyzed
- // and there is no further processing required.
- BitVector BBSolved;
-
- // Machine Basic Blocks are classified according this pass:
- //
- // ST_UNKNOWN - The MBB state is unknown, meaning from the entry state
- // until the MBB exit there isn't a instruction using YMM to change
- // the state to dirty, or one of the incoming predecessors is unknown
- // and there's not a dirty predecessor between them.
- //
- // ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have
- // instructions using YMM and be marked ST_CLEAN, as long as the state
- // is cleaned by a vzeroupper before any call.
+ void processBasicBlock(MachineBasicBlock &MBB);
+ void insertVZeroUpper(MachineBasicBlock::iterator I,
+ MachineBasicBlock &MBB);
+ void addDirtySuccessor(MachineBasicBlock &MBB);
+
+ typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState;
+ static const char* getBlockExitStateName(BlockExitState ST);
+
+ // Core algorithm state:
+ // BlockState - Each block is either:
+ // - PASS_THROUGH: There are neither YMM dirtying instructions nor
+ // vzeroupper instructions in this block.
+ // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
+ // block that will ensure that YMM is clean on exit.
+ // - EXITS_DIRTY: An instruction in the block dirties YMM and no
+ // subsequent vzeroupper in the block clears it.
//
- // ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a
- // vzeroupper instruction.
+ // AddedToDirtySuccessors - This flag is raised when a block is added to the
+ // DirtySuccessors list to ensure that it's not
+ // added multiple times.
//
- // ST_INIT - Placeholder for an empty state set
- //
- enum {
- ST_UNKNOWN = 0,
- ST_CLEAN = 1,
- ST_DIRTY = 2,
- ST_INIT = 3
+ // FirstUnguardedCall - Records the location of the first unguarded call in
+ // each basic block that may need to be guarded by a
+ // vzeroupper. We won't know whether it actually needs
+ // to be guarded until we discover a predecessor that
+  //                      is EXITS_DIRTY.
+ struct BlockState {
+ BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {}
+ BlockExitState ExitState;
+ bool AddedToDirtySuccessors;
+ MachineBasicBlock::iterator FirstUnguardedCall;
};
+ typedef SmallVector<BlockState, 8> BlockStateMap;
+ typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList;
- // computeState - Given two states, compute the resulting state, in
- // the following way
- //
- // 1) One dirty state yields another dirty state
- // 2) All states must be clean for the result to be clean
- // 3) If none above and one unknown, the result state is also unknown
- //
- static unsigned computeState(unsigned PrevState, unsigned CurState) {
- if (PrevState == ST_INIT)
- return CurState;
-
- if (PrevState == ST_DIRTY || CurState == ST_DIRTY)
- return ST_DIRTY;
-
- if (PrevState == ST_CLEAN && CurState == ST_CLEAN)
- return ST_CLEAN;
-
- return ST_UNKNOWN;
- }
+ BlockStateMap BlockStates;
+ DirtySuccessorsWorkList DirtySuccessors;
+ bool EverMadeChange;
+ const TargetInstrInfo *TII;
+ static char ID;
};
+
char VZeroUpperInserter::ID = 0;
}
@@ -104,29 +91,30 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() {
return new VZeroUpperInserter();
}
-static bool isYmmReg(unsigned Reg) {
- return (Reg >= X86::YMM0 && Reg <= X86::YMM31);
+const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
+ switch (ST) {
+ case PASS_THROUGH: return "Pass-through";
+ case EXITS_DIRTY: return "Exits-dirty";
+ case EXITS_CLEAN: return "Exits-clean";
+ }
+ llvm_unreachable("Invalid block exit state.");
}
-static bool isZmmReg(unsigned Reg) {
- return (Reg >= X86::ZMM0 && Reg <= X86::ZMM31);
+static bool isYmmReg(unsigned Reg) {
+ return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
}
static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
E = MRI.livein_end(); I != E; ++I)
- if (isYmmReg(I->first) || isZmmReg(I->first))
+ if (isYmmReg(I->first))
return true;
return false;
}
static bool clobbersAllYmmRegs(const MachineOperand &MO) {
- for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) {
- if (!MO.clobbersPhysReg(reg))
- return false;
- }
- for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) {
+ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
if (!MO.clobbersPhysReg(reg))
return false;
}
@@ -150,16 +138,13 @@ static bool hasYmmReg(MachineInstr *MI) {
/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
/// instruction.
-static bool clobbersAnyYmmReg(MachineInstr *MI) {
+static bool callClobbersAnyYmmReg(MachineInstr *MI) {
+ assert(MI->isCall() && "Can only be called on call instructions.");
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
if (!MO.isRegMask())
continue;
- for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) {
- if (MO.clobbersPhysReg(reg))
- return true;
- }
- for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) {
+ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
if (MO.clobbersPhysReg(reg))
return true;
}
@@ -167,102 +152,44 @@ static bool clobbersAnyYmmReg(MachineInstr *MI) {
return false;
}
-/// runOnMachineFunction - Loop over all of the basic blocks, inserting
-/// vzero upper instructions before function calls.
-bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
- TII = MF.getTarget().getInstrInfo();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- bool EverMadeChange = false;
-
- // Fast check: if the function doesn't use any ymm registers, we don't need
- // to insert any VZEROUPPER instructions. This is constant-time, so it is
- // cheap in the common case of no ymm use.
- bool YMMUsed = false;
- const TargetRegisterClass *RC = &X86::VR256RegClass;
- for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
- i != e; i++) {
- if (!MRI.reg_nodbg_empty(*i)) {
- YMMUsed = true;
- break;
- }
- }
- if (!YMMUsed)
- return EverMadeChange;
-
- // Pre-compute the existence of any live-in YMM registers to this function
- FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
-
- assert(BBState.empty());
- BBState.resize(MF.getNumBlockIDs(), 0);
- BBSolved.resize(MF.getNumBlockIDs(), 0);
-
- // Each BB state depends on all predecessors, loop over until everything
- // converges. (Once we converge, we can implicitly mark everything that is
- // still ST_UNKNOWN as ST_CLEAN.)
- while (1) {
- bool MadeChange = false;
-
- // Process all basic blocks.
- for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
- MadeChange |= processBasicBlock(MF, *I);
+// Insert a vzeroupper instruction before I.
+void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
+ MachineBasicBlock &MBB) {
+ DebugLoc dl = I->getDebugLoc();
+ BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
+ ++NumVZU;
+ EverMadeChange = true;
+}
- // If this iteration over the code changed anything, keep iterating.
- if (!MadeChange) break;
- EverMadeChange = true;
+// Add MBB to the DirtySuccessors list if it hasn't already been added.
+void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
+ if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
+ DirtySuccessors.push_back(&MBB);
+ BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
}
-
- BBState.clear();
- BBSolved.clear();
- return EverMadeChange;
}
/// processBasicBlock - Loop over all of the instructions in the basic block,
/// inserting vzero upper instructions before function calls.
-bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
- MachineBasicBlock &BB) {
- bool Changed = false;
- unsigned BBNum = BB.getNumber();
-
- // Don't process already solved BBs
- if (BBSolved[BBNum])
- return false; // No changes
-
- // Check the state of all predecessors
- unsigned EntryState = ST_INIT;
- for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(),
- PE = BB.pred_end(); PI != PE; ++PI) {
- EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]);
- if (EntryState == ST_DIRTY)
- break;
- }
+void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
+  // Start by assuming that the block is PASS_THROUGH, which implies no unguarded
+ // calls.
+ BlockExitState CurState = PASS_THROUGH;
+ BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
- // The entry MBB for the function may set the initial state to dirty if
- // the function receives any YMM incoming arguments
- if (&BB == MF.begin()) {
- EntryState = ST_CLEAN;
- if (FnHasLiveInYmm)
- EntryState = ST_DIRTY;
- }
-
- // The current state is initialized according to the predecessors
- unsigned CurState = EntryState;
- bool BBHasCall = false;
-
- for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
- DebugLoc dl = I->getDebugLoc();
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
MachineInstr *MI = I;
-
bool isControlFlow = MI->isCall() || MI->isReturn();
// Shortcut: don't need to check regular instructions in dirty state.
- if (!isControlFlow && CurState == ST_DIRTY)
+ if (!isControlFlow && CurState == EXITS_DIRTY)
continue;
if (hasYmmReg(MI)) {
// We found a ymm-using instruction; this could be an AVX instruction,
// or it could be control flow.
- CurState = ST_DIRTY;
+ CurState = EXITS_DIRTY;
continue;
}
@@ -276,11 +203,9 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
// standard calling convention is not used (RegMask is not used to mark
// registers clobbered and register usage (def/imp-def/use) is well-defined
// and explicitly specified.
- if (MI->isCall() && !clobbersAnyYmmReg(MI))
+ if (MI->isCall() && !callClobbersAnyYmmReg(MI))
continue;
- BBHasCall = true;
-
// The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
// registers. This instruction has zero latency. In addition, the processor
// changes back to Clean state, after which execution of Intel SSE
@@ -289,38 +214,101 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
// execute SSE code.
// FIXME: In some cases, we may want to move the VZEROUPPER into a
// predecessor block.
- if (CurState == ST_DIRTY) {
- // Only insert the VZEROUPPER in case the entry state isn't unknown.
- // When unknown, only compute the information within the block to have
- // it available in the exit if possible, but don't change the block.
- if (EntryState != ST_UNKNOWN) {
- BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER));
- ++NumVZU;
- }
-
+ if (CurState == EXITS_DIRTY) {
// After the inserted VZEROUPPER the state becomes clean again, but
// other YMM may appear before other subsequent calls or even before
// the end of the BB.
- CurState = ST_CLEAN;
+ insertVZeroUpper(I, MBB);
+ CurState = EXITS_CLEAN;
+ } else if (CurState == PASS_THROUGH) {
+ // If this block is currently in pass-through state and we encounter a
+ // call then whether we need a vzeroupper or not depends on whether this
+ // block has successors that exit dirty. Record the location of the call,
+ // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
+ // It will be inserted later if necessary.
+ BlockStates[MBB.getNumber()].FirstUnguardedCall = I;
+ CurState = EXITS_CLEAN;
}
}
- DEBUG(dbgs() << "MBB #" << BBNum
- << ", current state: " << CurState << '\n');
+ DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
+ << getBlockExitStateName(CurState) << '\n');
+
+ if (CurState == EXITS_DIRTY)
+ for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+ SE = MBB.succ_end();
+ SI != SE; ++SI)
+ addDirtySuccessor(**SI);
+
+ BlockStates[MBB.getNumber()].ExitState = CurState;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// vzero upper instructions before function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getTarget().getSubtarget<X86Subtarget>().hasAVX512())
+ return false;
+ TII = MF.getTarget().getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ EverMadeChange = false;
- // A BB can only be considered solved when we both have done all the
- // necessary transformations, and have computed the exit state. This happens
- // in two cases:
- // 1) We know the entry state: this immediately implies the exit state and
- // all the necessary transformations.
- // 2) There are no calls, and and a non-call instruction marks this block:
- // no transformations are necessary, and we know the exit state.
- if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN))
- BBSolved[BBNum] = true;
+ // Fast check: if the function doesn't use any ymm registers, we don't need
+ // to insert any VZEROUPPER instructions. This is constant-time, so it is
+ // cheap in the common case of no ymm use.
+ bool YMMUsed = false;
+ const TargetRegisterClass *RC = &X86::VR256RegClass;
+ for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
+ i != e; i++) {
+ if (!MRI.reg_nodbg_empty(*i)) {
+ YMMUsed = true;
+ break;
+ }
+ }
+ if (!YMMUsed) {
+ return false;
+ }
- if (CurState != BBState[BBNum])
- Changed = true;
+ assert(BlockStates.empty() && DirtySuccessors.empty() &&
+ "X86VZeroUpper state should be clear");
+ BlockStates.resize(MF.getNumBlockIDs());
+
+ // Process all blocks. This will compute block exit states, record the first
+ // unguarded call in each block, and add successors of dirty blocks to the
+ // DirtySuccessors list.
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+ processBasicBlock(*I);
+
+ // If any YMM regs are live in to this function, add the entry block to the
+ // DirtySuccessors list
+ if (checkFnHasLiveInYmm(MRI))
+ addDirtySuccessor(MF.front());
+
+  // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
+ // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
+ // through PASS_THROUGH blocks.
+ while (!DirtySuccessors.empty()) {
+ MachineBasicBlock &MBB = *DirtySuccessors.back();
+ DirtySuccessors.pop_back();
+ BlockState &BBState = BlockStates[MBB.getNumber()];
+
+ // MBB is a successor of a dirty block, so its first call needs to be
+ // guarded.
+ if (BBState.FirstUnguardedCall != MBB.end())
+ insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
+
+ // If this successor was a pass-through block then it is now dirty, and its
+ // successors need to be added to the worklist (if they haven't been
+ // already).
+ if (BBState.ExitState == PASS_THROUGH) {
+ DEBUG(dbgs() << "MBB #" << MBB.getNumber()
+ << " was Pass-through, is now Dirty-out.\n");
+ for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+ SE = MBB.succ_end();
+ SI != SE; ++SI)
+ addDirtySuccessor(**SI);
+ }
+ }
- BBState[BBNum] = CurState;
- return Changed;
+ BlockStates.clear();
+ return EverMadeChange;
}
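
The dataflow in the rewritten pass can be summarized with a small toy model (the three-block CFG and the names are invented for the example): exit states are computed per block, then dirtiness is pushed through PASS_THROUGH blocks on a worklist, guarding the first call of every block reached.

#include <iostream>
#include <vector>

// Toy model of the worklist phase above: block exit states are precomputed,
// then dirtiness is propagated through PASS_THROUGH blocks, guarding the
// first call in every block reachable from a dirty exit.
enum ExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };

struct Block {
  ExitState Exit = PASS_THROUGH;
  bool HasUnguardedCall = false;
  bool AddedToDirtySuccessors = false;
  std::vector<int> Succs;
};

int main() {
  // 0 -> 1 -> 2: block 0 dirties YMM, block 2 contains a call.
  std::vector<Block> B(3);
  B[0].Exit = EXITS_DIRTY;  B[0].Succs = {1};
  B[1].Exit = PASS_THROUGH; B[1].Succs = {2};
  B[2].Exit = EXITS_CLEAN;  B[2].HasUnguardedCall = true;

  std::vector<int> Worklist;
  auto addDirtySuccessor = [&](int I) {
    if (!B[I].AddedToDirtySuccessors) {
      B[I].AddedToDirtySuccessors = true;
      Worklist.push_back(I);
    }
  };
  for (int I = 0; I < (int)B.size(); ++I)
    if (B[I].Exit == EXITS_DIRTY)
      for (int S : B[I].Succs)
        addDirtySuccessor(S);

  while (!Worklist.empty()) {
    int I = Worklist.back();
    Worklist.pop_back();
    if (B[I].HasUnguardedCall)
      std::cout << "insert vzeroupper before first call in block " << I << '\n';
    if (B[I].Exit == PASS_THROUGH)      // dirtiness flows through pass-through blocks
      for (int S : B[I].Succs)
        addDirtySuccessor(S);
  }
}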
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
index 3fa3b34..5ad0754 100644
--- a/lib/Target/XCore/CMakeLists.txt
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -24,10 +24,9 @@ add_llvm_target(XCoreCodeGen
XCoreTargetObjectFile.cpp
XCoreTargetTransformInfo.cpp
XCoreSelectionDAGInfo.cpp
+ XCoreFrameToArgsOffsetElim.cpp
)
-add_dependencies(LLVMXCoreCodeGen XCoreCommonTableGen intrinsics_gen)
-
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
add_subdirectory(TargetInfo)
diff --git a/lib/Target/XCore/Disassembler/CMakeLists.txt b/lib/Target/XCore/Disassembler/CMakeLists.txt
index cdc5d99..1ed10c0 100644
--- a/lib/Target/XCore/Disassembler/CMakeLists.txt
+++ b/lib/Target/XCore/Disassembler/CMakeLists.txt
@@ -1,5 +1,3 @@
add_llvm_library(LLVMXCoreDisassembler
XCoreDisassembler.cpp
)
-
-add_dependencies(LLVMXCoreDisassembler XCoreCommonTableGen)
diff --git a/lib/Target/XCore/InstPrinter/CMakeLists.txt b/lib/Target/XCore/InstPrinter/CMakeLists.txt
index 930e733..53cf84d 100644
--- a/lib/Target/XCore/InstPrinter/CMakeLists.txt
+++ b/lib/Target/XCore/InstPrinter/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMXCoreAsmPrinter
XCoreInstPrinter.cpp
)
-
-add_dependencies(LLVMXCoreAsmPrinter XCoreCommonTableGen)
diff --git a/lib/Target/XCore/LLVMBuild.txt b/lib/Target/XCore/LLVMBuild.txt
index 59e64ad..0504e8a 100644
--- a/lib/Target/XCore/LLVMBuild.txt
+++ b/lib/Target/XCore/LLVMBuild.txt
@@ -29,5 +29,5 @@ has_disassembler = 1
type = Library
name = XCoreCodeGen
parent = XCore
-required_libraries = AsmPrinter CodeGen Core MC SelectionDAG Support Target XCoreDesc XCoreInfo
+required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target TransformUtils XCoreAsmPrinter XCoreDesc XCoreInfo
add_to_library_groups = XCore
diff --git a/lib/Target/XCore/MCTargetDesc/CMakeLists.txt b/lib/Target/XCore/MCTargetDesc/CMakeLists.txt
index 3a3f5b4..a14cf5c 100644
--- a/lib/Target/XCore/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/XCore/MCTargetDesc/CMakeLists.txt
@@ -2,8 +2,3 @@ add_llvm_library(LLVMXCoreDesc
XCoreMCTargetDesc.cpp
XCoreMCAsmInfo.cpp
)
-
-add_dependencies(LLVMXCoreDesc XCoreCommonTableGen)
-
-# Hack: we need to include 'main' target directory to grab private headers
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/XCore/MCTargetDesc/LLVMBuild.txt b/lib/Target/XCore/MCTargetDesc/LLVMBuild.txt
index 8213f9e..6d390d2 100644
--- a/lib/Target/XCore/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/XCore/MCTargetDesc/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = XCoreDesc
parent = XCore
-required_libraries = MC XCoreAsmPrinter XCoreInfo
+required_libraries = MC Support XCoreAsmPrinter XCoreInfo
add_to_library_groups = XCore
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
index 3d1c474..f788c59 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -20,8 +20,7 @@ XCoreMCAsmInfo::XCoreMCAsmInfo(StringRef TT) {
Data64bitsDirective = 0;
ZeroDirective = "\t.space\t";
CommentString = "#";
-
- PrivateGlobalPrefix = ".L";
+
AscizDirective = ".asciiz";
HiddenVisibilityAttr = MCSA_Invalid;
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index 10bb6df..439d0ab 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -14,11 +14,13 @@
#include "XCoreMCTargetDesc.h"
#include "InstPrinter/XCoreInstPrinter.h"
#include "XCoreMCAsmInfo.h"
+#include "XCoreTargetStreamer.h"
#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
@@ -69,6 +71,12 @@ static MCCodeGenInfo *createXCoreMCCodeGenInfo(StringRef TT, Reloc::Model RM,
if (RM == Reloc::Default) {
RM = Reloc::Static;
}
+ if (CM == CodeModel::Default) {
+ CM = CodeModel::Small;
+ }
+ if (CM != CodeModel::Small && CM != CodeModel::Large)
+ report_fatal_error("Target only supports CodeModel Small or Large");
+
X->InitMCCodeGenInfo(RM, CM, OL);
return X;
}
@@ -82,6 +90,54 @@ static MCInstPrinter *createXCoreMCInstPrinter(const Target &T,
return new XCoreInstPrinter(MAI, MII, MRI);
}
+XCoreTargetStreamer::XCoreTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+XCoreTargetStreamer::~XCoreTargetStreamer() {}
+
+namespace {
+
+class XCoreTargetAsmStreamer : public XCoreTargetStreamer {
+ formatted_raw_ostream &OS;
+public:
+ XCoreTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+ virtual void emitCCTopData(StringRef Name) override;
+ virtual void emitCCTopFunction(StringRef Name) override;
+ virtual void emitCCBottomData(StringRef Name) override;
+ virtual void emitCCBottomFunction(StringRef Name) override;
+};
+
+XCoreTargetAsmStreamer::XCoreTargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS)
+ : XCoreTargetStreamer(S), OS(OS) {}
+
+void XCoreTargetAsmStreamer::emitCCTopData(StringRef Name) {
+ OS << "\t.cc_top " << Name << ".data," << Name << '\n';
+}
+
+void XCoreTargetAsmStreamer::emitCCTopFunction(StringRef Name) {
+ OS << "\t.cc_top " << Name << ".function," << Name << '\n';
+}
+
+void XCoreTargetAsmStreamer::emitCCBottomData(StringRef Name) {
+ OS << "\t.cc_bottom " << Name << ".data\n";
+}
+
+void XCoreTargetAsmStreamer::emitCCBottomFunction(StringRef Name) {
+ OS << "\t.cc_bottom " << Name << ".function\n";
+}
+}
+
+static MCStreamer *
+createXCoreMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+ bool isVerboseAsm, bool useCFI, bool useDwarfDirectory,
+ MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+ MCAsmBackend *TAB, bool ShowInst) {
+ MCStreamer *S =
+ llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory,
+ InstPrint, CE, TAB, ShowInst);
+ new XCoreTargetAsmStreamer(*S, OS);
+ return S;
+}
+
// Force static initialization.
extern "C" void LLVMInitializeXCoreTargetMC() {
// Register the MC asm info.
@@ -104,4 +160,6 @@ extern "C" void LLVMInitializeXCoreTargetMC() {
// Register the MCInstPrinter
TargetRegistry::RegisterMCInstPrinter(TheXCoreTarget,
createXCoreMCInstPrinter);
+
+ TargetRegistry::RegisterAsmStreamer(TheXCoreTarget, createXCoreMCAsmStreamer);
}
diff --git a/lib/Target/XCore/TargetInfo/CMakeLists.txt b/lib/Target/XCore/TargetInfo/CMakeLists.txt
index 2c34b87..462f2d4 100644
--- a/lib/Target/XCore/TargetInfo/CMakeLists.txt
+++ b/lib/Target/XCore/TargetInfo/CMakeLists.txt
@@ -1,7 +1,3 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
add_llvm_library(LLVMXCoreInfo
XCoreTargetInfo.cpp
)
-
-add_dependencies(LLVMXCoreInfo XCoreCommonTableGen)
diff --git a/lib/Target/XCore/TargetInfo/LLVMBuild.txt b/lib/Target/XCore/TargetInfo/LLVMBuild.txt
index 770ba87..45ff75f 100644
--- a/lib/Target/XCore/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/XCore/TargetInfo/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = XCoreInfo
parent = XCore
-required_libraries = MC Support Target
+required_libraries = Support
add_to_library_groups = XCore
diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h
index 73c310b..d707edc 100644
--- a/lib/Target/XCore/XCore.h
+++ b/lib/Target/XCore/XCore.h
@@ -27,6 +27,7 @@ namespace llvm {
void initializeXCoreLowerThreadLocalPass(PassRegistry &p);
+ FunctionPass *createXCoreFrameToArgsOffsetEliminationPass();
FunctionPass *createXCoreISelDag(XCoreTargetMachine &TM,
CodeGenOpt::Level OptLevel);
ModulePass *createXCoreLowerThreadLocalPass();
diff --git a/lib/Target/XCore/XCore.td b/lib/Target/XCore/XCore.td
index e9a6d88..04a1dd5 100644
--- a/lib/Target/XCore/XCore.td
+++ b/lib/Target/XCore/XCore.td
@@ -41,13 +41,7 @@ def : Proc<"xs1b-generic", []>;
// Declare the target which we are implementing
//===----------------------------------------------------------------------===//
-def XCoreAsmWriter : AsmWriter {
- string AsmWriterClassName = "InstPrinter";
- bit isMCAsmWriter = 1;
-}
-
def XCore : Target {
// Pull in Instruction Info:
let InstructionSet = XCoreInstrInfo;
- let AssemblyWriters = [XCoreAsmWriter];
}
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index c03dfe6..21acedf 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -19,6 +19,7 @@
#include "XCoreMCInstLower.h"
#include "XCoreSubtarget.h"
#include "XCoreTargetMachine.h"
+#include "XCoreTargetStreamer.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/AsmPrinter.h"
@@ -27,20 +28,20 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/DebugInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCExpr.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include <algorithm>
#include <cctype>
@@ -50,6 +51,8 @@ namespace {
class XCoreAsmPrinter : public AsmPrinter {
const XCoreSubtarget &Subtarget;
XCoreMCInstLower MCInstLowering;
+ XCoreTargetStreamer &getTargetStreamer();
+
public:
explicit XCoreAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
: AsmPrinter(TM, Streamer), Subtarget(TM.getSubtarget<XCoreSubtarget>()),
@@ -68,6 +71,9 @@ namespace {
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &O);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
void emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV);
virtual void EmitGlobalVariable(const GlobalVariable *GV);
@@ -79,10 +85,14 @@ namespace {
};
} // end of anonymous namespace
+XCoreTargetStreamer &XCoreAsmPrinter::getTargetStreamer() {
+ return static_cast<XCoreTargetStreamer&>(*OutStreamer.getTargetStreamer());
+}
+
void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
- assert(((GV->hasExternalLinkage() ||
- GV->hasWeakLinkage()) ||
- GV->hasLinkOnceLinkage()) && "Unexpected linkage");
+ assert( ( GV->hasExternalLinkage() || GV->hasWeakLinkage() ||
+ GV->hasLinkOnceLinkage() || GV->hasCommonLinkage() ) &&
+ "Unexpected linkage");
if (ArrayType *ATy = dyn_cast<ArrayType>(
cast<PointerType>(GV->getType())->getElementType())) {
@@ -92,7 +102,8 @@ void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
OutStreamer.EmitAssignment(SymGlob,
MCConstantExpr::Create(ATy->getNumElements(),
OutContext));
- if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage()) {
+ if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+ GV->hasCommonLinkage()) {
// TODO Use COMDAT groups for LinkOnceLinkage
OutStreamer.EmitSymbolAttribute(SymGlob, MCSA_Weak);
}
@@ -106,16 +117,15 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
return;
const DataLayout *TD = TM.getDataLayout();
- OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GV, Mang,TM));
+ OutStreamer.SwitchSection(
+ getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
-
MCSymbol *GVSym = getSymbol(GV);
const Constant *C = GV->getInitializer();
unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType());
// Mark the start of the global
- OutStreamer.EmitRawText("\t.cc_top " + Twine(GVSym->getName()) + ".data," +
- GVSym->getName());
+ getTargetStreamer().emitCCTopData(GVSym->getName());
switch (GV->getLinkage()) {
case GlobalValue::AppendingLinkage:
@@ -125,20 +135,18 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
case GlobalValue::WeakAnyLinkage:
case GlobalValue::WeakODRLinkage:
case GlobalValue::ExternalLinkage:
+ case GlobalValue::CommonLinkage:
emitArrayBound(GVSym, GV);
OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
// TODO Use COMDAT groups for LinkOnceLinkage
- if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage())
+ if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+ GV->hasCommonLinkage())
OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Weak);
// FALL THROUGH
case GlobalValue::InternalLinkage:
case GlobalValue::PrivateLinkage:
break;
- case GlobalValue::DLLImportLinkage:
- llvm_unreachable("DLLImport linkage is not supported by this target!");
- case GlobalValue::DLLExportLinkage:
- llvm_unreachable("DLLExport linkage is not supported by this target!");
default:
llvm_unreachable("Unknown linkage type!");
}
@@ -151,8 +159,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
unsigned Size = TD->getTypeAllocSize(C->getType());
if (MAI->hasDotTypeDotSizeDirective()) {
OutStreamer.EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
- OutStreamer.EmitRawText("\t.size " + Twine(GVSym->getName()) + "," +
- Twine(Size));
+ OutStreamer.EmitELFSize(GVSym, MCConstantExpr::Create(Size, OutContext));
}
OutStreamer.EmitLabel(GVSym);
@@ -163,7 +170,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
OutStreamer.EmitZeros(4 - Size);
// Mark the end of the global
- OutStreamer.EmitRawText("\t.cc_bottom " + Twine(GVSym->getName()) + ".data");
+ getTargetStreamer().emitCCBottomData(GVSym->getName());
}
void XCoreAsmPrinter::EmitFunctionBodyStart() {
@@ -174,14 +181,12 @@ void XCoreAsmPrinter::EmitFunctionBodyStart() {
/// the last basic block in the function.
void XCoreAsmPrinter::EmitFunctionBodyEnd() {
// Emit function end directives
- OutStreamer.EmitRawText("\t.cc_bottom " + Twine(CurrentFnSym->getName()) +
- ".function");
+ getTargetStreamer().emitCCBottomFunction(CurrentFnSym->getName());
}
void XCoreAsmPrinter::EmitFunctionEntryLabel() {
// Mark the start of the function
- OutStreamer.EmitRawText("\t.cc_top " + Twine(CurrentFnSym->getName()) +
- ".function," + CurrentFnSym->getName());
+ getTargetStreamer().emitCCTopFunction(CurrentFnSym->getName());
OutStreamer.EmitLabel(CurrentFnSym);
}
@@ -204,6 +209,7 @@ printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O,
void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
raw_ostream &O) {
+ const DataLayout *DL = TM.getDataLayout();
const MachineOperand &MO = MI->getOperand(opNum);
switch (MO.getType()) {
case MachineOperand::MO_Register:
@@ -218,15 +224,8 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
case MachineOperand::MO_GlobalAddress:
O << *getSymbol(MO.getGlobal());
break;
- case MachineOperand::MO_ExternalSymbol:
- O << MO.getSymbolName();
- break;
case MachineOperand::MO_ConstantPoolIndex:
- O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
- << '_' << MO.getIndex();
- break;
- case MachineOperand::MO_JumpTableIndex:
- O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
<< '_' << MO.getIndex();
break;
case MachineOperand::MO_BlockAddress:
@@ -252,6 +251,20 @@ bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
}
+bool XCoreAsmPrinter::
+PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0]) {
+ return true; // Unknown modifier.
+ }
+ printOperand(MI, OpNum, O);
+ O << '[';
+ printOperand(MI, OpNum + 1, O);
+ O << ']';
+ return false;
+}
+
void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallString<128> Str;
raw_svector_ostream O(Str);
@@ -284,7 +297,7 @@ void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
- OutStreamer.EmitInstruction(TmpInst);
+ EmitToStreamer(OutStreamer, TmpInst);
}
// Force static initialization.
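The PrintAsmMemoryOperand override added above prints an inline-asm memory reference as base[index] from two consecutive operands and returns true only for an unsupported modifier. A small stand-alone sketch of that convention, assuming an illustrative Operand type rather than MachineOperand:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct Operand { std::string Text; };  // e.g. "r0", "r11", or an immediate

// Returns false on success, true for an unknown modifier, mirroring the hook.
bool printMemoryOperand(const std::vector<Operand> &Ops, unsigned OpNum,
                        const char *ExtraCode, std::ostream &OS) {
  if (ExtraCode && ExtraCode[0])
    return true;                       // unknown modifier
  OS << Ops[OpNum].Text << '[' << Ops[OpNum + 1].Text << ']';
  return false;
}

int main() {
  std::vector<Operand> Ops = {{"r0"}, {"r1"}};
  std::ostringstream Buf;
  if (!printMemoryOperand(Ops, 0, nullptr, Buf))
    std::cout << Buf.str() << '\n';    // prints: r0[r1]
}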
diff --git a/lib/Target/XCore/XCoreCallingConv.td b/lib/Target/XCore/XCoreCallingConv.td
index b20d71f..e149e6d 100644
--- a/lib/Target/XCore/XCoreCallingConv.td
+++ b/lib/Target/XCore/XCoreCallingConv.td
@@ -14,7 +14,11 @@
//===----------------------------------------------------------------------===//
def RetCC_XCore : CallingConv<[
// i32 are returned in registers R0, R1, R2, R3
- CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+ // Integer values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32], CCAssignToStack<4, 4>>
]>;
//===----------------------------------------------------------------------===//
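With the CCAssignToStack rule added above, RetCC_XCore places the first four i32 return values in R0..R3 and any further values in 4-byte, 4-aligned stack slots; the new memory path in LowerCallResult later reads those slots back. The sketch below is a simplified model of that assignment, not the TableGen-generated code.

#include <cstdio>
#include <vector>

struct Loc { bool InReg; unsigned RegOrOffset; };

// First four i32 results go to R0..R3, the rest to 4-byte aligned stack slots.
std::vector<Loc> assignReturnValues(unsigned NumI32Results) {
  std::vector<Loc> Locs;
  unsigned NextStackOffset = 0;
  for (unsigned i = 0; i < NumI32Results; ++i) {
    if (i < 4) {
      Locs.push_back({true, i});                 // register Ri
    } else {
      Locs.push_back({false, NextStackOffset});  // stack slot at sp+offset
      NextStackOffset += 4;
    }
  }
  return Locs;
}

int main() {
  for (const Loc &L : assignReturnValues(6)) {
    if (L.InReg) std::printf("R%u ", L.RegOrOffset);
    else         std::printf("sp+%u ", L.RegOrOffset);
  }
  std::printf("\n");                             // R0 R1 R2 R3 sp+0 sp+4
}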
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index c34b35c..954fddf 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -25,10 +25,15 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
+#include <algorithm> // std::sort
using namespace llvm;
+static const unsigned FramePtr = XCore::R10;
+static const int MaxImmU16 = (1<<16) - 1;
+
// helper functions. FIXME: Eliminate.
static inline bool isImmU6(unsigned val) {
return val < (1 << 6);
@@ -38,37 +43,164 @@ static inline bool isImmU16(unsigned val) {
return val < (1 << 16);
}
-static void loadFromStack(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DstReg, int Offset, DebugLoc dl,
- const TargetInstrInfo &TII) {
- assert(Offset%4 == 0 && "Misaligned stack offset");
- Offset/=4;
- bool isU6 = isImmU6(Offset);
- if (!isU6 && !isImmU16(Offset))
- report_fatal_error("loadFromStack offset too big " + Twine(Offset));
- int Opcode = isU6 ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
- BuildMI(MBB, I, dl, TII.get(Opcode), DstReg)
- .addImm(Offset);
+// Helper structure with compare function for handling stack slots.
+namespace {
+struct StackSlotInfo {
+ int FI;
+ int Offset;
+ unsigned Reg;
+ StackSlotInfo(int f, int o, int r) : FI(f), Offset(o), Reg(r){};
+};
+} // end anonymous namespace
+
+static bool CompareSSIOffset(const StackSlotInfo& a, const StackSlotInfo& b) {
+ return a.Offset < b.Offset;
+}
+
+
+static void EmitDefCfaRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc dl,
+ const TargetInstrInfo &TII,
+ MachineModuleInfo *MMI, unsigned DRegNum) {
+ unsigned CFIIndex = MMI->addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(nullptr, DRegNum));
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+}
+
+static void EmitDefCfaOffset(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc dl,
+ const TargetInstrInfo &TII,
+ MachineModuleInfo *MMI, int Offset) {
+ unsigned CFIIndex =
+ MMI->addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -Offset));
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+}
+
+static void EmitCfiOffset(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc dl,
+ const TargetInstrInfo &TII, MachineModuleInfo *MMI,
+ unsigned DRegNum, int Offset) {
+ unsigned CFIIndex = MMI->addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, DRegNum, Offset));
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+}
+
+/// The SP register is moved in steps of 'MaxImmU16' towards the bottom of the
+/// frame. During these steps, it may be necessary to spill registers.
+/// IfNeededExtSP emits the necessary EXTSP instructions to move the SP only
+/// as far as to make 'OffsetFromTop' reachable using an STWSP_lru6.
+/// \param OffsetFromTop the spill offset from the top of the frame.
+/// \param [in,out] Adjusted the current SP offset from the top of the frame.
+static void IfNeededExtSP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc dl,
+ const TargetInstrInfo &TII, MachineModuleInfo *MMI,
+ int OffsetFromTop, int &Adjusted, int FrameSize,
+ bool emitFrameMoves) {
+ while (OffsetFromTop > Adjusted) {
+ assert(Adjusted < FrameSize && "OffsetFromTop is beyond FrameSize");
+ int remaining = FrameSize - Adjusted;
+ int OpImm = (remaining > MaxImmU16) ? MaxImmU16 : remaining;
+ int Opcode = isImmU6(OpImm) ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(OpImm);
+ Adjusted += OpImm;
+ if (emitFrameMoves)
+ EmitDefCfaOffset(MBB, MBBI, dl, TII, MMI, Adjusted*4);
+ }
+}
+
+/// The SP register is moved in steps of 'MaxImmU16' towards the top of the
+/// frame. During these steps, it may be necessary to re-load registers.
+/// IfNeededLDAWSP emits the necessary LDAWSP instructions to move the SP only
+/// as far as to make 'OffsetFromTop' reachable using an LDAWSP_lru6.
+/// \param OffsetFromTop the spill offset from the top of the frame.
+/// \param [in,out] RemainingAdj the current SP offset from the top of the frame.
+static void IfNeededLDAWSP(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc dl,
+ const TargetInstrInfo &TII, int OffsetFromTop,
+ int &RemainingAdj) {
+ while (OffsetFromTop < RemainingAdj - MaxImmU16) {
+ assert(RemainingAdj && "OffsetFromTop is beyond FrameSize");
+ int OpImm = (RemainingAdj > MaxImmU16) ? MaxImmU16 : RemainingAdj;
+ int Opcode = isImmU6(OpImm) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode), XCore::SP).addImm(OpImm);
+ RemainingAdj -= OpImm;
+ }
+}
+
+/// Creates an ordered list of registers that are spilled
+/// during the emitPrologue/emitEpilogue.
+/// Registers are ordered according to their frame offset.
+/// As offsets are negative, the largest offsets will be first.
+static void GetSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
+ MachineFrameInfo *MFI, XCoreFunctionInfo *XFI,
+ bool fetchLR, bool fetchFP) {
+ if (fetchLR) {
+ int Offset = MFI->getObjectOffset(XFI->getLRSpillSlot());
+ SpillList.push_back(StackSlotInfo(XFI->getLRSpillSlot(),
+ Offset,
+ XCore::LR));
+ }
+ if (fetchFP) {
+ int Offset = MFI->getObjectOffset(XFI->getFPSpillSlot());
+ SpillList.push_back(StackSlotInfo(XFI->getFPSpillSlot(),
+ Offset,
+ FramePtr));
+ }
+ std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
+}
+
+/// Creates an ordered list of EH info register 'spills'.
+/// These slots are only used by the unwinder and calls to llvm.eh.return().
+/// Registers are ordered according to their frame offset.
+/// As offsets are negative, the largest offsets will be first.
+static void GetEHSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
+ MachineFrameInfo *MFI, XCoreFunctionInfo *XFI,
+ const TargetLowering *TL) {
+ assert(XFI->hasEHSpillSlot() && "There are no EH register spill slots");
+ const int* EHSlot = XFI->getEHSpillSlot();
+ SpillList.push_back(StackSlotInfo(EHSlot[0],
+ MFI->getObjectOffset(EHSlot[0]),
+ TL->getExceptionPointerRegister()));
+ SpillList.push_back(StackSlotInfo(EHSlot[1],
+ MFI->getObjectOffset(EHSlot[1]),
+ TL->getExceptionSelectorRegister()));
+ std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
}
-static void storeToStack(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned SrcReg, int Offset, DebugLoc dl,
- const TargetInstrInfo &TII) {
- assert(Offset%4 == 0 && "Misaligned stack offset");
- Offset/=4;
- bool isU6 = isImmU6(Offset);
- if (!isU6 && !isImmU16(Offset))
- report_fatal_error("storeToStack offset too big " + Twine(Offset));
- int Opcode = isU6 ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
- BuildMI(MBB, I, dl, TII.get(Opcode))
- .addReg(SrcReg)
- .addImm(Offset);
+static MachineMemOperand *
+getFrameIndexMMO(MachineBasicBlock &MBB, int FrameIndex, unsigned flags) {
+ MachineFunction *MF = MBB.getParent();
+ const MachineFrameInfo &MFI = *MF->getFrameInfo();
+ MachineMemOperand *MMO =
+ MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex),
+ flags, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlignment(FrameIndex));
+ return MMO;
}
+/// Restore clobbered registers with their spill slot value.
+/// The SP will be adjusted at the same time, thus the SpillList must be ordered
+/// with the largest (negative) offsets first.
+static void
+RestoreSpillList(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ DebugLoc dl, const TargetInstrInfo &TII, int &RemainingAdj,
+ SmallVectorImpl<StackSlotInfo> &SpillList) {
+ for (unsigned i = 0, e = SpillList.size(); i != e; ++i) {
+ assert(SpillList[i].Offset % 4 == 0 && "Misaligned stack offset");
+ assert(SpillList[i].Offset <= 0 && "Unexpected positive stack offset");
+ int OffsetFromTop = - SpillList[i].Offset/4;
+ IfNeededLDAWSP(MBB, MBBI, dl, TII, OffsetFromTop, RemainingAdj);
+ int Offset = RemainingAdj - OffsetFromTop;
+ int Opcode = isImmU6(Offset) ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode), SpillList[i].Reg)
+ .addImm(Offset)
+ .addMemOperand(getFrameIndexMMO(MBB, SpillList[i].FI,
+ MachineMemOperand::MOLoad));
+ }
+}
+
//===----------------------------------------------------------------------===//
// XCoreFrameLowering:
//===----------------------------------------------------------------------===//
@@ -80,7 +212,7 @@ XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti)
bool XCoreFrameLowering::hasFP(const MachineFunction &MF) const {
return MF.getTarget().Options.DisableFramePointerElim(MF) ||
- MF.getFrameInfo()->hasVarSizedObjects();
+ MF.getFrameInfo()->hasVarSizedObjects();
}
void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
@@ -98,213 +230,216 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
report_fatal_error("emitPrologue unsupported alignment: "
+ Twine(MFI->getMaxAlignment()));
- bool FP = hasFP(MF);
const AttributeSet &PAL = MF.getFunction()->getAttributes();
-
if (PAL.hasAttrSomewhere(Attribute::Nest))
- loadFromStack(MBB, MBBI, XCore::R11, 0, dl, TII);
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::LDWSP_ru6), XCore::R11).addImm(0);
+ // FIX: Needs addMemOperand() but can't use getFixedStack() or getStack().
// Work out frame sizes.
- int FrameSize = MFI->getStackSize();
- assert(FrameSize%4 == 0 && "Misaligned frame size");
- FrameSize/=4;
-
- bool isU6 = isImmU6(FrameSize);
-
- if (!isU6 && !isImmU16(FrameSize)) {
- // FIXME could emit multiple instructions.
- report_fatal_error("emitPrologue Frame size too big: " + Twine(FrameSize));
- }
+ // We will adjust the SP in stages towards the final FrameSize.
+ assert(MFI->getStackSize()%4 == 0 && "Misaligned frame size");
+ const int FrameSize = MFI->getStackSize() / 4;
+ int Adjusted = 0;
+
+ bool saveLR = XFI->hasLRSpillSlot();
+ bool UseENTSP = saveLR && FrameSize
+ && (MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0);
+ if (UseENTSP)
+ saveLR = false;
+ bool FP = hasFP(MF);
bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(MF);
- bool saveLR = XFI->getUsesLR();
- // Do we need to allocate space on the stack?
- if (FrameSize) {
- bool LRSavedOnEntry = false;
- int Opcode;
- if (saveLR && (MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0)) {
- Opcode = (isU6) ? XCore::ENTSP_u6 : XCore::ENTSP_lu6;
- MBB.addLiveIn(XCore::LR);
- saveLR = false;
- LRSavedOnEntry = true;
- } else {
- Opcode = (isU6) ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
- }
- BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
-
+ if (UseENTSP) {
+ // Allocate space on the stack at the same time as saving LR.
+ Adjusted = (FrameSize > MaxImmU16) ? MaxImmU16 : FrameSize;
+ int Opcode = isImmU6(Adjusted) ? XCore::ENTSP_u6 : XCore::ENTSP_lu6;
+ MBB.addLiveIn(XCore::LR);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opcode));
+ MIB.addImm(Adjusted);
+ MIB->addRegisterKilled(XCore::LR, MF.getTarget().getRegisterInfo(), true);
if (emitFrameMoves) {
- // Show update of SP.
- MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel);
- MMI->addFrameInst(MCCFIInstruction::createDefCfaOffset(FrameLabel,
- -FrameSize*4));
- if (LRSavedOnEntry) {
- unsigned Reg = MRI->getDwarfRegNum(XCore::LR, true);
- MMI->addFrameInst(MCCFIInstruction::createOffset(FrameLabel, Reg, 0));
- }
+ EmitDefCfaOffset(MBB, MBBI, dl, TII, MMI, Adjusted*4);
+ unsigned DRegNum = MRI->getDwarfRegNum(XCore::LR, true);
+ EmitCfiOffset(MBB, MBBI, dl, TII, MMI, DRegNum, 0);
}
}
- if (saveLR) {
- int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
- storeToStack(MBB, MBBI, XCore::LR, LRSpillOffset + FrameSize*4, dl, TII);
- MBB.addLiveIn(XCore::LR);
+ // If necessary, save LR and FP to the stack, as we EXTSP.
+ SmallVector<StackSlotInfo,2> SpillList;
+ GetSpillList(SpillList, MFI, XFI, saveLR, FP);
+ // We want the nearest (negative) offsets first, so reverse list.
+ std::reverse(SpillList.begin(), SpillList.end());
+ for (unsigned i = 0, e = SpillList.size(); i != e; ++i) {
+ assert(SpillList[i].Offset % 4 == 0 && "Misaligned stack offset");
+ assert(SpillList[i].Offset <= 0 && "Unexpected positive stack offset");
+ int OffsetFromTop = - SpillList[i].Offset/4;
+ IfNeededExtSP(MBB, MBBI, dl, TII, MMI, OffsetFromTop, Adjusted, FrameSize,
+ emitFrameMoves);
+ int Offset = Adjusted - OffsetFromTop;
+ int Opcode = isImmU6(Offset) ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+ MBB.addLiveIn(SpillList[i].Reg);
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode))
+ .addReg(SpillList[i].Reg, RegState::Kill)
+ .addImm(Offset)
+ .addMemOperand(getFrameIndexMMO(MBB, SpillList[i].FI,
+ MachineMemOperand::MOStore));
if (emitFrameMoves) {
- MCSymbol *SaveLRLabel = MMI->getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveLRLabel);
- unsigned Reg = MRI->getDwarfRegNum(XCore::LR, true);
- MMI->addFrameInst(MCCFIInstruction::createOffset(SaveLRLabel, Reg,
- LRSpillOffset));
+ unsigned DRegNum = MRI->getDwarfRegNum(SpillList[i].Reg, true);
+ EmitCfiOffset(MBB, MBBI, dl, TII, MMI, DRegNum, SpillList[i].Offset);
}
}
+ // Complete any remaining Stack adjustment.
+ IfNeededExtSP(MBB, MBBI, dl, TII, MMI, FrameSize, Adjusted, FrameSize,
+ emitFrameMoves);
+ assert(Adjusted==FrameSize && "IfNeededExtSP has not completed adjustment");
+
if (FP) {
- // Save R10 to the stack.
- int FPSpillOffset = MFI->getObjectOffset(XFI->getFPSpillSlot());
- storeToStack(MBB, MBBI, XCore::R10, FPSpillOffset + FrameSize*4, dl, TII);
- // R10 is live-in. It is killed at the spill.
- MBB.addLiveIn(XCore::R10);
- if (emitFrameMoves) {
- MCSymbol *SaveR10Label = MMI->getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveR10Label);
- unsigned Reg = MRI->getDwarfRegNum(XCore::R10, true);
- MMI->addFrameInst(MCCFIInstruction::createOffset(SaveR10Label, Reg,
- FPSpillOffset));
- }
// Set the FP from the SP.
- unsigned FramePtr = XCore::R10;
BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr).addImm(0);
- if (emitFrameMoves) {
- // Show FP is now valid.
- MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel);
- unsigned Reg = MRI->getDwarfRegNum(FramePtr, true);
- MMI->addFrameInst(MCCFIInstruction::createDefCfaRegister(FrameLabel,
- Reg));
- }
+ if (emitFrameMoves)
+ EmitDefCfaRegister(MBB, MBBI, dl, TII, MMI,
+ MRI->getDwarfRegNum(FramePtr, true));
}
if (emitFrameMoves) {
// Frame moves for callee saved.
- std::vector<std::pair<MCSymbol*, CalleeSavedInfo> >&SpillLabels =
- XFI->getSpillLabels();
+ auto SpillLabels = XFI->getSpillLabels();
for (unsigned I = 0, E = SpillLabels.size(); I != E; ++I) {
- MCSymbol *SpillLabel = SpillLabels[I].first;
+ MachineBasicBlock::iterator Pos = SpillLabels[I].first;
+ ++Pos;
CalleeSavedInfo &CSI = SpillLabels[I].second;
int Offset = MFI->getObjectOffset(CSI.getFrameIdx());
- unsigned Reg = MRI->getDwarfRegNum(CSI.getReg(), true);
- MMI->addFrameInst(MCCFIInstruction::createOffset(SpillLabel, Reg,
- Offset));
+ unsigned DRegNum = MRI->getDwarfRegNum(CSI.getReg(), true);
+ EmitCfiOffset(MBB, Pos, dl, TII, MMI, DRegNum, Offset);
+ }
+ if (XFI->hasEHSpillSlot()) {
+ // The unwinder requires stack slot & CFI offsets for the exception info.
+ // We do not save/spill these registers.
+ SmallVector<StackSlotInfo,2> SpillList;
+ GetEHSpillList(SpillList, MFI, XFI, MF.getTarget().getTargetLowering());
+ assert(SpillList.size()==2 && "Unexpected SpillList size");
+ EmitCfiOffset(MBB, MBBI, dl, TII, MMI,
+ MRI->getDwarfRegNum(SpillList[0].Reg, true),
+ SpillList[0].Offset);
+ EmitCfiOffset(MBB, MBBI, dl, TII, MMI,
+ MRI->getDwarfRegNum(SpillList[1].Reg, true),
+ SpillList[1].Offset);
}
}
}
void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
const XCoreInstrInfo &TII =
*static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
DebugLoc dl = MBBI->getDebugLoc();
-
- bool FP = hasFP(MF);
- if (FP) {
- // Restore the stack pointer.
- unsigned FramePtr = XCore::R10;
- BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r))
- .addReg(FramePtr);
- }
+ unsigned RetOpcode = MBBI->getOpcode();
// Work out frame sizes.
- int FrameSize = MFI->getStackSize();
-
- assert(FrameSize%4 == 0 && "Misaligned frame size");
-
- FrameSize/=4;
-
- bool isU6 = isImmU6(FrameSize);
-
- if (!isU6 && !isImmU16(FrameSize)) {
- // FIXME could emit multiple instructions.
- report_fatal_error("emitEpilogue Frame size too big: " + Twine(FrameSize));
- }
-
- if (FP) {
- // Restore R10
- int FPSpillOffset = MFI->getObjectOffset(XFI->getFPSpillSlot());
- FPSpillOffset += FrameSize*4;
- loadFromStack(MBB, MBBI, XCore::R10, FPSpillOffset, dl, TII);
+ // We will adjust the SP in stages towards the final FrameSize.
+ int RemainingAdj = MFI->getStackSize();
+ assert(RemainingAdj%4 == 0 && "Misaligned frame size");
+ RemainingAdj /= 4;
+
+ if (RetOpcode == XCore::EH_RETURN) {
+ // 'Restore' the exception info the unwinder has placed into the stack slots.
+ SmallVector<StackSlotInfo,2> SpillList;
+ GetEHSpillList(SpillList, MFI, XFI, MF.getTarget().getTargetLowering());
+ RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
+
+ // Return to the landing pad.
+ unsigned EhStackReg = MBBI->getOperand(0).getReg();
+ unsigned EhHandlerReg = MBBI->getOperand(1).getReg();
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r)).addReg(EhStackReg);
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::BAU_1r)).addReg(EhHandlerReg);
+ MBB.erase(MBBI); // Erase the previous return instruction.
+ return;
}
- bool restoreLR = XFI->getUsesLR();
- if (restoreLR &&
- (FrameSize == 0 || MFI->getObjectOffset(XFI->getLRSpillSlot()) != 0)) {
- int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
- LRSpillOffset += FrameSize*4;
- loadFromStack(MBB, MBBI, XCore::LR, LRSpillOffset, dl, TII);
+ bool restoreLR = XFI->hasLRSpillSlot();
+ bool UseRETSP = restoreLR && RemainingAdj
+ && (MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0);
+ if (UseRETSP)
restoreLR = false;
- }
+ bool FP = hasFP(MF);
- if (FrameSize) {
- if (restoreLR) {
+ if (FP) // Restore the stack pointer.
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r)).addReg(FramePtr);
+
+ // If necessary, restore LR and FP from the stack, as we EXTSP.
+ SmallVector<StackSlotInfo,2> SpillList;
+ GetSpillList(SpillList, MFI, XFI, restoreLR, FP);
+ RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
+
+ if (RemainingAdj) {
+ // Complete all but one of the remaining Stack adjustments.
+ IfNeededLDAWSP(MBB, MBBI, dl, TII, 0, RemainingAdj);
+ if (UseRETSP) {
// Fold prologue into return instruction
- assert(MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0);
- assert(MBBI->getOpcode() == XCore::RETSP_u6
- || MBBI->getOpcode() == XCore::RETSP_lu6);
- int Opcode = (isU6) ? XCore::RETSP_u6 : XCore::RETSP_lu6;
- MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
+ assert(RetOpcode == XCore::RETSP_u6
+ || RetOpcode == XCore::RETSP_lu6);
+ int Opcode = isImmU6(RemainingAdj) ? XCore::RETSP_u6 : XCore::RETSP_lu6;
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opcode))
+ .addImm(RemainingAdj);
for (unsigned i = 3, e = MBBI->getNumOperands(); i < e; ++i)
MIB->addOperand(MBBI->getOperand(i)); // copy any variadic operands
- MBB.erase(MBBI);
+ MBB.erase(MBBI); // Erase the previous return instruction.
} else {
- int Opcode = (isU6) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
- BuildMI(MBB, MBBI, dl, TII.get(Opcode), XCore::SP).addImm(FrameSize);
+ int Opcode = isImmU6(RemainingAdj) ? XCore::LDAWSP_ru6 :
+ XCore::LDAWSP_lru6;
+ BuildMI(MBB, MBBI, dl, TII.get(Opcode), XCore::SP).addImm(RemainingAdj);
+ // Don't erase the return instruction.
}
- }
+ } // else Don't erase the return instruction.
}
-bool XCoreFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool XCoreFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return true;
MachineFunction *MF = MBB.getParent();
const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
-
XCoreFunctionInfo *XFI = MF->getInfo<XCoreFunctionInfo>();
bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF);
DebugLoc DL;
- if (MI != MBB.end()) DL = MI->getDebugLoc();
+ if (MI != MBB.end())
+ DL = MI->getDebugLoc();
for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
it != CSI.end(); ++it) {
- // Add the callee-saved register as live-in. It's killed at the spill.
- MBB.addLiveIn(it->getReg());
-
unsigned Reg = it->getReg();
+ assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) &&
+ "LR & FP are always handled in emitPrologue");
+
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(MBB, MI, Reg, true,
- it->getFrameIdx(), RC, TRI);
+ TII.storeRegToStackSlot(MBB, MI, Reg, true, it->getFrameIdx(), RC, TRI);
if (emitFrameMoves) {
- MCSymbol *SaveLabel = MF->getContext().CreateTempSymbol();
- BuildMI(MBB, MI, DL, TII.get(XCore::PROLOG_LABEL)).addSym(SaveLabel);
- XFI->getSpillLabels().push_back(std::make_pair(SaveLabel, *it));
+ auto Store = MI;
+ --Store;
+ XFI->getSpillLabels().push_back(std::make_pair(Store, *it));
}
}
return true;
}
-bool XCoreFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const{
+bool XCoreFrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const{
MachineFunction *MF = MBB.getParent();
const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
-
bool AtStart = MI == MBB.begin();
MachineBasicBlock::iterator BeforeI = MI;
if (!AtStart)
@@ -312,9 +447,11 @@ bool XCoreFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
it != CSI.end(); ++it) {
unsigned Reg = it->getReg();
+ assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) &&
+ "LR & FP are always handled in emitEpilogue");
+
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.loadRegFromStackSlot(MBB, MI, it->getReg(), it->getFrameIdx(),
- RC, TRI);
+ TII.loadRegFromStackSlot(MBB, MI, Reg, it->getFrameIdx(), RC, TRI);
assert(MI != MBB.begin() &&
"loadRegFromStackSlot didn't insert any code!");
// Insert in reverse order. loadRegFromStackSlot can insert multiple
@@ -381,40 +518,58 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MBB.erase(I);
}
-void
-XCoreFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
- MachineFrameInfo *MFI = MF.getFrameInfo();
- const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
- bool LRUsed = MF.getRegInfo().isPhysRegUsed(XCore::LR);
- const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+void XCoreFrameLowering::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+
+ bool LRUsed = MF.getRegInfo().isPhysRegUsed(XCore::LR);
+
+ if (!LRUsed && !MF.getFunction()->isVarArg() &&
+ MF.getFrameInfo()->estimateStackSize(MF))
+ // If we need to extend the stack it is more efficient to use entsp / retsp.
+ // We force the LR to be saved so these instructions are used.
+ LRUsed = true;
+
+ if (MF.getMMI().callsUnwindInit() || MF.getMMI().callsEHReturn()) {
+ // The unwinder expects to find spill slots for the exception info regs R0
+ // & R1. These are used during llvm.eh.return() to 'restore' the exception
+ // info. N.B. we do not spill or restore R0, R1 during normal operation.
+ XFI->createEHSpillSlot(MF);
+ // As we will have a stack, we force the LR to be saved.
+ LRUsed = true;
+ }
+
if (LRUsed) {
+ // We will handle the LR in the prologue/epilogue
+ // and allocate space on the stack ourselves.
MF.getRegInfo().setPhysRegUnused(XCore::LR);
-
- bool isVarArg = MF.getFunction()->isVarArg();
- int FrameIdx;
- if (! isVarArg) {
- // A fixed offset of 0 allows us to save / restore LR using entsp / retsp.
- FrameIdx = MFI->CreateFixedObject(RC->getSize(), 0, true);
- } else {
- FrameIdx = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(),
- false);
- }
- XFI->setUsesLR(FrameIdx);
- XFI->setLRSpillSlot(FrameIdx);
+ XFI->createLRSpillSlot(MF);
}
- if (RegInfo->requiresRegisterScavenging(MF)) {
- // Reserve a slot close to SP or frame pointer.
+
+ if (hasFP(MF))
+ // A callee save register is used to hold the FP.
+ // This needs saving / restoring in the epilogue / prologue.
+ XFI->createFPSpillSlot(MF);
+}
+
+void XCoreFrameLowering::
+processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS) const {
+ assert(RS && "requiresRegisterScavenging failed");
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+ // Reserve slots close to SP or frame pointer for Scavenging spills.
+ // When using SP for small frames, we don't need any scratch registers.
+ // When using SP for large frames, we may need 2 scratch registers.
+ // When using FP, for large or small frames, we may need 1 scratch register.
+ if (XFI->isLargeFrame(MF) || hasFP(MF))
+ RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+ RC->getAlignment(),
+ false));
+ if (XFI->isLargeFrame(MF) && !hasFP(MF))
RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
RC->getAlignment(),
false));
- }
- if (hasFP(MF)) {
- // A callee save register is used to hold the FP.
- // This needs saving / restoring in the epilogue / prologue.
- XFI->setFPSpillSlot(MFI->CreateStackObject(RC->getSize(),
- RC->getAlignment(),
- false));
- }
}
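The rewritten prologue and epilogue above no longer reject frames whose word count exceeds the 16-bit immediate. Instead the SP is moved in stages of at most MaxImmU16 words (EXTSP on entry, LDAWSP or RETSP on exit), with spills and CFI updates interleaved as each region becomes reachable. The sketch below models only the chunking of the adjustment; everything except the MaxImmU16 constant is an illustrative stand-in.

#include <cstdio>
#include <vector>

static const int MaxImmU16 = (1 << 16) - 1;

// Split a frame of FrameSizeInWords words into EXTSP-sized steps.
std::vector<int> extspChunks(int FrameSizeInWords) {
  std::vector<int> Chunks;
  int Adjusted = 0;
  while (Adjusted < FrameSizeInWords) {
    int Remaining = FrameSizeInWords - Adjusted;
    int OpImm = Remaining > MaxImmU16 ? MaxImmU16 : Remaining;
    Chunks.push_back(OpImm);            // one EXTSP u6/lu6 per step
    Adjusted += OpImm;
  }
  return Chunks;
}

int main() {
  for (int Imm : extspChunks(150000))   // a frame too large for a single EXTSP
    std::printf("extsp %d\n", Imm);     // 65535, 65535, 18930
}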
diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index ebad62f..6cd90c9 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h
@@ -48,6 +48,9 @@ namespace llvm {
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS = NULL) const;
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+ RegScavenger *RS = NULL) const;
+
//! Stack slot size (4 bytes)
static int stackSlotSize() {
return 4;
diff --git a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
new file mode 100644
index 0000000..c18eff9
--- /dev/null
+++ b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
@@ -0,0 +1,62 @@
+//===-- XCoreFrameToArgsOffsetElim.cpp ----------------------------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Replace Pseudo FRAME_TO_ARGS_OFFSET with the appropriate real offset.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+namespace {
+ struct XCoreFTAOElim : public MachineFunctionPass {
+ static char ID;
+ XCoreFTAOElim() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+ virtual const char *getPassName() const {
+ return "XCore FRAME_TO_ARGS_OFFSET Elimination";
+ }
+ };
+ char XCoreFTAOElim::ID = 0;
+}
+
+/// createXCoreFrameToArgsOffsetEliminationPass - returns an instance of the
+/// Frame to args offset elimination pass
+FunctionPass *llvm::createXCoreFrameToArgsOffsetEliminationPass() {
+ return new XCoreFTAOElim();
+}
+
+bool XCoreFTAOElim::runOnMachineFunction(MachineFunction &MF) {
+ const XCoreInstrInfo &TII =
+ *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+ unsigned StackSize = MF.getFrameInfo()->getStackSize();
+ for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
+ ++MFI) {
+ MachineBasicBlock &MBB = *MFI;
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), EE = MBB.end();
+ MBBI != EE; ++MBBI) {
+ if (MBBI->getOpcode() == XCore::FRAME_TO_ARGS_OFFSET) {
+ MachineInstr *OldInst = MBBI;
+ unsigned Reg = OldInst->getOperand(0).getReg();
+ MBBI = TII.loadImmediate(MBB, MBBI, Reg, StackSize);
+ OldInst->eraseFromParent();
+ }
+ }
+ }
+ return true;
+}
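The new pass above runs once frame layout is final, so MFI->getStackSize() is known, and rewrites every FRAME_TO_ARGS_OFFSET pseudo into a load-immediate of that size via TII.loadImmediate. A simplified, self-contained model of that rewrite, using stand-in instruction types rather than MachineInstr:

#include <cstdio>
#include <vector>

enum Opcode { LOAD_IMM, FRAME_TO_ARGS_OFFSET, OTHER };
struct Instr { Opcode Op; unsigned DstReg; int Imm; };

// Replace each pseudo with a load-immediate of the final stack size.
void eliminateFrameToArgsOffset(std::vector<Instr> &Body, int StackSize) {
  for (Instr &I : Body)
    if (I.Op == FRAME_TO_ARGS_OFFSET)
      I = {LOAD_IMM, I.DstReg, StackSize};   // keep the destination register
}

int main() {
  std::vector<Instr> Body = {{OTHER, 0, 0}, {FRAME_TO_ARGS_OFFSET, 5, 0}};
  eliminateFrameToArgsOffset(Body, 48);      // stack size known after layout
  std::printf("r%u <- %d\n", Body[1].DstReg, Body[1].Imm);  // r5 <- 48
}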
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index e28f84f..5b0fcfa 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -66,7 +66,10 @@ namespace {
// Complex Pattern Selectors.
bool SelectADDRspii(SDValue Addr, SDValue &Base, SDValue &Offset);
-
+
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+ std::vector<SDValue> &OutOps) override;
+
virtual const char *getPassName() const {
return "XCore DAG->DAG Pattern Instruction Selection";
}
@@ -106,6 +109,28 @@ bool XCoreDAGToDAGISel::SelectADDRspii(SDValue Addr, SDValue &Base,
return false;
}
+bool XCoreDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ SDValue Reg;
+ switch (ConstraintCode) {
+ default: return true;
+ case 'm': // Memory.
+ switch (Op.getOpcode()) {
+ default: return true;
+ case XCoreISD::CPRelativeWrapper:
+ Reg = CurDAG->getRegister(XCore::CP, MVT::i32);
+ break;
+ case XCoreISD::DPRelativeWrapper:
+ Reg = CurDAG->getRegister(XCore::DP, MVT::i32);
+ break;
+ }
+ }
+ OutOps.push_back(Reg);
+ OutOps.push_back(Op.getOperand(0));
+ return false;
+}
+
SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
switch (N->getOpcode()) {
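SelectInlineAsmMemoryOperand, added above, folds an 'm' constraint whose address is a DP- or CP-relative wrapper into an explicit base register plus the wrapped offset, and rejects anything else. A rough stand-alone model of that mapping; the enums below are illustrative, not the XCoreISD opcodes.

#include <cstdio>
#include <utility>

enum AddrKind { DPRelative, CPRelative, Other };
enum BaseReg  { DP, CP, NoReg };

// Returns {base register, ok}; ok == false means the operand is not foldable.
std::pair<BaseReg, bool> selectMemBase(AddrKind Kind) {
  switch (Kind) {
  case DPRelative: return {DP, true};   // data-pointer relative
  case CPRelative: return {CP, true};   // constant-pool relative
  default:         return {NoReg, false};
  }
}

int main() {
  std::pair<BaseReg, bool> R = selectMemBase(DPRelative);
  if (R.second)
    std::printf("base = %s\n", R.first == DP ? "dp" : "cp");
  else
    std::printf("not foldable\n");
}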
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 89ad27d..1b74013 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -28,6 +28,7 @@
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
@@ -49,6 +50,7 @@ getTargetNodeName(unsigned Opcode) const
case XCoreISD::PCRelativeWrapper : return "XCoreISD::PCRelativeWrapper";
case XCoreISD::DPRelativeWrapper : return "XCoreISD::DPRelativeWrapper";
case XCoreISD::CPRelativeWrapper : return "XCoreISD::CPRelativeWrapper";
+ case XCoreISD::LDWSP : return "XCoreISD::LDWSP";
case XCoreISD::STWSP : return "XCoreISD::STWSP";
case XCoreISD::RETSP : return "XCoreISD::RETSP";
case XCoreISD::LADD : return "XCoreISD::LADD";
@@ -59,6 +61,8 @@ getTargetNodeName(unsigned Opcode) const
case XCoreISD::CRC8 : return "XCoreISD::CRC8";
case XCoreISD::BR_JT : return "XCoreISD::BR_JT";
case XCoreISD::BR_JT32 : return "XCoreISD::BR_JT32";
+ case XCoreISD::FRAME_TO_ARGS_OFFSET : return "XCoreISD::FRAME_TO_ARGS_OFFSET";
+ case XCoreISD::EH_RETURN : return "XCoreISD::EH_RETURN";
case XCoreISD::MEMBARRIER : return "XCoreISD::MEMBARRIER";
default : return NULL;
}
@@ -150,11 +154,18 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
// Exception handling
+ setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
setExceptionPointerRegister(XCore::R0);
setExceptionSelectorRegister(XCore::R1);
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
// Atomic operations
+ // We request a fence for ATOMIC_* instructions, to reduce them to Monotonic.
+ // As we are always Sequentially Consistent, an ATOMIC_FENCE becomes a no-op.
+ setInsertFencesForAtomic(true);
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
// TRAMPOLINE is custom lowered.
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
@@ -170,8 +181,11 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setMinFunctionAlignment(1);
+ setPrefFunctionAlignment(2);
}
bool XCoreTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
@@ -196,6 +210,7 @@ SDValue XCoreTargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode())
{
+ case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
@@ -211,10 +226,14 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADD:
case ISD::SUB: return ExpandADDSUB(Op.getNode(), DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAME_TO_ARGS_OFFSET: return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG);
+ case ISD::ATOMIC_LOAD: return LowerATOMIC_LOAD(Op, DAG);
+ case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
default:
llvm_unreachable("unimplemented operand");
}
@@ -258,32 +277,59 @@ getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
const GlobalValue *UnderlyingGV = GV;
// If GV is an alias then use the aliasee to determine the wrapper type
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- UnderlyingGV = GA->resolveAliasedGlobal();
+ UnderlyingGV = GA->getAliasedGlobal();
if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(UnderlyingGV)) {
- if (GVar->isConstant())
+ if ( ( GVar->isConstant() &&
+ UnderlyingGV->isLocalLinkage(GV->getLinkage()) )
+ || ( GVar->hasSection() &&
+ StringRef(GVar->getSection()).startswith(".cp.") ) )
return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA);
return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA);
}
return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA);
}
+static bool IsSmallObject(const GlobalValue *GV, const XCoreTargetLowering &XTL) {
+ if (XTL.getTargetMachine().getCodeModel() == CodeModel::Small)
+ return true;
+
+ Type *ObjType = GV->getType()->getPointerElementType();
+ if (!ObjType->isSized())
+ return false;
+
+ unsigned ObjSize = XTL.getDataLayout()->getTypeAllocSize(ObjType);
+ return ObjSize < CodeModelLargeSize && ObjSize != 0;
+}
+
SDValue XCoreTargetLowering::
LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
{
- SDLoc DL(Op);
const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GN->getGlobal();
+ SDLoc DL(GN);
int64_t Offset = GN->getOffset();
- // We can only fold positive offsets that are a multiple of the word size.
- int64_t FoldedOffset = std::max(Offset & ~3, (int64_t)0);
- SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, FoldedOffset);
- GA = getGlobalAddressWrapper(GA, GV, DAG);
- // Handle the rest of the offset.
- if (Offset != FoldedOffset) {
- SDValue Remaining = DAG.getConstant(Offset - FoldedOffset, MVT::i32);
- GA = DAG.getNode(ISD::ADD, DL, MVT::i32, GA, Remaining);
+ if (IsSmallObject(GV, *this)) {
+ // We can only fold positive offsets that are a multiple of the word size.
+ int64_t FoldedOffset = std::max(Offset & ~3, (int64_t)0);
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, FoldedOffset);
+ GA = getGlobalAddressWrapper(GA, GV, DAG);
+ // Handle the rest of the offset.
+ if (Offset != FoldedOffset) {
+ SDValue Remaining = DAG.getConstant(Offset - FoldedOffset, MVT::i32);
+ GA = DAG.getNode(ISD::ADD, DL, MVT::i32, GA, Remaining);
+ }
+ return GA;
+ } else {
+ // Ideally we would not fold in offset with an index <= 11.
+ Type *Ty = Type::getInt8PtrTy(*DAG.getContext());
+ Constant *GA = ConstantExpr::getBitCast(const_cast<GlobalValue*>(GV), Ty);
+ Ty = Type::getInt32Ty(*DAG.getContext());
+ Constant *Idx = ConstantInt::get(Ty, Offset);
+ Constant *GAI = ConstantExpr::getGetElementPtr(GA, Idx);
+ SDValue CP = DAG.getConstantPool(GAI, MVT::i32);
+ return DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), CP,
+ MachinePointerInfo(), false, false, false, 0);
}
- return GA;
}
SDValue XCoreTargetLowering::
@@ -307,10 +353,10 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
SDValue Res;
if (CP->isMachineConstantPoolEntry()) {
Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
- CP->getAlignment());
+ CP->getAlignment(), CP->getOffset());
} else {
Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
- CP->getAlignment());
+ CP->getAlignment(), CP->getOffset());
}
return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, Res);
}
@@ -767,15 +813,85 @@ LowerVASTART(SDValue Op, SelectionDAG &DAG) const
SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
- SDLoc dl(Op);
+ // This node represents llvm.frameaddress on the DAG.
+ // It takes one operand, the index of the frame address to return.
+ // An index of zero corresponds to the current function's frame address.
+ // An index of one to the parent's frame address, and so on.
// Depths > 0 not supported yet!
if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
return SDValue();
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterInfo *RegInfo = getTargetMachine().getRegisterInfo();
- return DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+ return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op),
+ RegInfo->getFrameRegister(MF), MVT::i32);
+}
+
+SDValue XCoreTargetLowering::
+LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
+ // This node represents llvm.returnaddress on the DAG.
+ // It takes one operand, the index of the return address to return.
+ // An index of zero corresponds to the current function's return address.
+ // An index of one to the parent's return address, and so on.
+ // Depths > 0 not supported yet!
+ if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+ return SDValue();
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+ int FI = XFI->createLRSpillSlot(MF);
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ return DAG.getLoad(getPointerTy(), SDLoc(Op), DAG.getEntryNode(), FIN,
+ MachinePointerInfo::getFixedStack(FI), false, false,
+ false, 0);
+}
+
+SDValue XCoreTargetLowering::
+LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const {
+ // This node represents offset from frame pointer to first on-stack argument.
+ // This is needed for correct stack adjustment during unwind.
+ // However, we don't know the offset until after the frame has been finalised.
+ // This is done during the XCoreFTAOElim pass.
+ return DAG.getNode(XCoreISD::FRAME_TO_ARGS_OFFSET, SDLoc(Op), MVT::i32);
+}
+
+SDValue XCoreTargetLowering::
+LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+ // OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER)
+ // This node represents 'eh_return' gcc dwarf builtin, which is used to
+ // return from exception. The general meaning is: adjust stack by OFFSET and
+ // pass execution to HANDLER.
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Offset = Op.getOperand(1);
+ SDValue Handler = Op.getOperand(2);
+ SDLoc dl(Op);
+
+ // Absolute SP = (FP + FrameToArgs) + Offset
+ const TargetRegisterInfo *RegInfo = getTargetMachine().getRegisterInfo();
+ SDValue Stack = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
RegInfo->getFrameRegister(MF), MVT::i32);
+ SDValue FrameToArgs = DAG.getNode(XCoreISD::FRAME_TO_ARGS_OFFSET, dl,
+ MVT::i32);
+ Stack = DAG.getNode(ISD::ADD, dl, MVT::i32, Stack, FrameToArgs);
+ Stack = DAG.getNode(ISD::ADD, dl, MVT::i32, Stack, Offset);
+
+ // R0=ExceptionPointerRegister R1=ExceptionSelectorRegister
+ // which leaves 2 caller saved registers, R2 & R3 for us to use.
+ unsigned StackReg = XCore::R2;
+ unsigned HandlerReg = XCore::R3;
+
+ SDValue OutChains[] = {
+ DAG.getCopyToReg(Chain, dl, StackReg, Stack),
+ DAG.getCopyToReg(Chain, dl, HandlerReg, Handler)
+ };
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 2);
+
+ return DAG.getNode(XCoreISD::EH_RETURN, dl, MVT::Other, Chain,
+ DAG.getRegister(StackReg, MVT::i32),
+ DAG.getRegister(HandlerReg, MVT::i32));
+
}
SDValue XCoreTargetLowering::
@@ -862,6 +978,67 @@ LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(XCoreISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
}
+SDValue XCoreTargetLowering::
+LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const {
+ AtomicSDNode *N = cast<AtomicSDNode>(Op);
+ assert(N->getOpcode() == ISD::ATOMIC_LOAD && "Bad Atomic OP");
+ assert(N->getOrdering() <= Monotonic &&
+ "setInsertFencesForAtomic(true) and yet greater than Monotonic");
+ if (N->getMemoryVT() == MVT::i32) {
+ if (N->getAlignment() < 4)
+ report_fatal_error("atomic load must be aligned");
+ return DAG.getLoad(getPointerTy(), SDLoc(Op), N->getChain(),
+ N->getBasePtr(), N->getPointerInfo(),
+ N->isVolatile(), N->isNonTemporal(),
+ N->isInvariant(), N->getAlignment(),
+ N->getTBAAInfo(), N->getRanges());
+ }
+ if (N->getMemoryVT() == MVT::i16) {
+ if (N->getAlignment() < 2)
+ report_fatal_error("atomic load must be aligned");
+ return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(),
+ N->getBasePtr(), N->getPointerInfo(), MVT::i16,
+ N->isVolatile(), N->isNonTemporal(),
+ N->getAlignment(), N->getTBAAInfo());
+ }
+ if (N->getMemoryVT() == MVT::i8)
+ return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(),
+ N->getBasePtr(), N->getPointerInfo(), MVT::i8,
+ N->isVolatile(), N->isNonTemporal(),
+ N->getAlignment(), N->getTBAAInfo());
+ return SDValue();
+}
+
+SDValue XCoreTargetLowering::
+LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const {
+ AtomicSDNode *N = cast<AtomicSDNode>(Op);
+ assert(N->getOpcode() == ISD::ATOMIC_STORE && "Bad Atomic OP");
+ assert(N->getOrdering() <= Monotonic &&
+ "setInsertFencesForAtomic(true) and yet greater than Monotonic");
+ if (N->getMemoryVT() == MVT::i32) {
+ if (N->getAlignment() < 4)
+ report_fatal_error("atomic store must be aligned");
+ return DAG.getStore(N->getChain(), SDLoc(Op), N->getVal(),
+ N->getBasePtr(), N->getPointerInfo(),
+ N->isVolatile(), N->isNonTemporal(),
+ N->getAlignment(), N->getTBAAInfo());
+ }
+ if (N->getMemoryVT() == MVT::i16) {
+ if (N->getAlignment() < 2)
+ report_fatal_error("atomic store must be aligned");
+ return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(),
+ N->getBasePtr(), N->getPointerInfo(), MVT::i16,
+ N->isVolatile(), N->isNonTemporal(),
+ N->getAlignment(), N->getTBAAInfo());
+ }
+ if (N->getMemoryVT() == MVT::i8)
+ return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(),
+ N->getBasePtr(), N->getPointerInfo(), MVT::i8,
+ N->isVolatile(), N->isNonTemporal(),
+ N->getAlignment(), N->getTBAAInfo());
+ return SDValue();
+}
+
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -902,6 +1079,52 @@ XCoreTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
}
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers / memory locations.
+static SDValue
+LowerCallResult(SDValue Chain, SDValue InFlag,
+ const SmallVectorImpl<CCValAssign> &RVLocs,
+ SDLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) {
+ SmallVector<std::pair<int, unsigned>, 4> ResultMemLocs;
+ // Copy results out of physical registers.
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ const CCValAssign &VA = RVLocs[i];
+ if (VA.isRegLoc()) {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getValVT(),
+ InFlag).getValue(1);
+ InFlag = Chain.getValue(2);
+ InVals.push_back(Chain.getValue(0));
+ } else {
+ assert(VA.isMemLoc());
+ ResultMemLocs.push_back(std::make_pair(VA.getLocMemOffset(),
+ InVals.size()));
+ // Reserve space for this result.
+ InVals.push_back(SDValue());
+ }
+ }
+
+ // Copy results out of memory.
+ SmallVector<SDValue, 4> MemOpChains;
+ for (unsigned i = 0, e = ResultMemLocs.size(); i != e; ++i) {
+ int offset = ResultMemLocs[i].first;
+ unsigned index = ResultMemLocs[i].second;
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ SDValue Ops[] = { Chain, DAG.getConstant(offset / 4, MVT::i32) };
+ SDValue load = DAG.getNode(XCoreISD::LDWSP, dl, VTs, Ops, 2);
+ InVals[index] = load;
+ MemOpChains.push_back(load.getValue(1));
+ }
+
+ // Transform all load nodes into one single node because
+ // all load nodes are independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ return Chain;
+}
+
/// LowerCCCCallTo - functions arguments are copied from virtual
/// regs to (physical regs)/(stack frame), CALLSEQ_START and
/// CALLSEQ_END are emitted.
@@ -927,8 +1150,15 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
CCInfo.AnalyzeCallOperands(Outs, CC_XCore);
+ SmallVector<CCValAssign, 16> RVLocs;
+ // Analyze return values to determine the number of bytes of stack required.
+ CCState RetCCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs, *DAG.getContext());
+ RetCCInfo.AllocateStack(CCInfo.getNextStackOffset(), 4);
+ RetCCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
+
// Get a count of how many bytes are to be pushed on the stack.
- unsigned NumBytes = CCInfo.getNextStackOffset();
+ unsigned NumBytes = RetCCInfo.getNextStackOffset();
Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes,
getPointerTy(), true), dl);
@@ -1026,35 +1256,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
// Handle result values, copying them out of physregs into vregs that we
// return.
- return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
- Ins, dl, DAG, InVals);
-}
-
-/// LowerCallResult - Lower the result values of a call into the
-/// appropriate copies out of appropriate physical registers.
-SDValue
-XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
-
- // Assign locations to each value returned by this call.
- SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
-
- CCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
-
- // Copy all of the result registers out of their specified physreg.
- for (unsigned i = 0; i != RVLocs.size(); ++i) {
- Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
- RVLocs[i].getValVT(), InFlag).getValue(1);
- InFlag = Chain.getValue(2);
- InVals.push_back(Chain.getValue(0));
- }
-
- return Chain;
+ return LowerCallResult(Chain, InFlag, RVLocs, dl, DAG, InVals);
}
//===----------------------------------------------------------------------===//
@@ -1102,6 +1304,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -1114,6 +1317,9 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
unsigned LRSaveSize = StackSlotSize;
+ if (!isVarArg)
+ XFI->setReturnStackOffset(CCInfo.getNextStackOffset() + LRSaveSize);
+
// All getCopyFromReg ops must precede any getMemcpys to prevent the
// scheduler clobbering a register before it has been copied.
// The stages are:
@@ -1230,7 +1436,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
unsigned Size = ArgDI->Flags.getByValSize();
unsigned Align = std::max(StackSlotSize, ArgDI->Flags.getByValAlign());
// Create a new object on the stack and copy the pointee into it.
- int FI = MFI->CreateStackObject(Size, Align, false, false);
+ int FI = MFI->CreateStackObject(Size, Align, false);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
InVals.push_back(FIN);
MemOps.push_back(DAG.getMemcpy(Chain, dl, FIN, ArgDI->SDV,
@@ -1264,7 +1470,11 @@ CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
- return CCInfo.CheckReturn(Outs, RetCC_XCore);
+ if (!CCInfo.CheckReturn(Outs, RetCC_XCore))
+ return false;
+ if (CCInfo.getNextStackOffset() != 0 && isVarArg)
+ return false;
+ return true;
}
SDValue
@@ -1274,6 +1484,10 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
const SmallVectorImpl<SDValue> &OutVals,
SDLoc dl, SelectionDAG &DAG) const {
+ XCoreFunctionInfo *XFI =
+ DAG.getMachineFunction().getInfo<XCoreFunctionInfo>();
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+
// CCValAssign - represent the assignment of
// the return value to a location
SmallVector<CCValAssign, 16> RVLocs;
@@ -1283,6 +1497,9 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
getTargetMachine(), RVLocs, *DAG.getContext());
// Analyze return values.
+ if (!isVarArg)
+ CCInfo.AllocateStack(XFI->getReturnStackOffset(), 4);
+
CCInfo.AnalyzeReturn(Outs, RetCC_XCore);
SDValue Flag;
@@ -1291,13 +1508,43 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
// Return on XCore is always a "retsp 0"
RetOps.push_back(DAG.getConstant(0, MVT::i32));
- // Copy the result values into the output registers.
- for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ SmallVector<SDValue, 4> MemOpChains;
+ // Handle return values that must be copied to memory.
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
- assert(VA.isRegLoc() && "Can only return in registers!");
+ if (VA.isRegLoc())
+ continue;
+ assert(VA.isMemLoc());
+ if (isVarArg) {
+ report_fatal_error("Can't return value from vararg function in memory");
+ }
+
+ int Offset = VA.getLocMemOffset();
+ unsigned ObjSize = VA.getLocVT().getSizeInBits() / 8;
+ // Create the frame index object for the memory location.
+ int FI = MFI->CreateFixedObject(ObjSize, Offset, false);
+
+ // Create a SelectionDAG node corresponding to a store
+ // to this memory location.
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ MemOpChains.push_back(DAG.getStore(Chain, dl, OutVals[i], FIN,
+ MachinePointerInfo::getFixedStack(FI), false, false,
+ 0));
+ }
+
+ // Transform all store nodes into one single node because
+ // all stores are independent of each other.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
- Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- OutVals[i], Flag);
+ // Now handle return values copied to registers.
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ if (!VA.isRegLoc())
+ continue;
+ // Copy the result values into the output registers.
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag);
// guarantee that all emitted copies are
// stuck together, avoiding something bad
@@ -1350,8 +1597,7 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Next, add the true and fallthrough blocks as its successors.
@@ -1392,6 +1638,46 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
SDLoc dl(N);
switch (N->getOpcode()) {
default: break;
+ case ISD::INTRINSIC_VOID:
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ case Intrinsic::xcore_outt:
+ case Intrinsic::xcore_outct:
+ case Intrinsic::xcore_chkct: {
+ SDValue OutVal = N->getOperand(3);
+ // These instructions ignore the high bits.
+ if (OutVal.hasOneUse()) {
+ unsigned BitWidth = OutVal.getValueSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(OutVal, DemandedMask) ||
+ TLI.SimplifyDemandedBits(OutVal, DemandedMask, KnownZero, KnownOne,
+ TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ break;
+ }
+ case Intrinsic::xcore_setpt: {
+ SDValue Time = N->getOperand(3);
+ // This instruction ignores the high bits.
+ if (Time.hasOneUse()) {
+ unsigned BitWidth = Time.getValueSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(Time, DemandedMask) ||
+ TLI.SimplifyDemandedBits(Time, DemandedMask, KnownZero, KnownOne,
+ TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ break;
+ }
+ }
+ break;
case XCoreISD::LADD: {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -1596,6 +1882,34 @@ void XCoreTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
KnownZero.getBitWidth() - 1);
}
break;
+ case ISD::INTRINSIC_W_CHAIN:
+ {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ case Intrinsic::xcore_getts:
+ // High bits are known to be zero.
+ KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+ KnownZero.getBitWidth() - 16);
+ break;
+ case Intrinsic::xcore_int:
+ case Intrinsic::xcore_inct:
+ // High bits are known to be zero.
+ KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+ KnownZero.getBitWidth() - 8);
+ break;
+ case Intrinsic::xcore_testct:
+ // Result is either 0 or 1.
+ KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+ KnownZero.getBitWidth() - 1);
+ break;
+ case Intrinsic::xcore_testwct:
+ // Result is in the range 0 - 4.
+ KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+ KnownZero.getBitWidth() - 3);
+ break;
+ }
+ }
+ break;
}
}
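The computeMaskedBitsForTargetNode() cases added above simply record range facts about the port/timer intrinsics (getts fits in 16 bits, int/inct in 8, testct in 1, testwct in 0-4). A minimal standalone sketch of the same "known zero bits" idea, using plain integers instead of APInt and purely illustrative helper names, could look like:

#include <cstdint>
#include <cstdio>

// Known-zero mask for a value that fits in the low n bits: every higher bit
// is guaranteed to be zero, which later simplifications can rely on.
static uint32_t knownZeroAboveLowBits(unsigned n) {
  return n >= 32 ? 0u : ~((1u << n) - 1);
}

int main() {
  std::printf("getts-style (16 bits):  %#010x\n", knownZeroAboveLowBits(16));
  std::printf("testwct-style (3 bits): %#010x\n", knownZeroAboveLowBits(3));
  return 0;
}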
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index bc08497..65e2bad 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -42,6 +42,9 @@ namespace llvm {
// cp relative address
CPRelativeWrapper,
+ // Load word from stack
+ LDWSP,
+
// Store word to stack
STWSP,
@@ -72,6 +75,13 @@ namespace llvm {
// Jumptable branch using long branches for each entry.
BR_JT32,
+ // Offset from frame pointer to the first (possible) on-stack argument
+ FRAME_TO_ARGS_OFFSET,
+
+ // Exception handler return. The stack is restored to the first
+ // argument, followed by a jump to the second argument.
+ EH_RETURN,
+
// Memory barrier.
MEMBARRIER
};
@@ -132,11 +142,6 @@ namespace llvm {
const SmallVectorImpl<ISD::InputArg> &Ins,
SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
- SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
SDValue getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
SelectionDAG &DAG) const;
@@ -147,6 +152,7 @@ namespace llvm {
// Lower Operand specifics
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -158,10 +164,14 @@ namespace llvm {
SDValue LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
// Inline asm support
std::pair<unsigned, const TargetRegisterClass*>
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index 33c7f31..cea3bbf 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -15,8 +15,12 @@
#include "XCore.h"
#include "XCoreMachineFunctionInfo.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -371,10 +375,18 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
{
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ const MachineFrameInfo &MFI = *MF->getFrameInfo();
+ MachineMemOperand *MMO =
+ MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex),
+ MachineMemOperand::MOStore,
+ MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlignment(FrameIndex));
BuildMI(MBB, I, DL, get(XCore::STWFI))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FrameIndex)
- .addImm(0);
+ .addImm(0)
+ .addMemOperand(MMO);
}
void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -385,9 +397,17 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
{
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ const MachineFrameInfo &MFI = *MF->getFrameInfo();
+ MachineMemOperand *MMO =
+ MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex),
+ MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlignment(FrameIndex));
BuildMI(MBB, I, DL, get(XCore::LDWFI), DestReg)
.addFrameIndex(FrameIndex)
- .addImm(0);
+ .addImm(0)
+ .addMemOperand(MMO);
}
/// ReverseBranchCondition - Return the inverse opcode of the
@@ -399,3 +419,33 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
Cond[0].setImm(GetOppositeBranchCondition((XCore::CondCode)Cond[0].getImm()));
return false;
}
+
+static inline bool isImmU6(unsigned val) {
+ return val < (1 << 6);
+}
+
+static inline bool isImmU16(unsigned val) {
+ return val < (1 << 16);
+}
+
+MachineBasicBlock::iterator XCoreInstrInfo::loadImmediate(
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned Reg, uint64_t Value) const {
+ DebugLoc dl;
+ if (MI != MBB.end()) dl = MI->getDebugLoc();
+ if (isMask_32(Value)) {
+ int N = Log2_32(Value) + 1;
+ return BuildMI(MBB, MI, dl, get(XCore::MKMSK_rus), Reg).addImm(N);
+ }
+ if (isImmU16(Value)) {
+ int Opcode = isImmU6(Value) ? XCore::LDC_ru6 : XCore::LDC_lru6;
+ return BuildMI(MBB, MI, dl, get(Opcode), Reg).addImm(Value);
+ }
+ MachineConstantPool *ConstantPool = MBB.getParent()->getConstantPool();
+ const Constant *C = ConstantInt::get(
+ Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Value);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+ return BuildMI(MBB, MI, dl, get(XCore::LDWCP_lru6), Reg)
+ .addConstantPoolIndex(Idx);
+}
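For context, the new loadImmediate() helper above picks between three ways of materialising a 32-bit constant: MKMSK for contiguous low-bit masks, LDC for immediates that fit in 6 or 16 bits, and a constant-pool load otherwise. A rough standalone sketch of that decision, under those assumptions (opcode names as strings, not real MachineInstr building), might be:

#include <cstdint>
#include <cstdio>

// Same test as isMask_32: all set bits are contiguous from bit 0.
static bool isMask32(uint32_t v) { return v != 0 && ((v + 1) & v) == 0; }

// Hypothetical helper mirroring the order of checks in loadImmediate().
static const char *pickLoadImmOpcode(uint32_t v) {
  if (isMask32(v))    return "MKMSK_rus";   // 0x1, 0x3, ..., 0xffffffff
  if (v < (1u << 6))  return "LDC_ru6";     // short 6-bit immediate
  if (v < (1u << 16)) return "LDC_lru6";    // long 16-bit immediate
  return "LDWCP_lru6";                      // fall back to the constant pool
}

int main() {
  const uint32_t vals[] = {0x7, 0x2A, 0x40, 0xFFFF, 0x12345};
  for (uint32_t v : vals)
    std::printf("%#x -> %s\n", v, pickLoadImmOpcode(v));
  return 0;
}

Note that a value such as 0xFFFF is matched by the mask check before the 16-bit check, so it is emitted as a single MKMSK rather than a long LDC.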
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index 4429b07..48c9cb5 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -81,6 +81,12 @@ public:
virtual bool ReverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const;
+
+ // Emit code before MBBI to load immediate value into physical register Reg.
+ // Returns an iterator to the new instruction.
+ MachineBasicBlock::iterator loadImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned Reg, uint64_t Value) const;
};
}
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index 934a707..00cb705 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -35,6 +35,11 @@ def XCoreBranchLink : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink,
def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind,
[SDNPHasChain, SDNPOptInGlue, SDNPMayLoad, SDNPVariadic]>;
+def SDT_XCoreEhRet : SDTypeProfile<0, 2,
+ [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def XCoreEhRet : SDNode<"XCoreISD::EH_RETURN", SDT_XCoreEhRet,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
def SDT_XCoreBR_JT : SDTypeProfile<0, 2,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
@@ -56,10 +61,17 @@ def dprelwrapper : SDNode<"XCoreISD::DPRelativeWrapper", SDT_XCoreAddress,
def cprelwrapper : SDNode<"XCoreISD::CPRelativeWrapper", SDT_XCoreAddress,
[]>;
+def frametoargsoffset : SDNode<"XCoreISD::FRAME_TO_ARGS_OFFSET", SDTIntLeaf,
+ []>;
+
def SDT_XCoreStwsp : SDTypeProfile<0, 2, [SDTCisInt<1>]>;
def XCoreStwsp : SDNode<"XCoreISD::STWSP", SDT_XCoreStwsp,
[SDNPHasChain, SDNPMayStore]>;
+def SDT_XCoreLdwsp : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
+def XCoreLdwsp : SDNode<"XCoreISD::LDWSP", SDT_XCoreLdwsp,
+ [SDNPHasChain, SDNPMayLoad]>;
+
// These are target-independent nodes, but have target-specific formats.
def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
def SDT_XCoreCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
@@ -326,6 +338,16 @@ def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
+let isReMaterializable = 1 in
+def FRAME_TO_ARGS_OFFSET : PseudoInstXCore<(outs GRRegs:$dst), (ins),
+ "# FRAME_TO_ARGS_OFFSET $dst",
+ [(set GRRegs:$dst, (frametoargsoffset))]>;
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in
+def EH_RETURN : PseudoInstXCore<(outs), (ins GRRegs:$s, GRRegs:$handler),
+ "# EH_RETURN $s, $handler",
+ [(XCoreEhRet GRRegs:$s, GRRegs:$handler)]>;
+
def LDWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr),
"# LDWFI $dst, $addr",
[(set GRRegs:$dst, (load ADDRspii:$addr))]>;
@@ -563,10 +585,12 @@ def STWSP_lru6 : _FLRU6<0b010101, (outs), (ins RRegs:$a, i32imm:$b),
let mayLoad=1 in {
def LDWSP_ru6 : _FRU6<0b010111, (outs RRegs:$a), (ins i32imm:$b),
- "ldw $a, sp[$b]", []>;
+ "ldw $a, sp[$b]",
+ [(set RRegs:$a, (XCoreLdwsp immU6:$b))]>;
def LDWSP_lru6 : _FLRU6<0b010111, (outs RRegs:$a), (ins i32imm:$b),
- "ldw $a, sp[$b]", []>;
+ "ldw $a, sp[$b]",
+ [(set RRegs:$a, (XCoreLdwsp immU16:$b))]>;
}
let neverHasSideEffects = 1 in {
@@ -694,10 +718,10 @@ def BLACP_u10 : _FU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>;
def BLACP_lu10 : _FLU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>;
def BLRF_u10 : _FU10<0b110100, (outs), (ins pcrel_imm:$a), "bl $a",
- [(XCoreBranchLink immU10:$a)]>;
+ []>;
def BLRF_lu10 : _FLU10<0b110100, (outs), (ins pcrel_imm:$a), "bl $a",
- [(XCoreBranchLink immU20:$a)]>;
+ [(XCoreBranchLink tglobaladdr:$a)]>;
def BLRB_u10 : _FU10<0b110101, (outs), (ins pcrel_imm_neg:$a), "bl $a", []>;
@@ -995,7 +1019,8 @@ def SETEV_1r : _F1R<0b001111, (outs), (ins GRRegs:$a),
def DGETREG_1r : _F1R<0b001110, (outs GRRegs:$a), (ins), "dgetreg $a", []>;
-def EDU_1r : _F1R<0b000000, (outs), (ins GRRegs:$a), "edu res[$a]", []>;
+def EDU_1r : _F1R<0b000000, (outs), (ins GRRegs:$a), "edu res[$a]",
+ [(int_xcore_edu GRRegs:$a)]>;
def EEU_1r : _F1R<0b000001, (outs), (ins GRRegs:$a),
"eeu res[$a]",
@@ -1009,7 +1034,8 @@ def WAITET_1R : _F1R<0b000010, (outs), (ins GRRegs:$a), "waitet $a", []>;
def TSTART_1R : _F1R<0b000110, (outs), (ins GRRegs:$a), "start t[$a]", []>;
-def CLRPT_1R : _F1R<0b100000, (outs), (ins GRRegs:$a), "clrpt res[$a]", []>;
+def CLRPT_1R : _F1R<0b100000, (outs), (ins GRRegs:$a), "clrpt res[$a]",
+ [(int_xcore_clrpt GRRegs:$a)]>;
// Zero operand short
@@ -1087,7 +1113,6 @@ def WAITEU_0R : _F0R<0b0000001100, (outs), (ins),
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
-def : Pat<(XCoreBranchLink tglobaladdr:$addr), (BLRF_lu10 tglobaladdr:$addr)>;
def : Pat<(XCoreBranchLink texternalsym:$addr), (BLRF_lu10 texternalsym:$addr)>;
/// sext_inreg
@@ -1286,3 +1311,9 @@ def : Pat<(setgt GRRegs:$lhs, -1),
def : Pat<(sra (shl GRRegs:$src, immBpwSubBitp:$imm), immBpwSubBitp:$imm),
(SEXT_rus GRRegs:$src, (bpwsub_xform immBpwSubBitp:$imm))>;
+
+def : Pat<(load (cprelwrapper tconstpool:$b)),
+ (LDWCP_lru6 tconstpool:$b)>;
+
+def : Pat<(cprelwrapper tconstpool:$b),
+ (LDAWCP_lu6 tconstpool:$b)>;
diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index afce753..b398c2d 100644
--- a/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -17,13 +17,13 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/NoFolder.h"
-#include "llvm/Support/ValueHandle.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#define DEBUG_TYPE "xcore-lower-thread-local"
@@ -127,10 +127,7 @@ createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
do {
- SmallVector<WeakVH,8> WUsers;
- for (Value::use_iterator I = CE->use_begin(), E = CE->use_end();
- I != E; ++I)
- WUsers.push_back(WeakVH(*I));
+ SmallVector<WeakVH,8> WUsers(CE->user_begin(), CE->user_end());
std::sort(WUsers.begin(), WUsers.end());
WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
while (!WUsers.empty())
@@ -154,17 +151,17 @@ static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
return false;
}
}
- } while (CE->hasNUsesOrMore(1)); // We need to check becasue a recursive
- // sibbling may have used 'CE' when createReplacementInstr was called.
+ } while (CE->hasNUsesOrMore(1)); // We need to check because a recursive
+ // sibling may have used 'CE' when createReplacementInstr was called.
CE->destroyConstant();
return true;
}
static bool rewriteNonInstructionUses(GlobalVariable *GV, Pass *P) {
SmallVector<WeakVH,8> WUsers;
- for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I)
- if (!isa<Instruction>(*I))
- WUsers.push_back(WeakVH(*I));
+ for (User *U : GV->users())
+ if (!isa<Instruction>(U))
+ WUsers.push_back(WeakVH(U));
while (!WUsers.empty())
if (WeakVH WU = WUsers.pop_back_val()) {
ConstantExpr *CE = dyn_cast<ConstantExpr>(WU);
@@ -203,7 +200,7 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
GV->isExternallyInitialized());
// Update uses.
- SmallVector<User *, 16> Users(GV->use_begin(), GV->use_end());
+ SmallVector<User *, 16> Users(GV->user_begin(), GV->user_end());
for (unsigned I = 0, E = Users.size(); I != E; ++I) {
User *U = Users[I];
Instruction *Inst = cast<Instruction>(U);
diff --git a/lib/Target/XCore/XCoreMCInstLower.cpp b/lib/Target/XCore/XCoreMCInstLower.cpp
index def2673..dfdadcf 100644
--- a/lib/Target/XCore/XCoreMCInstLower.cpp
+++ b/lib/Target/XCore/XCoreMCInstLower.cpp
@@ -17,10 +17,10 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Target/Mangler.h"
using namespace llvm;
diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
index 7ca0672..9ef9752 100644
--- a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
+++ b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
@@ -8,7 +8,65 @@
//===----------------------------------------------------------------------===//
#include "XCoreMachineFunctionInfo.h"
+#include "XCoreInstrInfo.h"
+#include "llvm/IR/Function.h"
using namespace llvm;
void XCoreFunctionInfo::anchor() { }
+
+bool XCoreFunctionInfo::isLargeFrame(const MachineFunction &MF) const {
+ if (CachedEStackSize == -1) {
+ CachedEStackSize = MF.getFrameInfo()->estimateStackSize(MF);
+ }
+ // isLargeFrame() is used when deciding if spill slots should be added to
+ // allow eliminateFrameIndex() to scavenge registers.
+ // This is only required when there is no FP and offsets are greater than
+ // ~256KB (~64Kwords). Thus only for code run on the emulator!
+ //
+ // The arbitrary value of 0xf000 allows frames of up to ~240KB before spill
+ // slots are added for the use of eliminateFrameIndex() register scavenging.
+ // For frames less than 240KB, it is assumed that there will be less than
+ // 16KB of function arguments.
+ return CachedEStackSize > 0xf000;
+}
+
+int XCoreFunctionInfo::createLRSpillSlot(MachineFunction &MF) {
+ if (LRSpillSlotSet) {
+ return LRSpillSlot;
+ }
+ const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ if (! MF.getFunction()->isVarArg()) {
+ // A fixed offset of 0 allows us to save / restore LR using entsp / retsp.
+ LRSpillSlot = MFI->CreateFixedObject(RC->getSize(), 0, true);
+ } else {
+ LRSpillSlot = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+ }
+ LRSpillSlotSet = true;
+ return LRSpillSlot;
+}
+
+int XCoreFunctionInfo::createFPSpillSlot(MachineFunction &MF) {
+ if (FPSpillSlotSet) {
+ return FPSpillSlot;
+ }
+ const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ FPSpillSlot = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+ FPSpillSlotSet = true;
+ return FPSpillSlot;
+}
+
+const int* XCoreFunctionInfo::createEHSpillSlot(MachineFunction &MF) {
+ if (EHSpillSlotSet) {
+ return EHSpillSlot;
+ }
+ const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ EHSpillSlot[0] = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+ EHSpillSlot[1] = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+ EHSpillSlotSet = true;
+ return EHSpillSlot;
+}
+
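As a side note, the isLargeFrame() predicate above just compares a cached estimate of the frame size against the arbitrary 0xf000 cut-off described in its comment; a trivial standalone rendering of that test (illustrative names, no MachineFunction involved) is:

#include <cstdio>

// Once the estimated frame exceeds this, extra spill slots are created so
// eliminateFrameIndex() has registers to scavenge for long offsets.
static const int LargeFrameCutoff = 0xf000;

static bool isLargeFrame(int estimatedStackSize) {
  return estimatedStackSize > LargeFrameCutoff;
}

int main() {
  std::printf("%d %d\n", isLargeFrame(0xf000), isLargeFrame(0xf001)); // 0 1
  return 0;
}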
diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.h b/lib/Target/XCore/XCoreMachineFunctionInfo.h
index 69d5de3..212a5cf 100644
--- a/lib/Target/XCore/XCoreMachineFunctionInfo.h
+++ b/lib/Target/XCore/XCoreMachineFunctionInfo.h
@@ -27,40 +27,77 @@ class Function;
/// XCore target-specific information for each MachineFunction.
class XCoreFunctionInfo : public MachineFunctionInfo {
virtual void anchor();
- bool UsesLR;
+ bool LRSpillSlotSet;
int LRSpillSlot;
+ bool FPSpillSlotSet;
int FPSpillSlot;
+ bool EHSpillSlotSet;
+ int EHSpillSlot[2];
+ unsigned ReturnStackOffset;
+ bool ReturnStackOffsetSet;
int VarArgsFrameIndex;
- std::vector<std::pair<MCSymbol*, CalleeSavedInfo> > SpillLabels;
+ mutable int CachedEStackSize;
+ std::vector<std::pair<MachineBasicBlock::iterator, CalleeSavedInfo>>
+ SpillLabels;
public:
XCoreFunctionInfo() :
- UsesLR(false),
- LRSpillSlot(0),
- FPSpillSlot(0),
- VarArgsFrameIndex(0) {}
+ LRSpillSlotSet(false),
+ FPSpillSlotSet(false),
+ EHSpillSlotSet(false),
+ ReturnStackOffsetSet(false),
+ VarArgsFrameIndex(0),
+ CachedEStackSize(-1) {}
explicit XCoreFunctionInfo(MachineFunction &MF) :
- UsesLR(false),
- LRSpillSlot(0),
- FPSpillSlot(0),
- VarArgsFrameIndex(0) {}
+ LRSpillSlotSet(false),
+ FPSpillSlotSet(false),
+ EHSpillSlotSet(false),
+ ReturnStackOffsetSet(false),
+ VarArgsFrameIndex(0),
+ CachedEStackSize(-1) {}
~XCoreFunctionInfo() {}
void setVarArgsFrameIndex(int off) { VarArgsFrameIndex = off; }
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
-
- void setUsesLR(bool val) { UsesLR = val; }
- bool getUsesLR() const { return UsesLR; }
-
- void setLRSpillSlot(int off) { LRSpillSlot = off; }
- int getLRSpillSlot() const { return LRSpillSlot; }
-
- void setFPSpillSlot(int off) { FPSpillSlot = off; }
- int getFPSpillSlot() const { return FPSpillSlot; }
-
- std::vector<std::pair<MCSymbol*, CalleeSavedInfo> > &getSpillLabels() {
+
+ int createLRSpillSlot(MachineFunction &MF);
+ bool hasLRSpillSlot() { return LRSpillSlotSet; }
+ int getLRSpillSlot() const {
+ assert(LRSpillSlotSet && "LR Spill slot not set");
+ return LRSpillSlot;
+ }
+
+ int createFPSpillSlot(MachineFunction &MF);
+ bool hasFPSpillSlot() { return FPSpillSlotSet; }
+ int getFPSpillSlot() const {
+ assert(FPSpillSlotSet && "FP Spill slot not set");
+ return FPSpillSlot;
+ }
+
+ const int* createEHSpillSlot(MachineFunction &MF);
+ bool hasEHSpillSlot() { return EHSpillSlotSet; }
+ const int* getEHSpillSlot() const {
+ assert(EHSpillSlotSet && "EH Spill slot not set");
+ return EHSpillSlot;
+ }
+
+ void setReturnStackOffset(unsigned value) {
+ assert(!ReturnStackOffsetSet && "Return stack offset set twice");
+ ReturnStackOffset = value;
+ ReturnStackOffsetSet = true;
+ }
+
+ unsigned getReturnStackOffset() const {
+ assert(ReturnStackOffsetSet && "Return stack offset not set");
+ return ReturnStackOffset;
+ }
+
+ bool isLargeFrame(const MachineFunction &MF) const;
+
+ std::vector<std::pair<MachineBasicBlock::iterator, CalleeSavedInfo>> &
+ getSpillLabels() {
return SpillLabels;
}
};
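The accessors above follow a simple create-once pattern: each spill slot is created lazily the first time it is requested, a boolean records that it exists, and later calls return the cached index (with the getters asserting that creation has happened). A generic standalone sketch of that pattern, nothing XCore-specific:

#include <cstdio>

struct SpillSlotCache {
  bool set = false;
  int slot = 0;
  // Create the slot on first use, then keep handing back the same index.
  int getOrCreate(int &nextIndex) {
    if (!set) { slot = nextIndex++; set = true; }
    return slot;
  }
};

int main() {
  int next = 0;
  SpillSlotCache lr;
  std::printf("%d %d (next=%d)\n",
              lr.getOrCreate(next), lr.getOrCreate(next), next); // 0 0 (next=1)
  return 0;
}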
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index dbd2f52..d85d717 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -13,6 +13,7 @@
#include "XCoreRegisterInfo.h"
#include "XCore.h"
+#include "XCoreInstrInfo.h"
#include "XCoreMachineFunctionInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
@@ -26,9 +27,9 @@
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -54,6 +55,151 @@ static inline bool isImmU16(unsigned val) {
return val < (1 << 16);
}
+
+static void InsertFPImmInst(MachineBasicBlock::iterator II,
+ const XCoreInstrInfo &TII,
+ unsigned Reg, unsigned FrameReg, int Offset ) {
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc dl = MI.getDebugLoc();
+
+ switch (MI.getOpcode()) {
+ case XCore::LDWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::LDW_2rus), Reg)
+ .addReg(FrameReg)
+ .addImm(Offset)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::STWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::STW_2rus))
+ .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+ .addReg(FrameReg)
+ .addImm(Offset)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::LDAWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l2rus), Reg)
+ .addReg(FrameReg)
+ .addImm(Offset);
+ break;
+ default:
+ llvm_unreachable("Unexpected Opcode");
+ }
+}
+
+static void InsertFPConstInst(MachineBasicBlock::iterator II,
+ const XCoreInstrInfo &TII,
+ unsigned Reg, unsigned FrameReg,
+ int Offset, RegScavenger *RS ) {
+ assert(RS && "requiresRegisterScavenging failed");
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc dl = MI.getDebugLoc();
+ unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
+ RS->setUsed(ScratchOffset);
+ TII.loadImmediate(MBB, II, ScratchOffset, Offset);
+
+ switch (MI.getOpcode()) {
+ case XCore::LDWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
+ .addReg(FrameReg)
+ .addReg(ScratchOffset, RegState::Kill)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::STWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::STW_l3r))
+ .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+ .addReg(FrameReg)
+ .addReg(ScratchOffset, RegState::Kill)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::LDAWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
+ .addReg(FrameReg)
+ .addReg(ScratchOffset, RegState::Kill);
+ break;
+ default:
+ llvm_unreachable("Unexpected Opcode");
+ }
+}
+
+static void InsertSPImmInst(MachineBasicBlock::iterator II,
+ const XCoreInstrInfo &TII,
+ unsigned Reg, int Offset) {
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc dl = MI.getDebugLoc();
+ bool isU6 = isImmU6(Offset);
+
+ switch (MI.getOpcode()) {
+ int NewOpcode;
+ case XCore::LDWFI:
+ NewOpcode = (isU6) ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+ BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+ .addImm(Offset)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::STWFI:
+ NewOpcode = (isU6) ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+ BuildMI(MBB, II, dl, TII.get(NewOpcode))
+ .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+ .addImm(Offset)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::LDAWFI:
+ NewOpcode = (isU6) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+ BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+ .addImm(Offset);
+ break;
+ default:
+ llvm_unreachable("Unexpected Opcode");
+ }
+}
+
+static void InsertSPConstInst(MachineBasicBlock::iterator II,
+ const XCoreInstrInfo &TII,
+ unsigned Reg, int Offset, RegScavenger *RS ) {
+ assert(RS && "requiresRegisterScavenging failed");
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc dl = MI.getDebugLoc();
+ unsigned OpCode = MI.getOpcode();
+
+ unsigned ScratchBase;
+ if (OpCode==XCore::STWFI) {
+ ScratchBase = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
+ RS->setUsed(ScratchBase);
+ } else
+ ScratchBase = Reg;
+ BuildMI(MBB, II, dl, TII.get(XCore::LDAWSP_ru6), ScratchBase).addImm(0);
+ unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
+ RS->setUsed(ScratchOffset);
+ TII.loadImmediate(MBB, II, ScratchOffset, Offset);
+
+ switch (OpCode) {
+ case XCore::LDWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
+ .addReg(ScratchBase, RegState::Kill)
+ .addReg(ScratchOffset, RegState::Kill)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::STWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::STW_l3r))
+ .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+ .addReg(ScratchBase, RegState::Kill)
+ .addReg(ScratchOffset, RegState::Kill)
+ .addMemOperand(*MI.memoperands_begin());
+ break;
+ case XCore::LDAWFI:
+ BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
+ .addReg(ScratchBase, RegState::Kill)
+ .addReg(ScratchOffset, RegState::Kill);
+ break;
+ default:
+ llvm_unreachable("Unexpected Opcode");
+ }
+}
+
bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) {
return MF.getMMI().hasDebugInfo() ||
MF.getFunction()->needsUnwindTableEntry();
@@ -61,11 +207,21 @@ bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) {
const uint16_t* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
const {
+ // The callee saved registers LR & FP are explicitly handled during
+ // emitPrologue & emitEpilogue and related functions.
static const uint16_t CalleeSavedRegs[] = {
XCore::R4, XCore::R5, XCore::R6, XCore::R7,
- XCore::R8, XCore::R9, XCore::R10, XCore::LR,
+ XCore::R8, XCore::R9, XCore::R10,
+ 0
+ };
+ static const uint16_t CalleeSavedRegsFP[] = {
+ XCore::R4, XCore::R5, XCore::R6, XCore::R7,
+ XCore::R8, XCore::R9,
0
};
+ const TargetFrameLowering *TFI = MF->getTarget().getFrameLowering();
+ if (TFI->hasFP(*MF))
+ return CalleeSavedRegsFP;
return CalleeSavedRegs;
}
@@ -85,15 +241,12 @@ BitVector XCoreRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
bool
XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-
- // TODO can we estimate stack size?
- return TFI->hasFP(MF);
+ return true;
}
bool
XCoreRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- return requiresRegisterScavenging(MF);
+ return true;
}
bool
@@ -107,12 +260,13 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
MachineInstr &MI = *II;
- DebugLoc dl = MI.getDebugLoc();
MachineOperand &FrameOp = MI.getOperand(FIOperandNum);
int FrameIndex = FrameOp.getIndex();
MachineFunction &MF = *MI.getParent()->getParent();
- const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ const XCoreInstrInfo &TII =
+ *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
int StackSize = MF.getFrameInfo()->getStackSize();
@@ -143,116 +297,28 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
assert(Offset%4 == 0 && "Misaligned stack offset");
-
DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
-
Offset/=4;
- bool FP = TFI->hasFP(MF);
-
unsigned Reg = MI.getOperand(0).getReg();
- bool isKill = MI.getOpcode() == XCore::STWFI && MI.getOperand(0).isKill();
-
assert(XCore::GRRegsRegClass.contains(Reg) && "Unexpected register operand");
-
- MachineBasicBlock &MBB = *MI.getParent();
-
- if (FP) {
- bool isUs = isImmUs(Offset);
-
- if (!isUs) {
- if (!RS)
- report_fatal_error("eliminateFrameIndex Frame size too big: " +
- Twine(Offset));
- unsigned ScratchReg = RS->scavengeRegister(&XCore::GRRegsRegClass, II,
- SPAdj);
- loadConstant(MBB, II, ScratchReg, Offset, dl);
- switch (MI.getOpcode()) {
- case XCore::LDWFI:
- BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
- .addReg(FrameReg)
- .addReg(ScratchReg, RegState::Kill);
- break;
- case XCore::STWFI:
- BuildMI(MBB, II, dl, TII.get(XCore::STW_l3r))
- .addReg(Reg, getKillRegState(isKill))
- .addReg(FrameReg)
- .addReg(ScratchReg, RegState::Kill);
- break;
- case XCore::LDAWFI:
- BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
- .addReg(FrameReg)
- .addReg(ScratchReg, RegState::Kill);
- break;
- default:
- llvm_unreachable("Unexpected Opcode");
- }
- } else {
- switch (MI.getOpcode()) {
- case XCore::LDWFI:
- BuildMI(MBB, II, dl, TII.get(XCore::LDW_2rus), Reg)
- .addReg(FrameReg)
- .addImm(Offset);
- break;
- case XCore::STWFI:
- BuildMI(MBB, II, dl, TII.get(XCore::STW_2rus))
- .addReg(Reg, getKillRegState(isKill))
- .addReg(FrameReg)
- .addImm(Offset);
- break;
- case XCore::LDAWFI:
- BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l2rus), Reg)
- .addReg(FrameReg)
- .addImm(Offset);
- break;
- default:
- llvm_unreachable("Unexpected Opcode");
- }
- }
+
+ if (TFI->hasFP(MF)) {
+ if (isImmUs(Offset))
+ InsertFPImmInst(II, TII, Reg, FrameReg, Offset);
+ else
+ InsertFPConstInst(II, TII, Reg, FrameReg, Offset, RS);
} else {
- bool isU6 = isImmU6(Offset);
- if (!isU6 && !isImmU16(Offset))
- report_fatal_error("eliminateFrameIndex Frame size too big: " +
- Twine(Offset));
-
- switch (MI.getOpcode()) {
- int NewOpcode;
- case XCore::LDWFI:
- NewOpcode = (isU6) ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
- BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
- .addImm(Offset);
- break;
- case XCore::STWFI:
- NewOpcode = (isU6) ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
- BuildMI(MBB, II, dl, TII.get(NewOpcode))
- .addReg(Reg, getKillRegState(isKill))
- .addImm(Offset);
- break;
- case XCore::LDAWFI:
- NewOpcode = (isU6) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
- BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
- .addImm(Offset);
- break;
- default:
- llvm_unreachable("Unexpected Opcode");
- }
+ if (isImmU16(Offset))
+ InsertSPImmInst(II, TII, Reg, Offset);
+ else
+ InsertSPConstInst(II, TII, Reg, Offset, RS);
}
// Erase old instruction.
+ MachineBasicBlock &MBB = *MI.getParent();
MBB.erase(II);
}
-void XCoreRegisterInfo::
-loadConstant(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DstReg, int64_t Value, DebugLoc dl) const {
- // TODO use mkmsk if possible.
- if (!isImmU16(Value)) {
- // TODO use constant pool.
- report_fatal_error("loadConstant value too big " + Twine(Value));
- }
- int Opcode = isImmU6(Value) ? XCore::LDC_ru6 : XCore::LDC_lru6;
- const TargetInstrInfo &TII = *MBB.getParent()->getTarget().getInstrInfo();
- BuildMI(MBB, I, dl, TII.get(Opcode), DstReg).addImm(Value);
-}
unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 2370c62..36ba7b4 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -24,19 +24,6 @@ namespace llvm {
class TargetInstrInfo;
struct XCoreRegisterInfo : public XCoreGenRegisterInfo {
-private:
- void loadConstant(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DstReg, int64_t Value, DebugLoc dl) const;
-
- void storeToStack(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned SrcReg, int Offset, DebugLoc dl) const;
-
- void loadFromStack(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned DstReg, int Offset, DebugLoc dl) const;
-
public:
XCoreRegisterInfo();
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index 44aeb60..68ede6a 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -21,3 +21,36 @@ XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const XCoreTargetMachine &TM)
XCoreSelectionDAGInfo::~XCoreSelectionDAGInfo() {
}
+
+SDValue XCoreSelectionDAGInfo::
+EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
+ SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
+ bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const
+{
+ unsigned SizeBitWidth = Size.getValueType().getSizeInBits();
+ // Call __memcpy_4 if the src, dst and size are all 4 byte aligned.
+ if (!AlwaysInline && (Align & 3) == 0 &&
+ DAG.MaskedValueIsZero(Size, APInt(SizeBitWidth, 3))) {
+ const TargetLowering &TLI = *DAG.getTarget().getTargetLowering();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = TLI.getDataLayout()->getIntPtrType(*DAG.getContext());
+ Entry.Node = Dst; Args.push_back(Entry);
+ Entry.Node = Src; Args.push_back(Entry);
+ Entry.Node = Size; Args.push_back(Entry);
+
+ TargetLowering::CallLoweringInfo
+ CLI(Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false,
+ 0, TLI.getLibcallCallingConv(RTLIB::MEMCPY), /*isTailCall=*/false,
+ /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
+ DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()), Args, DAG, dl);
+ std::pair<SDValue,SDValue> CallResult =
+ TLI.LowerCallTo(CLI);
+ return CallResult.second;
+ }
+
+ // Otherwise have the target-independent code call memcpy.
+ return SDValue();
+}
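For what it's worth, the memcpy lowering added above only takes the __memcpy_4 path when both the pointer alignment and the size are known multiples of four (the size check is done on known bits, so it also covers non-constant sizes). A simplified standalone version of the constant-size case, with illustrative names, looks like:

#include <cstdint>
#include <cstdio>

// Returns true when a copy of `size` bytes between pointers with the given
// alignment can be handed to a word-at-a-time helper such as __memcpy_4.
static bool canUseWordCopy(unsigned alignInBytes, uint64_t size) {
  return (alignInBytes & 3) == 0 && (size & 3) == 0;
}

int main() {
  std::printf("%d\n", canUseWordCopy(4, 32));  // 1: word aligned, whole words
  std::printf("%d\n", canUseWordCopy(4, 30));  // 0: trailing bytes
  std::printf("%d\n", canUseWordCopy(2, 32));  // 0: only 2-byte aligned
  return 0;
}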
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h
index 0386968..31704f3 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -24,6 +24,15 @@ class XCoreSelectionDAGInfo : public TargetSelectionDAGInfo {
public:
explicit XCoreSelectionDAGInfo(const XCoreTargetMachine &TM);
~XCoreSelectionDAGInfo();
+
+ virtual SDValue
+ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
+ SDValue Chain,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3, unsigned Align, bool isVolatile,
+ bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const;
};
}
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 9ae0b86..781a87b 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -27,8 +27,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS),
- DL("e-p:32:32:32-a0:0:32-f32:32:32-f64:32:32-i1:8:32-i8:8:32-"
- "i16:16:32-i32:32:32-i64:32:32-n32"),
+ DL("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32"),
InstrInfo(),
FrameLowering(Subtarget),
TLInfo(*this),
@@ -49,6 +48,7 @@ public:
virtual bool addPreISel();
virtual bool addInstSelector();
+ virtual bool addPreEmitPass();
};
} // namespace
@@ -66,6 +66,11 @@ bool XCorePassConfig::addInstSelector() {
return false;
}
+bool XCorePassConfig::addPreEmitPass() {
+ addPass(createXCoreFrameToArgsOffsetEliminationPass());
+ return false;
+}
+
// Force static initialization.
extern "C" void LLVMInitializeXCoreTarget() {
RegisterTargetMachine<XCoreTargetMachine> X(TheXCoreTarget);
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.cpp b/lib/Target/XCore/XCoreTargetObjectFile.cpp
index 88e3bfd..ab0f7ad 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.cpp
+++ b/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -9,27 +9,58 @@
#include "XCoreTargetObjectFile.h"
#include "XCoreSubtarget.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/ELF.h"
#include "llvm/Target/TargetMachine.h"
+
using namespace llvm;
void XCoreTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ BSSSection =
+ Ctx.getELFSection(".dp.bss", ELF::SHT_NOBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::XCORE_SHF_DP_SECTION,
+ SectionKind::getBSS());
+ BSSSectionLarge =
+ Ctx.getELFSection(".dp.bss.large", ELF::SHT_NOBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::XCORE_SHF_DP_SECTION,
+ SectionKind::getBSS());
DataSection =
- Ctx.getELFSection(".dp.data", ELF::SHT_PROGBITS,
+ Ctx.getELFSection(".dp.data", ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_WRITE |
ELF::XCORE_SHF_DP_SECTION,
SectionKind::getDataRel());
- BSSSection =
- Ctx.getELFSection(".dp.bss", ELF::SHT_NOBITS,
+ DataSectionLarge =
+ Ctx.getELFSection(".dp.data.large", ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_WRITE |
ELF::XCORE_SHF_DP_SECTION,
- SectionKind::getBSS());
-
+ SectionKind::getDataRel());
+ DataRelROSection =
+ Ctx.getELFSection(".dp.rodata", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::XCORE_SHF_DP_SECTION,
+ SectionKind::getReadOnlyWithRel());
+ DataRelROSectionLarge =
+ Ctx.getELFSection(".dp.rodata.large", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::XCORE_SHF_DP_SECTION,
+ SectionKind::getReadOnlyWithRel());
+ ReadOnlySection =
+ Ctx.getELFSection(".cp.rodata", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC |
+ ELF::XCORE_SHF_CP_SECTION,
+ SectionKind::getReadOnlyWithRel());
+ ReadOnlySectionLarge =
+ Ctx.getELFSection(".cp.rodata.large", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC |
+ ELF::XCORE_SHF_CP_SECTION,
+ SectionKind::getReadOnlyWithRel());
MergeableConst4Section =
Ctx.getELFSection(".cp.rodata.cst4", ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_MERGE |
@@ -45,16 +76,103 @@ void XCoreTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
ELF::SHF_ALLOC | ELF::SHF_MERGE |
ELF::XCORE_SHF_CP_SECTION,
SectionKind::getMergeableConst16());
-
- // TLS globals are lowered in the backend to arrays indexed by the current
- // thread id. After lowering they require no special handling by the linker
- // and can be placed in the standard data / bss sections.
- TLSDataSection = DataSection;
- TLSBSSSection = BSSSection;
-
- ReadOnlySection =
- Ctx.getELFSection(".cp.rodata", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC |
+ CStringSection =
+ Ctx.getELFSection(".cp.rodata.string", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::SHF_STRINGS |
ELF::XCORE_SHF_CP_SECTION,
SectionKind::getReadOnlyWithRel());
+ // TextSection - see MCObjectFileInfo.cpp
+ // StaticCtorSection - see MCObjectFileInfo.cpp
+ // StaticDtorSection - see MCObjectFileInfo.cpp
+ }
+
+static unsigned getXCoreSectionType(SectionKind K) {
+ if (K.isBSS())
+ return ELF::SHT_NOBITS;
+ return ELF::SHT_PROGBITS;
+}
+
+static unsigned getXCoreSectionFlags(SectionKind K, bool IsCPRel) {
+ unsigned Flags = 0;
+
+ if (!K.isMetadata())
+ Flags |= ELF::SHF_ALLOC;
+
+ if (K.isText())
+ Flags |= ELF::SHF_EXECINSTR;
+ else if (IsCPRel)
+ Flags |= ELF::XCORE_SHF_CP_SECTION;
+ else
+ Flags |= ELF::XCORE_SHF_DP_SECTION;
+
+ if (K.isWriteable())
+ Flags |= ELF::SHF_WRITE;
+
+ if (K.isMergeableCString() || K.isMergeableConst4() ||
+ K.isMergeableConst8() || K.isMergeableConst16())
+ Flags |= ELF::SHF_MERGE;
+
+ if (K.isMergeableCString())
+ Flags |= ELF::SHF_STRINGS;
+
+ return Flags;
+}
+
+const MCSection *
+XCoreTargetObjectFile::getExplicitSectionGlobal(const GlobalValue *GV,
+ SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
+ StringRef SectionName = GV->getSection();
+ // Infer section flags from the section name if we can.
+ bool IsCPRel = SectionName.startswith(".cp.");
+ if (IsCPRel && !Kind.isReadOnly())
+ report_fatal_error("Using .cp. section for writeable object.");
+ return getContext().getELFSection(SectionName, getXCoreSectionType(Kind),
+ getXCoreSectionFlags(Kind, IsCPRel), Kind);
+}
+
+const MCSection *XCoreTargetObjectFile::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const{
+
+ bool UseCPRel = GV->isLocalLinkage(GV->getLinkage());
+
+ if (Kind.isText()) return TextSection;
+ if (UseCPRel) {
+ if (Kind.isMergeable1ByteCString()) return CStringSection;
+ if (Kind.isMergeableConst4()) return MergeableConst4Section;
+ if (Kind.isMergeableConst8()) return MergeableConst8Section;
+ if (Kind.isMergeableConst16()) return MergeableConst16Section;
+ }
+ Type *ObjType = GV->getType()->getPointerElementType();
+ if (TM.getCodeModel() == CodeModel::Small ||
+ !ObjType->isSized() ||
+ TM.getDataLayout()->getTypeAllocSize(ObjType) < CodeModelLargeSize) {
+ if (Kind.isReadOnly()) return UseCPRel? ReadOnlySection
+ : DataRelROSection;
+ if (Kind.isBSS() || Kind.isCommon())return BSSSection;
+ if (Kind.isDataRel()) return DataSection;
+ if (Kind.isReadOnlyWithRel()) return DataRelROSection;
+ } else {
+ if (Kind.isReadOnly()) return UseCPRel? ReadOnlySectionLarge
+ : DataRelROSectionLarge;
+ if (Kind.isBSS() || Kind.isCommon())return BSSSectionLarge;
+ if (Kind.isDataRel()) return DataSectionLarge;
+ if (Kind.isReadOnlyWithRel()) return DataRelROSectionLarge;
+ }
+
+ assert((Kind.isThreadLocal() || Kind.isCommon()) && "Unknown section kind");
+ report_fatal_error("Target does not support TLS or Common sections");
+}
+
+const MCSection *XCoreTargetObjectFile::
+getSectionForConstant(SectionKind Kind) const {
+ if (Kind.isMergeableConst4()) return MergeableConst4Section;
+ if (Kind.isMergeableConst8()) return MergeableConst8Section;
+ if (Kind.isMergeableConst16()) return MergeableConst16Section;
+ assert((Kind.isReadOnly() || Kind.isReadOnlyWithRel()) &&
+ "Unknown section kind");
+ // We assume the size of the object is never greater than CodeModelLargeSize.
+ // Handling objects larger than CodeModelLargeSize would require changes to the AsmPrinter.
+ return ReadOnlySection;
}
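The section changes above split data between normal and ".large" variants and between the .dp (data pointer) and .cp (constant pool) relative address spaces. A deliberately simplified sketch of that placement policy, which ignores the local-linkage test and the unsized-type case and returns section names as strings, could look like:

#include <cstdio>

static const unsigned CodeModelLargeSize = 256;  // as in the header above

// Simplified: the real code also checks linkage (for .cp placement) and
// whether the type is sized before choosing the large-object sections.
static const char *pickSection(bool largeCodeModel, unsigned objSize,
                               bool readOnly, bool isBSS) {
  bool large = largeCodeModel && objSize >= CodeModelLargeSize;
  if (isBSS)    return large ? ".dp.bss.large"    : ".dp.bss";
  if (readOnly) return large ? ".cp.rodata.large" : ".cp.rodata";
  return large ? ".dp.data.large" : ".dp.data";
}

int main() {
  std::printf("%s\n", pickSection(true, 1024, false, false)); // .dp.data.large
  std::printf("%s\n", pickSection(false, 1024, true, false)); // .cp.rodata
  return 0;
}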
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.h b/lib/Target/XCore/XCoreTargetObjectFile.h
index 27875e7..733e6d3 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.h
+++ b/lib/Target/XCore/XCoreTargetObjectFile.h
@@ -14,11 +14,27 @@
namespace llvm {
+static const unsigned CodeModelLargeSize = 256;
+
class XCoreTargetObjectFile : public TargetLoweringObjectFileELF {
+ const MCSection *BSSSectionLarge;
+ const MCSection *DataSectionLarge;
+ const MCSection *ReadOnlySectionLarge;
+ const MCSection *DataRelROSectionLarge;
public:
void Initialize(MCContext &Ctx, const TargetMachine &TM);
- // TODO: Classify globals as xcore wishes.
+ const MCSection *
+ getExplicitSectionGlobal(const GlobalValue *GV,
+ SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const override;
+
+ const MCSection *
+ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler &Mang,
+ const TargetMachine &TM) const override;
+
+ const MCSection *getSectionForConstant(SectionKind Kind) const override;
};
} // end namespace llvm
diff --git a/lib/Target/XCore/XCoreTargetStreamer.h b/lib/Target/XCore/XCoreTargetStreamer.h
new file mode 100644
index 0000000..0a394da
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetStreamer.h
@@ -0,0 +1,27 @@
+//===-- XCoreTargetStreamer.h - XCore Target Streamer ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORETARGETSTREAMER_H
+#define XCORETARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class XCoreTargetStreamer : public MCTargetStreamer {
+public:
+ XCoreTargetStreamer(MCStreamer &S);
+ virtual ~XCoreTargetStreamer();
+ virtual void emitCCTopData(StringRef Name) = 0;
+ virtual void emitCCTopFunction(StringRef Name) = 0;
+ virtual void emitCCBottomData(StringRef Name) = 0;
+ virtual void emitCCBottomFunction(StringRef Name) = 0;
+};
+}
+
+#endif
diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.cpp b/lib/Target/XCore/XCoreTargetTransformInfo.cpp
index cc165f7..313d18d 100644
--- a/lib/Target/XCore/XCoreTargetTransformInfo.cpp
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.cpp
@@ -18,8 +18,8 @@
#include "XCore.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
using namespace llvm;
// Declare the pass initialization routine locally as target-specific passes
@@ -31,7 +31,7 @@ void initializeXCoreTTIPass(PassRegistry &);
namespace {
-class XCoreTTI : public ImmutablePass, public TargetTransformInfo {
+class XCoreTTI final : public ImmutablePass, public TargetTransformInfo {
public:
XCoreTTI() : ImmutablePass(ID) {
llvm_unreachable("This pass cannot be directly constructed");
@@ -42,27 +42,23 @@ public:
initializeXCoreTTIPass(*PassRegistry::getPassRegistry());
}
- virtual void initializePass() {
+ virtual void initializePass() override {
pushTTIStack(this);
}
- virtual void finalizePass() {
- popTTIStack();
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
TargetTransformInfo::getAnalysisUsage(AU);
}
static char ID;
- virtual void *getAdjustedAnalysisPointer(const void *ID) {
+ virtual void *getAdjustedAnalysisPointer(const void *ID) override {
if (ID == &TargetTransformInfo::ID)
return (TargetTransformInfo*)this;
return this;
}
- unsigned getNumberOfRegisters(bool Vector) const {
+ unsigned getNumberOfRegisters(bool Vector) const override {
if (Vector) {
return 0;
}